def getDemoData():
    global COUNTER
    if COUNTER == 0:
        data = preprocessing.readData("data/data_master_2018-06.csv")
    elif COUNTER == 1:
        data = preprocessing.readData("data/data_master_2018-07.csv")
    else:
        # COUNTER >= 2: keep returning the last file
        data = preprocessing.readData("data/data_master_2018-10.csv")
    COUNTER += 1
    return data
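# An equivalent table-driven form of getDemoData (a sketch; the file paths
# and the clamp-at-last-file behaviour are taken from the original above):
DEMO_FILES = [
    "data/data_master_2018-06.csv",
    "data/data_master_2018-07.csv",
    "data/data_master_2018-10.csv",
]

def get_demo_data_sketch():
    global COUNTER
    path = DEMO_FILES[min(COUNTER, len(DEMO_FILES) - 1)]
    COUNTER += 1
    return preprocessing.readData(path)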
def mode_baseline(files):
    raw_data = readData(files)
    all_scores = []
    for data_point in raw_data:
        (original_sentence, replStart, replEnd), repl, score = data_point
        all_scores.append(score)
    # scipy.stats.mode returns (modes, counts); take the single modal score
    return stats.mode(all_scores)[0][0]
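# Illustrative shape of one readData data point, as unpacked in
# mode_baseline above (the concrete values are made up for illustration):
example_point = (("He was quite tall .", 7, 12), "very", 2.0)
(sentence, repl_start, repl_end), replacement, score = example_point
# sentence[repl_start:repl_end] is the span to be replaced with `replacement`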
def task1():
    data = readData(
        [TASK_1 / 'train.csv', TASK_1 / 'dev.csv', EXTRA_TRAIN_TASK_1])
    scores = [score for _, _, score in data]
    plt.hist(scores)
    plt.show()
def mainEnsemble():
    n_estimators = [10, 15, 20]
    folds = 10
    repeats = 3
    n_classifiers = classifiers.len_classify()
    n_splits = folds * repeats
    data = preprocessing.readData()
    samples, targets = preprocessing.splitSamples(data)
    for num in n_estimators:
        # reset the per-split accuracies for each ensemble size
        acc = []
        result_acc = [[0] * n_classifiers for _ in range(n_splits)]
        kfold = preprocessing.repeatCrossValidation(folds, repeats)
        for train, test in kfold.split(samples, targets):
            X_train, X_test = samples[train], samples[test]
            y_train, y_test = targets[train], targets[test]
            for i in range(n_classifiers):
                y_pred = ensemble.runBagging(X_train, y_train, X_test, i, num)
                fold_cm, fold_acc = classifiers.mensureAcc(y_pred, y_test)
                acc.append(fold_acc)
        # repeated k-fold yields folds * repeats splits, not just folds
        for j in range(n_classifiers):
            for k in range(n_splits):
                result_acc[k][j] = acc[k * n_classifiers + j]
        # write one file per ensemble size, named after `num`
        file_name = "bagging" + str(num) + ".csv"
        utils.writeCSV(file_name, result_acc)
    print("Finish!!\n")
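# A minimal sketch of what ensemble.runBagging is assumed to do (that module
# is not shown here): wrap the i-th base classifier from a hypothetical
# registry in scikit-learn's BaggingClassifier with `num` estimators.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

BASE_CLASSIFIERS = [DecisionTreeClassifier()]  # hypothetical registry

def run_bagging_sketch(X_train, y_train, X_test, i, n_estimators):
    clf = BaggingClassifier(BASE_CLASSIFIERS[i], n_estimators=n_estimators)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)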
def mean_baseline(files):
    raw_data = readData(files)
    all_scores = []
    for data_point in raw_data:
        (_, _, _), _, score = data_point
        all_scores.append(score)
    return sum(all_scores) / len(all_scores)
def mainClassification():
    folds = 10
    n_classifiers = classifiers.len_classify()
    cm, acc = [], []
    result_cm = [[0] * n_classifiers for _ in range(folds)]
    result_acc = [[0] * n_classifiers for _ in range(folds)]
    data = preprocessing.readData()
    samples, targets = preprocessing.splitSamples(data)
    kfold = preprocessing.crossValidation(folds)
    for train, test in kfold.split(samples, targets):
        X_train, X_test = samples[train], samples[test]
        y_train, y_test = targets[train], targets[test]
        for i in range(n_classifiers):
            y_pred = classifiers.classify(X_train, y_train, X_test, i)
            fold_cm, fold_acc = classifiers.mensureAcc(y_pred, y_test)
            cm.append(fold_cm)
            acc.append(fold_acc)
    for j in range(n_classifiers):
        for k in range(folds):
            result_cm[k][j] = cm[k * n_classifiers + j]
            result_acc[k][j] = acc[k * n_classifiers + j]
    utils.writeCSV("all_classifiers_confusion-matrix.csv", result_cm)
    utils.writeCSV("all_classifiers_accuracy.csv", result_acc)
    print("Finish!!\n")
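# preprocessing.crossValidation and repeatCrossValidation (used in
# mainEnsemble above) are assumed to be thin wrappers over scikit-learn's
# splitters; a sketch under that assumption:
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

def cross_validation_sketch(folds):
    return StratifiedKFold(n_splits=folds, shuffle=True, random_state=0)

def repeat_cross_validation_sketch(folds, repeats):
    return RepeatedStratifiedKFold(n_splits=folds, n_repeats=repeats,
                                   random_state=0)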
def test():
    bias_clf = load_model(biasPKLfile)
    truth_clf = load_model(truthPKLfile)
    docBiasLabel, docTruthLabel = readLabel(testLabelPath, "test")
    title, article, numCitation = readData(testDataPath, "test")
    testdata = list(zip(title, numCitation, article))
    truth_pred = truth_clf.predict(testdata)
    bias_pred = bias_clf.predict(testdata)
    print("Truth value accuracy: ", np.mean(truth_pred == docTruthLabel))
    print("Bias value accuracy: ", np.mean(bias_pred == docBiasLabel))
def evaluate_baseline(baseline_score, files):
    print(baseline_score)
    raw_data = readData(files)
    square_error = 0
    val_examples = 0
    for data_point in raw_data:
        (original_sentence, replStart, replEnd), repl, score = data_point
        square_error += (baseline_score - score) ** 2
        val_examples += 1
    rmse = math.sqrt(square_error / val_examples)
    return rmse
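# Usage sketch tying the baselines together (TRAIN_FILES and DEV_FILES are
# hypothetical placeholders; see task1 for the actual paths in this project):
# rmse_of_mean = evaluate_baseline(mean_baseline(TRAIN_FILES), DEV_FILES)
# rmse_of_mode = evaluate_baseline(mode_baseline(TRAIN_FILES), DEV_FILES)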
def main():
    data = preprocessing.readData()
    sil_values_ag1 = []
    db_values_ag2 = []
    cr_values_ag3 = []
    for i in range(2, 22):
        # Hierarchical clustering with complete linkage
        hier = hierarchical(n_clusters=i, linkage='complete').fit(data)
        sil_values_ag1.append(scoreSil(data, hier.labels_))
        db_values_ag2.append(scoreDB(data, hier.labels_))
        # cr_values_ag3.append(scoreCR(targets, hier.labels_))
    writeCSV("sil_all.csv", sil_values_ag1)
    writeCSV("db_all.csv", db_values_ag2)
    # cr_values_ag3 stays empty while the scoreCR call above is disabled
    # writeCSV("cr_all.csv", cr_values_ag3)
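# scoreSil and scoreDB above are assumed to wrap scikit-learn's clustering
# metrics, and `hierarchical` to alias AgglomerativeClustering; a sketch:
from sklearn.metrics import silhouette_score, davies_bouldin_score

def score_sil_sketch(data, labels):
    return silhouette_score(data, labels)

def score_db_sketch(data, labels):
    return davies_bouldin_score(data, labels)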
for row in csv_reader:
    if line_count == 0:
        ofile.write("AvSigVersion" + "," + "HasDetections" + "\n")
        t = 0
        for field in row:
            columns[field] = t
            t += 1
        print("Column names are {}".format(", ".join(row)))
        line_count += 1
    else:
        ofile.write(row[columns["AvSigVersion"]] + "," +
                    row[columns["HasDetections"]] + "\n")
        line_count += 1
print('Processed {} lines.'.format(line_count))

data = preprocessing.readData('graph_data.csv')
byDate = data[["AvSigVersion", 'HasDetections']].values.tolist()

signatures = []
detections = []
dateCount = defaultdict(lambda: 0)
dateSum = defaultdict(lambda: 0)
readingErrors = 0
for line in byDate:
    si = line[0].split(".")
    try:
        # build a sortable key from the 2nd and 3rd version components
        # (see the worked example below)
        signature = int(("000" + str(si[1]))[-4:] + ("000" + str(si[2]))[-4:])
        dateCount[signature] = dateCount[signature] + 1
        dateSum[signature] = dateSum[signature] + line[1]
    except (IndexError, ValueError):
        readingErrors += 1
print("OK, just:", readingErrors, "reading errors")
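# Worked example of the AvSigVersion -> sort-key mapping above (illustration
# only, not part of the original script): "1.273.1735.0" splits into
# ['1', '273', '1735', '0']; the 2nd and 3rd parts are zero-padded to four
# digits and concatenated, so the key becomes int("0273" + "1735").
assert int(("000" + "273")[-4:] + ("000" + "1735")[-4:]) == 2731735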
import preprocessing

data = preprocessing.readData('train_samples.csv')
t = 2
outFile = 'prunning_test'
while t < 30:
    preprocessing.cross_validation(data=data, k=5, file_out=outFile,
                                   min_sample=t)
    t += 1
with open(outFile, 'a') as ofile:
    ofile.write(str(t) + "\n")
    # (tail of calWeightedTFIDF; its head is reconstructed in the sketch below)
    weights_ = np.array(weights_) + 1E-10
    return (tfidf * weights_).tolist()


def extractTokuchoWord(docs, weights):
    corpus = list(set(w for doc in docs for w in doc if len(w) > 1))
    ans = {}
    for word in corpus:
        ans[word] = max(calWeightedTFIDF(word, docs, weights))
    # print(ans)
    return max(ans, key=ans.get)


def min_max(x, axis=None):
    # avoid shadowing the min/max builtins
    x_min = x.min(axis=axis, keepdims=True)
    x_max = x.max(axis=axis, keepdims=True)
    return (x - x_min) / (x_max - x_min)


if __name__ == "__main__":
    data_master = preprocessing.readData("data_master.csv")
    np.random.seed(seed=0)
    idx = np.random.randint(1000, 4000, 200)
    docs = data_master.loc[idx, "tango"].to_list()
    reaction = min_max(data_master.loc[idx, "reaction"].to_numpy())
    eikyoudo = data_master.loc[idx, "eikyoudo"].to_numpy()
    weights = reaction * eikyoudo
    print(weights)
    ideal_weights = [1 for _ in range(len(weights))]
    normal_tokuchogo = extractTokuchoWord(docs, ideal_weights)
    weighted_tokuchogo = extractTokuchoWord(docs, weights)
    print("Normal tokuchogo", normal_tokuchogo)
    print("Weighted tokuchogo", weighted_tokuchogo)
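# The head of calWeightedTFIDF is missing above (the fragment starts at the
# smoothing step). A self-contained sketch of what it presumably computes,
# assuming `docs` is a list of token lists; the exact TF-IDF variant is an
# assumption, only the smoothed per-document weighting is from the original:
import math

def cal_weighted_tfidf_sketch(word, docs, weights):
    tf = np.array([doc.count(word) / max(len(doc), 1) for doc in docs])
    df = sum(1 for doc in docs if word in doc)
    idf = math.log(len(docs) / (1 + df)) + 1.0
    weights_ = np.array(weights) + 1E-10  # matches the original tail
    return (tf * idf * weights_).tolist()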
def __init__(self, use_trained_model, mode, input_file_path):
    self.use_trained_model = use_trained_model
    self.mode = mode

    print("----READING DATA----")

    # inputs
    self.X = tf.placeholder(tf.int32, [None, None])
    if self.mode != "Test":
        self.Y = tf.placeholder(tf.float32, [1, FLAGS.num_classes])

    # tf variables
    # self.tf_ideal_learning_rate = tf.placeholder(tf.float32, shape=[])
    self.tf_ideal_l2_reg_parameter = tf.placeholder(tf.float32, shape=[])
    self.sequence_length = tf.placeholder(tf.int32, [None])

    print("reading embeddings...")
    # read word and character embeddings
    self.vocabList, self.embeddings = preprocessing.readGloveEmbeddings(
        FLAGS.word_embed_path, FLAGS.word_embedding_size)
    self.char_list, self.char_embeddings = preprocessing.readCharEmbeddings(
        path=FLAGS.char_embed_path, embedding_size=FLAGS.embedding_dim)

    # create the word-embedding variable, filled later via embedding_init
    self.tf_embeddings = tf.Variable(
        tf.constant(0.0,
                    shape=[self.embeddings.shape[0], self.embeddings.shape[1]]),
        trainable=False, name="tf_embeddings")
    self.embedding_placeholder = tf.placeholder(
        tf.float32, [self.embeddings.shape[0], self.embeddings.shape[1]])
    self.embedding_init = self.tf_embeddings.assign(self.embedding_placeholder)

    print("transforming dictionaries...")
    # turn the vocabulary lists into dicts for O(1) lookups
    self.vocabulary = {}
    self.char_vocabulary = {}
    for i in range(len(self.vocabList)):
        self.vocabulary[self.vocabList[i]] = i
    for i in range(len(self.char_list)):
        self.char_vocabulary[self.char_list[i]] = i
    del self.char_list, self.vocabList

    print("reading the text data...")
    # read tweets
    self.tr_set, self.target_val, self.seq_len = preprocessing.readData(
        input_file_path, self.mode)
    self.tweets = [row[1] for row in self.tr_set]
    self.users = [row[0] for row in self.tr_set]
    self.valid_set_size = int(len(self.tweets) * FLAGS.dev_sample_percentage)

    # split the dataset into parts according to mode
    if mode == "Train":
        self.train_tweets = self.tweets
        self.train_users = self.users
        self.train_seqlen = self.seq_len
        print("Training set size of tweets: " + str(len(self.train_tweets)))
    elif mode == "Valid":
        self.valid_tweets = self.tweets[:self.valid_set_size]
        self.train_tweets = self.tweets[self.valid_set_size:]
        self.valid_users = self.users[:self.valid_set_size]
        self.train_users = self.users[self.valid_set_size:]
        self.valid_seqlen = self.seq_len[:self.valid_set_size]
        self.train_seqlen = self.seq_len[self.valid_set_size:]
        print("Training set size of tweets: " + str(len(self.train_tweets)) +
              " Validation set size of tweets: " + str(len(self.valid_tweets)))
    elif mode == "Test":
        self.test_tweets = self.tweets
        self.test_users = self.users
        self.test_seqlen = self.seq_len
        print("Test set size of tweets: " + str(len(self.test_tweets)))
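# Usage sketch for embedding_init above: the standard TF1 idiom is to feed
# the pretrained matrix once through the placeholder, so the non-trainable
# variable is filled without serializing the weights into the graph def.
# Assumes a tf.Session `sess` and a model instance `model` exist:
# sess.run(model.embedding_init,
#          feed_dict={model.embedding_placeholder: model.embeddings})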
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

import pipeline
import preprocessing

np.random.seed(123456789)

classifier = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)

print("Reading train/test sets...")
X_train, y_train, X_test = preprocessing.readData()

print("Pipeline for train/test sets...")
X_train, y_train, id_train, X_test, id_test = pipeline.Pipeline(
    X_train, y_train, X_test)
id_test = id_test.astype(int)

print("Fitting KNN classifier...")
classifier.fit(X_train, y_train)

print("Predicting over test set...")
pred = classifier.predict(X_test)

res = pd.DataFrame({"id": id_test, "status_group": pred})
res.to_csv("../submissions/new_submission.csv", index=False)
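# StratifiedKFold and metrics are imported above but never used; a sketch of
# how they could validate the n_neighbors=1 choice before the final fit:
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456789)
# for tr_idx, va_idx in skf.split(X_train, y_train):
#     classifier.fit(X_train[tr_idx], y_train[tr_idx])
#     fold_pred = classifier.predict(X_train[va_idx])
#     print(metrics.accuracy_score(y_train[va_idx], fold_pred))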
    now_date = dt_now.strftime("%Y年%m月%d日")  # Japanese date format, e.g. "2018年10月19日"
    messageText = ("----{5} This week's Chihuahua----\n"
                   "Good work this week!\n"
                   "The name of the Chihuahua who listened to everyone's "
                   "conversations this week is...\n"
                   "\"{0} Chihuahua\".\n"
                   "The same topic last came up on {1}.\n"
                   "The chat back then looked like \"{2}\".\n"
                   "The {0} Chihuahua's level is now {3}.\n"
                   "{4}"
                   "Let's keep it up next week!").format(
        tokuchogo, kako_wadai[1], kako_wadai[0], level,
        Summon_chiwawa.summon(level), now_date)
    return messageText


def getTokuchogo(dataFrame):
    docs = dataFrame["tango"].to_list()
    reaction = tokuchogo.min_max(dataFrame["reaction"].to_numpy())
    eikyoudo = dataFrame["now_eikyoudo"].to_numpy()
    weights = reaction * eikyoudo
    weighted_tokuchogo = tokuchogo.extractTokuchoWord(docs, weights)
    return weighted_tokuchogo


if __name__ == "__main__":
    now_data = preprocessing.readData("data/data_master_2018-10.csv")
    now_date = datetime.datetime.strptime("2018-10-19", "%Y-%m-%d").date()
    print(makeMessage(now_data, now_date))
    # companyId = ""
    # groupId = ""
    # print(get_today_message(companyId, groupId))
def train():
    docBiasLabel, docTruthLabel = readLabel(trainLabelPath)
    title, article, numCitation = readData(trainDataPath)
    traindata = list(zip(title, numCitation, article))

    dataExtractor = Pipeline([
        ('TitleArticleExtractor', TitleArticleExtractor()),
    ])
    TfidfTitle = Pipeline([
        ('selector', ItemSelector(key='title')),
        ('vect', TfidfVectorizer(min_df=0.01)),
        ('to_dense', DenseTransformer()),
    ])
    TfidfArticle = Pipeline([
        ('selector', ItemSelector(key='article')),
        ('vect', TfidfVectorizer(min_df=0.01)),
        ('to_dense', DenseTransformer()),
    ])
    textStatsTitle = Pipeline([
        ('selector', ItemSelector(key='title')),
        ('stats', Text_Stats()),
        ('to_dense', DenseTransformer()),
    ])
    textStatsArticle = Pipeline([
        ('selector', ItemSelector(key='article')),
        ('stats', Text_Stats()),
        ('to_dense', DenseTransformer()),
    ])
    matchNgrams = Pipeline([
        ('selector', ItemSelector(key='ngram')),
        ('func', extractFeature()),
        ('to_dense', DenseTransformer()),
    ])

    bias_clf = Pipeline([
        ('TitleArticleExtractor', dataExtractor),
        ('union', FeatureUnion(transformer_list=[
            ('tfidf_title', TfidfTitle),
            ('tfidf_article', TfidfArticle),
            ('text_stats_title', textStatsTitle),
            ('text_stats_body', textStatsArticle),
            ('matchngrams', matchNgrams),
        ])),
        ('clf', MultinomialNB()),
    ])
    bias_clf.fit(traindata, docBiasLabel)
    with open(biasPKLfile, "wb") as f_pk:
        pickle.dump(bias_clf, f_pk, pickle.HIGHEST_PROTOCOL)

    truth_clf = Pipeline([
        ('TitleArticleExtractor', dataExtractor),
        ('union', FeatureUnion(transformer_list=[
            ('tfidf_title', TfidfTitle),
            ('tfidf_article', TfidfArticle),
            ('text_stats_headline', textStatsTitle),
            ('text_stats_body', textStatsArticle),
            ('matchngrams', matchNgrams),
        ])),
        ('clf', GaussianNB()),
    ])
    truth_clf.fit(traindata, docTruthLabel)
    with open(truthPKLfile, "wb") as f_pk:
        pickle.dump(truth_clf, f_pk, pickle.HIGHEST_PROTOCOL)
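# ItemSelector is not defined in this snippet; a minimal sketch following the
# classic scikit-learn FeatureUnion pattern it appears to be based on:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelectorSketch(BaseEstimator, TransformerMixin):
    """Select a single field from dict-like records for the next step."""

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]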
    # (tail of a confusion-matrix macro-F1 helper; see the sketch below)
    col_sums = np.sum(conf_matrix, axis=0)
    diag = np.diag(conf_matrix)
    scores = 2 * diag / (row_sums + col_sums)
    return np.mean(scores)


accs = []
f1s = []
tiempos = []

# possibleK = [1, 3, 5, 7, 9, 11]
# dims = list(range(30, 50))
# possibleK = [1]*len(dims) + [3]*len(dims) + [5]*len(dims) + [7]*len(dims)
possibleK = [1]
dims = [44]

X, y, _ = preprocessing.readData()
print("The dataset has dimension: " + str(len(X.iloc[0])))

for k, dim in zip(possibleK, dims):
    print("K=" + str(k))
    print("Dim=" + str(dim))
    classifier = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    skf = StratifiedKFold(n_splits=5, random_state=123456789)
    scores = []
    f1_scores = []
    elapsed_times = []
    conf_matrix = []
    X_index = skf.split(X, y)
    missclassified_images = []
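# The fragment above is the tail of a macro-F1 helper; a sketch of the full
# function, assuming row_sums (defined just before the fragment) is the
# per-class count of true instances:
import numpy as np

def macro_f1_sketch(conf_matrix):
    row_sums = np.sum(conf_matrix, axis=1)  # TP + FN per class
    col_sums = np.sum(conf_matrix, axis=0)  # TP + FP per class
    diag = np.diag(conf_matrix)             # TP per class
    scores = 2 * diag / (row_sums + col_sums)  # per-class F1 = 2TP/(2TP+FP+FN)
    return np.mean(scores)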
with open("density_graph_data.csv",'w') as ofile: for row in csv_reader: if line_count == 0: ofile.write("AvSigVersion"+"\n") t=0 for field in row: columns[field]=t t+=1 print("Column names are {}".format(", ".join(row))) line_count += 1 else: ofile.write(row[columns["AvSigVersion"]] +"\n") line_count += 1 print('Processed {} lines.'.format(line_count)) data=preprocessing.readData('density_graph_data.csv') byDate= data[["AvSigVersion"]].values.tolist() signatures=[] detections=[] dateCount=defaultdict(lambda : 0) readingErrors=0 for line in byDate: si = line[0].split(".") try: signature=int(("000"+str(si[1]))[-4:]+("000"+str(si[2]))[-4:]) dateCount[signature] = dateCount[signature]+1 except: readingErrors+=1 print("OK, just :",readingErrors,"Reading Errors") #signatures.append(int(("000"+str(si[1]))[-4:]+("000"+str(si[2]))[-4:])) #detections.append(line[1])