import os
import csv

import numpy as np
import gensim as g

# dt (data tools), sentiment, newsgroups and reuters are project-local modules;
# doc2Vec, linearSVMScore and multiClassLinearSVM are helpers defined elsewhere
# in this repository.


def main(data_type, vector_size, window_size, min_count, sampling_threshold,
         negative_size, train_epoch, dm, worker_count, train_wv,
         concatenate_wv, use_hierarchical_softmax):
    # Encode every hyperparameter into the file name so each configuration
    # gets its own model, vector-space and score files.
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + \
                " MC" + str(min_count) + " ST" + str(sampling_threshold) + \
                " NS" + str(negative_size) + " TE" + str(train_epoch) + \
                " DM" + str(dm) + " WC" + str(worker_count) + \
                " TW" + str(train_wv) + " CW" + str(concatenate_wv) + \
                " HS" + str(use_hierarchical_softmax)

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"
    if not os.path.exists(corpus_fn):
        # Build one whitespace-joined document per line from the word arrays.
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=object)  # np.object is deprecated
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"
    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":  # was [:6], which can never match "Doc2Vec"
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size,
                        min_count, sampling_threshold, negative_size,
                        train_epoch, dm, worker_count, train_wv,
                        concatenate_wv, use_hierarchical_softmax)
        model.save(model_fn)

    if not os.path.exists(vector_fn):
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    if not os.path.exists(score_fn) or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
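# The doc2Vec() helper called above is not defined in this file. Below is a
# minimal sketch of what it might look like with gensim's Doc2Vec API. How the
# original handles the pre-trained embedding file (embedding_fn) is unknown,
# so this version simply accepts and ignores it; the parameter mappings are
# assumptions, not the author's confirmed implementation.
def doc2Vec_sketch(embedding_fn, corpus_fn, vector_size, window_size, min_count,
                   sampling_threshold, negative_size, train_epoch, dm,
                   worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    import gensim.models.doc2vec as d2v
    # One whitespace-tokenized document per line, tagged with its line index
    # so model.docvecs[d] lines up with the corpus order used in main().
    with open(corpus_fn, encoding="utf-8") as f:
        docs = [d2v.TaggedDocument(line.split(), [i]) for i, line in enumerate(f)]
    model = d2v.Doc2Vec(docs,
                        vector_size=vector_size,
                        window=window_size,
                        min_count=min_count,
                        sample=sampling_threshold,
                        negative=negative_size,
                        epochs=train_epoch,
                        dm=dm,
                        workers=worker_count,
                        dbow_words=train_wv,       # also train word vectors
                        dm_concat=concatenate_wv,  # concatenate rather than average context
                        hs=use_hierarchical_softmax)
    return model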
def testAll(name_array, rep_array, class_array, data_type):
    # Score every representation on the dev split and collect one CSV row each.
    csv_rows = []
    for i in range(len(rep_array)):
        if data_type == "newsgroups":
            x_train, y_train, x_test, y_test, x_dev, y_dev = newsgroups.getSplits(rep_array[i], class_array[i])
        elif data_type == "sentiment":
            x_train, y_train, x_test, y_test, x_dev, y_dev = sentiment.getSplits(rep_array[i], class_array[i])
        elif data_type == "reuters":
            x_train, y_train, x_test, y_test, x_dev, y_dev = reuters.getSplits(rep_array[i], class_array[i])
        scores = multiClassLinearSVM(x_train, y_train, x_dev, y_dev)
        f1 = scores[0]
        acc = scores[1]
        macro_f1 = scores[2]
        csv_rows.append((name_array[i], acc, f1, macro_f1))
        print(csv_rows[i])
    # newline="" stops the csv module writing blank rows on Windows ("wt" did not).
    with open("../data/raw/" + data_type + "/test/reps.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(("name", "acc", "micro f1", "macro f1"))
        writer.writerows(csv_rows)
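# multiClassLinearSVM() is called in both functions above but not defined in
# this file. A plausible sketch with scikit-learn, returning scores in the
# (micro f1, accuracy, macro f1) order that testAll() unpacks; the classifier
# settings are assumptions, and integer class labels are assumed (one-hot
# targets would need an argmax first).
def multiClassLinearSVM_sketch(x_train, y_train, x_test, y_test):
    from sklearn.svm import LinearSVC
    from sklearn.metrics import accuracy_score, f1_score
    clf = LinearSVC()
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    micro_f1 = f1_score(y_test, pred, average="micro")
    acc = accuracy_score(y_test, pred)
    macro_f1 = f1_score(y_test, pred, average="macro")
    return micro_f1, acc, macro_f1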
# Binary branch (sentiment-style data): pre-split arrays are loaded directly.
# The condition below is reconstructed; the original line is missing from this fragment.
if data_type == "sentiment":
    x_train = np.load(x_train_fn)
    x_test = np.load(x_test_fn)
    y_train = np.load(y_train_fn)
    y_test = np.load(y_test_fn)
    output_size = 1
    output_activation = "sigmoid"
    metric = 'accuracy'
    loss = 'binary_crossentropy'
else:
    # Multi-class branch: pad the tokenized corpus and split it here.
    print("Loaded corpus", corpus_type + "_tokenized_corpus" + gram + ".npy")
    print("Loaded classes", corpus_type + "_classes_categorical.npy")
    corpus = np.load(folder_name + corpus_type + "_tokenized_corpus" + gram + ".npy")
    classes = np.load(folder_name + corpus_type + "_classes_categorical.npy")
    corpus = sequence.pad_sequences(corpus, maxlen=maxlen)
    x_train, y_train, x_test, y_test, x_dev, y_dev = newsgroups.getSplits(corpus, classes)
    output_size = len(y_test[0])
    output_activation = "softmax"
    metric = categorical_accuracy
    loss = 'categorical_crossentropy'

if test:
    # Quick smoke-test mode: train and evaluate on 100-example subsets.
    x_train = x_train[:100]
    x_test = x_test[:100]
    y_train = y_train[:100]
    y_test = y_test[:100]

model_fn = "../data/" + data_type + "/fastText/model/" + file_name + ".model"
score_fn = "../data/" + data_type + "/fastText/score/" + file_name + ".txt"
if not os.path.exists(model_fn):
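# The body that follows the existence check is missing from this fragment.
# A minimal sketch of the fastText-style Keras model the variables above
# appear to configure (Embedding + average pooling + dense output, as in the
# Keras imdb_fasttext example); max_features and embedding_dims are assumed
# hyperparameters, not the author's values.
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense

def build_fasttext_model(max_features, embedding_dims, maxlen,
                         output_size, output_activation, loss, metric):
    model = Sequential()
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    # Averaging the token embeddings over the padded sequence gives the
    # fastText-style document representation.
    model.add(GlobalAveragePooling1D())
    model.add(Dense(output_size, activation=output_activation))
    model.compile(loss=loss, optimizer="adam", metrics=[metric])
    return model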