# Ejemplo n.º 1 (example-page marker left over from scraping; commented out so the file parses)
# 0
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size,
         train_epoch, dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    """Train or load a Doc2Vec model for ``data_type``, cache its document
    vectors, and score them with a linear SVM on the matching classify task.

    All artifacts (corpus text, model, vectors, score) are cached on disk under
    a file name derived from the hyper-parameters; existing files are reused.
    """
    file_name = ("Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) +
                 " MC" + str(min_count) + " ST" + str(sampling_threshold) +
                 " NS" + str(negative_size) + " TE" + str(train_epoch) +
                 " DM" + str(dm) + " WC" + str(worker_count) + "spacy")
    # BUG FIX: the original also built a second " NS...TW...CW...HS..." string as a
    # bare expression whose result was discarded (dead code). It was presumably meant
    # to fold train_wv/concatenate_wv/use_hierarchical_softmax into file_name; it is
    # removed rather than appended so existing on-disk cache file names stay valid.

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"

    # Build the whitespace-joined text corpus once from the tokenised train/test splits.
    if not os.path.exists(corpus_fn):
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        # `object` instead of the deprecated/removed `np.object` alias (NumPy >= 1.24).
        text_corpus = np.empty(len(corpus), dtype=object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    # NOTE(review): hard-coded local path to the pre-trained GloVe embeddings —
    # only valid on the original author's machine; consider parameterising.
    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"

    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv,
                        use_hierarchical_softmax)
        model.save(model_fn)

    # Extract (or reload) one vector per document in corpus order.
    if not os.path.exists(vector_fn):
        vectors = [model.docvecs[d] for d in range(len(model.docvecs))]
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    # BUG FIX: the original compared file_name[:6] ("Doc2Ve") against "Doc2Vec",
    # which can never be equal, so this branch always ran and the cached score
    # was never reused. Use the 7-char slice, matching the check above.
    if not os.path.exists(score_fn) or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
def testAll(name_array, rep_array, class_array, data_type):
    """Score every representation in ``rep_array`` on its dev split and write
    the (name, acc, micro f1, macro f1) rows to ``../data/raw/<data_type>/test/reps.csv``.

    ``name_array``, ``rep_array`` and ``class_array`` are parallel sequences.
    Raises ``ValueError`` for an unknown ``data_type`` (the original silently
    fell through and crashed later with a ``NameError``).
    """
    split_fns = {
        "newsgroups": newsgroups.getSplits,
        "sentiment": sentiment.getSplits,
        "reuters": reuters.getSplits,
    }
    if data_type not in split_fns:
        raise ValueError("Unknown data_type: " + str(data_type))
    get_splits = split_fns[data_type]

    csv_rows = []
    for name, rep, cls in zip(name_array, rep_array, class_array):
        x_train, y_train, x_test, y_test, x_dev, y_dev = get_splits(rep, cls)
        # Evaluation is done on the dev split, not the held-out test split.
        scores = multiClassLinearSVM(x_train, y_train, x_dev, y_dev)
        f1, acc, macro_f1 = scores[0], scores[1], scores[2]
        row = (name, acc, f1, macro_f1)
        csv_rows.append(row)
        print(row)

    # BUG FIX: csv.writer requires the file opened with newline='' (see the csv
    # module docs); 'wt' without it produces blank interleaved rows on Windows.
    with open("../data/raw/" + data_type + "/test/reps.csv", 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(("name", "acc", "micro f1", "macro f1"))
        writer.writerows(csv_rows)
# Ejemplo n.º 3 (example-page marker left over from scraping; commented out so the file parses —
# the snippet below it begins mid-statement and is truncated)
# 0
        x_train = np.load(x_train_fn)
        x_test = np.load(x_test_fn)
        y_train = np.load(y_train_fn)
        y_test = np.load(y_test_fn)
    output_size = 1
    output_activation = "sigmoid"
    metric = 'accuracy'
    loss = 'binary_crossentropy'
else:
    print("Loaded corpus", corpus_type + "_tokenized_corpus" + gram + ".npy")
    print("Loaded classes", corpus_type + "_classes_categorical.npy")
    corpus = np.load(folder_name + corpus_type + "_tokenized_corpus" + gram +
                     ".npy")
    classes = np.load(folder_name + corpus_type + "_classes_categorical.npy")
    corpus = sequence.pad_sequences(corpus, maxlen=maxlen)
    x_train, y_train, x_test, y_test, x_dev, y_dev = newsgroups.getSplits(
        corpus, classes)
    output_size = len(y_test[0])
    output_activation = "softmax"
    metric = categorical_accuracy
    loss = 'categorical_crossentropy'

# Quick-run mode: truncate every split to its first 100 examples so a full
# train/evaluate cycle finishes fast. NOTE(review): `test` is presumably a bool
# flag set earlier in this script (header not visible here) — confirm.
if test:
    x_train = x_train[:100]
    x_test = x_test[:100]
    y_train = y_train[:100]
    y_test = y_test[:100]

# On-disk cache paths for the trained fastText model and its evaluation score,
# keyed by the hyper-parameter-derived file_name built earlier in the script.
model_fn = "../data/" + data_type + "/fastText/model/" + file_name + ".model"
score_fn = "../data/" + data_type + "/fastText/score/" + file_name + ".txt"

if os.path.exists(model_fn) is False: