# ===== Example 1 =====
def build_datasets(reduced=False, columns=None):
    """Build the tune, train, dev and test datasets from word vectors.

    Seeds the RNG for reproducibility, then builds a tuning subset
    (``tune_fraction=0.15`` of the train data) followed by the full
    train, dev and test splits.

    Args:
        reduced: forwarded to ``build_dataset``; presumably selects a
            smaller variant of each dataset — confirm in build_dataset.
        columns: forwarded to ``build_dataset``; column selection.
    """
    random.seed(0)
    vectors = WordVectors(load=True)
    # Tuning subset carved out of the training split.
    build_dataset(vectors, "train", reduced=reduced, columns=columns,
                  tune_fraction=0.15)
    # Full splits, built in the same order as before.
    for split in ("train", "dev", "test"):
        build_dataset(vectors, split, reduced=reduced, columns=columns)
# ===== Example 2 =====
from word_vectors import WordVectors
import numpy as np

# Print the words most similar to a query word using saved embeddings.
# 'syn0_final.npy': word embeddings, numpy array of shape [vocab_size, hidden_size]
#   (the previous comment named it 'syn_final.npy', which did not match the
#   filename actually loaded below).
# 'vocab.txt': text file storing words in vocabulary, one word per line

query = ','
num_similar_words = 10

syn0_final = np.load('syn0_final.npy')
# One vocabulary word per line; strip the trailing newline from each.
# Encoding pinned to UTF-8 instead of the locale default for portability.
with open('vocab.txt', encoding='utf-8') as f:
    vocab_words = [l.strip() for l in f]

wv = WordVectors(syn0_final, vocab_words)
print(wv.most_similar(query, num_similar_words))
# ===== Example 3 =====
from word_vectors import WordVectors
import numpy as np
import train
import evaluate
import sys

# Experiment scaffold: select one dataset; the commented alternatives were
# used in earlier runs and are kept for reference.
# dataset = Dataset("SST")
dataset = Dataset("SST_phrase")
# dataset = Dataset("IMDB", preprocess=True)
# dataset = Dataset("MR", preprocess=True)

# NOTE(review): `Dataset` and `config` are not imported in this chunk —
# presumably provided elsewhere in the file or via `train`/`evaluate`;
# confirm before running standalone.

# load pretrained word vectors
pretrained_vectors = []
for index, type_ in enumerate(config['word_vector_type']):
    pretrained_vectors.append(
        WordVectors(type_, config["pretrained_vectors"][index]))
    print("loaded vectors {}".format(config['word_vector_type'][index]))

# data = dataset.cv_split(index=5)

# NOTE(review): this `data` assignment is overwritten below and appears dead;
# left in place in case cv_split has side effects (e.g. consuming RNG state).
data = dataset.cv_split(index=2)
# # data = dataset.cv_split(index=5)

# SST splits (take only train and test)
# 1 = train
# 2 = test
# 3 = dev
t1 = dataset.cv_split(index=2)
# t1 = dataset.cv_split(index=3)
t2 = dataset.cv_split(index=1)
# Combine train (t2) and test (t1) portions; presumably elements [2] and [3]
# of each split are the inputs and labels — TODO confirm against cv_split.
data = [t2[2], t2[3], t1[2], t1[3]]
# ===== Example 4 =====
def _evaluate_benchmarks(word_vectors, language, dictionary_=None):
    """Run every word-similarity benchmark for one set of word vectors.

    Prints SimLex-999, WordSim (overall / similarity / relatedness), MEN,
    SimVerb (per relation type) and Rare-Words (RW) scores together with
    their vocabulary coverage.

    Args:
        word_vectors: a WordVectors instance to evaluate.
        language: language key used to select the benchmark files.
        dictionary_: optional word->index mapping; when given, every
            benchmark call is restricted to words in this dictionary.
    """
    # Forward dictionary_ only when provided so the unrestricted calls stay
    # byte-identical to the originals (evaluate_similarity's own default
    # for dictionary_ is not visible here).
    extra = {} if dictionary_ is None else {"dictionary_": dictionary_}

    simlex_score, simlex_coverage = evaluate_similarity(
        word_vectors, language, **extra)
    print("SimLex-999 score and coverage:", simlex_score, simlex_coverage)

    # WordSim Validation scores: overall, similarity and relatedness subsets.
    c1, cov1 = evaluate_similarity(word_vectors, language,
                                   source=language, **extra)
    c2, cov2 = evaluate_similarity(word_vectors, language,
                                   source=language + "-sim", **extra)
    c3, cov3 = evaluate_similarity(word_vectors, language,
                                   source=language + "-rel", **extra)
    print("WordSim overall score and coverage:", c1, cov1)
    print("WordSim Similarity score and coverage:", c2, cov2)
    print("WordSim Relatedness score and coverage:", c3, cov3, "\n")

    men_score, men_coverage = evaluate_similarity(word_vectors, language,
                                                  source="MEN", **extra)
    print("MEN score and coverage:", men_score, men_coverage, "\n")

    # SimVerb is scored separately for each verb-pair relation type.
    sim_relations = [
        "SYNONYMS", "ANTONYMS", "HYPER/HYPONYMS", "COHYPONYMS", "NONE",
        "ALL"
    ]
    for relation_ in sim_relations:
        sv_score, sv_coverage = evaluate_similarity(word_vectors, relation_,
                                                    source="SimVerb", **extra)
        print("SimVerb {} score and coverage: {} {}".format(
            relation_, sv_score, sv_coverage))
    print("\n")

    rw_score, rw_coverage = evaluate_similarity(word_vectors, language,
                                                source="rw", **extra)
    print("RW score and coverage:", rw_score, rw_coverage)


def main():
    """Evaluate pretrained and model-exported word vectors on similarity tasks.

    Loads the pretrained vector sets listed in the global ``config``,
    appends the embeddings exported by a trained model run, and scores every
    set on the standard similarity benchmarks.  The pretrained sets are then
    re-scored restricted to the model's vocabulary so coverage differences
    do not skew the comparison.

    NOTE(review): the previous docstring claimed a config-file path could be
    passed as a command-line argument, but no argument handling exists in
    this function — confirm whether that logic lives elsewhere.
    """
    # load pretrained word vectors
    language = "english"
    pretrained_vectors = []
    for index, type_ in enumerate(config['word_vector_type']):
        pretrained_vectors.append(
            WordVectors(type_, config["pretrained_vectors"][index]))
        print("loaded vectors {}".format(config['word_vector_type'][index]))

    # Append the embeddings exported from a trained model run.  The path is
    # hard-coded to the most recent evaluation; earlier run paths that were
    # commented out here have been removed.
    type_ = "from_model"
    words_path = "./runs/1498124433/best_snaps/../evaluations/1498157705/words_embds.csv"
    pretrained_vectors.append(WordVectors(type_, words_path))
    config['word_vector_type'].append(type_)

    # Score every vector set (pretrained + model) on the full benchmarks.
    for vec_num_, word_vectors in enumerate(pretrained_vectors):
        print("\n============= Evaluating word vectors: {} for language: {}"
              " =============\n".format(config['word_vector_type'][vec_num_],
                                        (language)))
        _evaluate_benchmarks(word_vectors, language)

    # Re-score the pretrained sets restricted to the model's vocabulary.
    # (The original guarded this with flag_ = True, which was always true.)
    print(
        "\n======== Evaluating only words from specified dictionary ========\n"
    )
    dic_ = pretrained_vectors[-1].word_to_index
    for word_vectors in pretrained_vectors[:-1]:
        _evaluate_benchmarks(word_vectors, language, dictionary_=dic_)