import random

from word_vectors import WordVectors


def build_datasets(reduced=False, columns=None):
    """Build the train/dev/test datasets with a fixed seed for reproducibility."""
    random.seed(0)
    vectors = WordVectors(load=True)
    # The first pass additionally carves out a 15% tuning split from the
    # training data; the second builds the full training set.
    build_dataset(vectors, "train", reduced=reduced, columns=columns,
                  tune_fraction=0.15)
    build_dataset(vectors, "train", reduced=reduced, columns=columns)
    build_dataset(vectors, "dev", reduced=reduced, columns=columns)
    build_dataset(vectors, "test", reduced=reduced, columns=columns)
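# --- Illustrative sketch (not part of the original code) ---
# A minimal sketch of how the tuning split implied by `tune_fraction` could be
# produced with a seeded RNG; `build_dataset`'s real logic is defined elsewhere
# and may differ. `split_tune` and `examples` are hypothetical names; uses the
# `random` module imported above.
def split_tune(examples, tune_fraction=0.15, seed=0):
    rng = random.Random(seed)  # seeded, so the split is reproducible
    shuffled = list(examples)
    rng.shuffle(shuffled)
    cut = int(len(shuffled) * tune_fraction)
    return shuffled[cut:], shuffled[:cut]  # (train, tune)

# Example: train, tune = split_tune(range(100)) -> 85 train / 15 tune examples.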
import numpy as np

from word_vectors import WordVectors

# 'syn0_final.npy': word embeddings, a numpy array of shape [vocab_size, hidden_size]
# 'vocab.txt':      text file storing the vocabulary, one word per line
query = ','
num_similar_words = 10

syn0_final = np.load('syn0_final.npy')
with open('vocab.txt') as f:
    vocab_words = [line.strip() for line in f]

wv = WordVectors(syn0_final, vocab_words)
print(wv.most_similar(query, num_similar_words))
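# --- Illustrative sketch (not part of the original code) ---
# A minimal sketch of how a cosine-similarity `most_similar` could be
# implemented; the real WordVectors class may differ. All names below,
# WordVectorsSketch included, are hypothetical.
class WordVectorsSketch:
    def __init__(self, vectors, words):
        self.words = list(words)
        self.word_to_index = {w: i for i, w in enumerate(self.words)}
        # Pre-normalize rows so that a dot product equals cosine similarity.
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        self.normed = vectors / np.maximum(norms, 1e-8)

    def most_similar(self, query, k=10):
        sims = self.normed @ self.normed[self.word_to_index[query]]
        best = np.argsort(-sims)[1:k + 1]  # drop rank 0: the query itself
        return [(self.words[i], float(sims[i])) for i in best]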
import sys

import numpy as np

import evaluate
import train
from word_vectors import WordVectors

# Available corpora; SST at the phrase level is used here.
# dataset = Dataset("SST")
dataset = Dataset("SST_phrase")
# dataset = Dataset("IMDB", preprocess=True)
# dataset = Dataset("MR", preprocess=True)

# Load the pretrained word vectors listed in the experiment config
# (`config` is assumed to be loaded beforehand; see the sketch below).
pretrained_vectors = []
for index, type_ in enumerate(config['word_vector_type']):
    pretrained_vectors.append(
        WordVectors(type_, config["pretrained_vectors"][index]))
    print("loaded vectors {}".format(config['word_vector_type'][index]))

# data = dataset.cv_split(index=2)  # superseded by the explicit split below

# SST cross-validation split indices: 1 = train, 2 = test, 3 = dev.
# Take only the train and test splits.
t1 = dataset.cv_split(index=2)  # test
# t1 = dataset.cv_split(index=3)  # dev, if tuning instead
t2 = dataset.cv_split(index=1)  # train
data = [t2[2], t2[3], t1[2], t1[3]]
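# --- Illustrative sketch (not part of the original code) ---
# `config` above is assumed to come from the experiment config file
# (experiment_parameters.cfg, per main() below). A minimal loader, assuming an
# INI-style file with a [main] section holding comma-separated lists; the
# section name and key layout are assumptions, not the project's actual format.
import configparser

def load_config(path="experiment_parameters.cfg"):
    parser = configparser.ConfigParser()
    parser.read(path)
    section = parser["main"]
    return {
        "word_vector_type":
            [s.strip() for s in section["word_vector_type"].split(",")],
        "pretrained_vectors":
            [s.strip() for s in section["pretrained_vectors"].split(",")],
    }

# config = load_config(sys.argv[1] if len(sys.argv) > 1 else "experiment_parameters.cfg")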
def main():
    """
    The user can provide the location of the config file as an argument.
    If no location is specified, the default config file
    (experiment_parameters.cfg) is used.
    """
    # Load pretrained word vectors.
    language = "english"
    pretrained_vectors = []
    for index, type_ in enumerate(config['word_vector_type']):
        pretrained_vectors.append(
            WordVectors(type_, config["pretrained_vectors"][index]))
        print("loaded vectors {}".format(config['word_vector_type'][index]))

    # Load word embeddings exported by a trained model (evaluation path).
    type_ = "from_model"
    # Paths from earlier evaluation runs:
    # words_path = "./runs/1496739336/evaluations/1496933648/words_embds.csv"
    # words_path = "./runs/1496969351/best_snaps/../evaluations/1497016586/words_embds.csv"
    # words_path = "./runs/1497028147/best_snaps/../evaluations/1497092785/words_embds.csv"
    # words_path = "./runs/1497313304/best_snaps/../evaluations/1497438814/words_embds.csv"
    # words_path = "./runs/1497517616/best_snaps/../evaluations/1497862652/words_embds.csv"
    # words_path = "./runs/1497815260/best_snaps/../evaluations/1498091033/words_embds.csv"
    # words_path = "./runs/1497894091/best_snaps/../evaluations/1498148645/words_embds.csv"
    words_path = "./runs/1498124433/best_snaps/../evaluations/1498157705/words_embds.csv"
    pretrained_vectors.append(WordVectors(type_, words_path))
    config['word_vector_type'].append(type_)

    flag_ = True
    for vec_num_, word_vectors in enumerate(pretrained_vectors):
        print("\n============= Evaluating word vectors: {} for language: {}"
              " =============\n".format(config['word_vector_type'][vec_num_],
                                        language))

        simlex_score, simlex_coverage = evaluate_similarity(word_vectors,
                                                            language)
        print("SimLex-999 score and coverage:", simlex_score, simlex_coverage)

        # WordSim validation scores (overall, similarity, relatedness):
        c1, cov1 = evaluate_similarity(word_vectors, language, source=language)
        c2, cov2 = evaluate_similarity(word_vectors, language,
                                       source=language + "-sim")
        c3, cov3 = evaluate_similarity(word_vectors, language,
                                       source=language + "-rel")
        print("WordSim overall score and coverage:", c1, cov1)
        print("WordSim Similarity score and coverage:", c2, cov2)
        print("WordSim Relatedness score and coverage:", c3, cov3, "\n")

        men_score, men_coverage = evaluate_similarity(word_vectors, language,
                                                      source="MEN")
        print("MEN score and coverage:", men_score, men_coverage, "\n")

        # SimVerb, broken down by lexical relation.
        sim_relations = ["SYNONYMS", "ANTONYMS", "HYPER/HYPONYMS",
                         "COHYPONYMS", "NONE", "ALL"]
        for relation_ in sim_relations:
            sv_score, sv_coverage = evaluate_similarity(word_vectors,
                                                        relation_,
                                                        source="SimVerb")
            print("SimVerb {} score and coverage: {} {}".format(
                relation_, sv_score, sv_coverage))
        print("\n")

        rw_score, rw_coverage = evaluate_similarity(word_vectors, language,
                                                    source="rw")
        print("RW score and coverage:", rw_score, rw_coverage)

    if flag_:
        print("\n======== Evaluating only words from specified dictionary ========\n")
        # Restrict every vector set to the vocabulary of the model-exported
        # embeddings (the last entry), so all are compared on the same words.
        for vec_num_, word_vectors in enumerate(pretrained_vectors[:-1]):
            dic_ = pretrained_vectors[-1].word_to_index

            simlex_score, simlex_coverage = evaluate_similarity(
                word_vectors, language, dictionary_=dic_)
            print("SimLex-999 score and coverage:", simlex_score,
                  simlex_coverage)

            # WordSim validation scores (overall, similarity, relatedness):
            c1, cov1 = evaluate_similarity(word_vectors, language,
                                           source=language, dictionary_=dic_)
            c2, cov2 = evaluate_similarity(word_vectors, language,
                                           source=language + "-sim",
                                           dictionary_=dic_)
            c3, cov3 = evaluate_similarity(word_vectors, language,
                                           source=language + "-rel",
                                           dictionary_=dic_)
            print("WordSim overall score and coverage:", c1, cov1)
            print("WordSim Similarity score and coverage:", c2, cov2)
            print("WordSim Relatedness score and coverage:", c3, cov3, "\n")

            men_score, men_coverage = evaluate_similarity(word_vectors,
                                                          language,
                                                          source="MEN",
                                                          dictionary_=dic_)
            print("MEN score and coverage:", men_score, men_coverage, "\n")

            # SimVerb, broken down by lexical relation.
            sim_relations = ["SYNONYMS", "ANTONYMS", "HYPER/HYPONYMS",
                             "COHYPONYMS", "NONE", "ALL"]
            for relation_ in sim_relations:
                sv_score, sv_coverage = evaluate_similarity(word_vectors,
                                                            relation_,
                                                            source="SimVerb",
                                                            dictionary_=dic_)
                print("SimVerb {} score and coverage: {} {}".format(
                    relation_, sv_score, sv_coverage))
            print("\n")

            rw_score, rw_coverage = evaluate_similarity(word_vectors, language,
                                                        source="rw",
                                                        dictionary_=dic_)
            print("RW score and coverage:", rw_score, rw_coverage)
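# --- Illustrative sketch (not part of the original code) ---
# A minimal sketch of what `evaluate_similarity` is assumed to compute: the
# Spearman correlation between gold human judgements and cosine similarities,
# plus the fraction of benchmark pairs covered by the vocabulary. The real
# function (its signature, data loading, and `dictionary_` filter) may differ;
# `pairs` stands in for a loaded benchmark as (word1, word2, gold_score) tuples.
import numpy as np
from scipy.stats import spearmanr

def evaluate_similarity_sketch(word_vectors, pairs, dictionary_=None):
    gold, predicted, covered = [], [], 0
    for w1, w2, score in pairs:
        in_vocab = (w1 in word_vectors.word_to_index
                    and w2 in word_vectors.word_to_index)
        if dictionary_ is not None:  # optionally restrict to a given vocabulary
            in_vocab = in_vocab and w1 in dictionary_ and w2 in dictionary_
        if not in_vocab:
            continue  # pairs with out-of-vocabulary words reduce coverage
        covered += 1
        v1 = word_vectors.vectors[word_vectors.word_to_index[w1]]
        v2 = word_vectors.vectors[word_vectors.word_to_index[w2]]
        predicted.append(np.dot(v1, v2) /
                         (np.linalg.norm(v1) * np.linalg.norm(v2)))
        gold.append(score)
    correlation, _ = spearmanr(gold, predicted)
    return correlation, covered / len(pairs)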