def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-m', '--method', choices=['PCA', 'tSVD', 'DRA'], default='DRA')
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-n', '--reduce_to', type=int, default=300)
    parser.add_argument('-b', '--do_in_batches', action='store_true')
    parser.add_argument('-nb', '--batch_size', type=int, default=1024)
    args = parser.parse_args()

    emb = load_embedding(args.embedding, lower=False, length_normalize=False,
                         delete_duplicates=True)

    if args.method == 'PCA':
        if args.do_in_batches:
            emb.vectors = PPA_batches(emb.vectors, args.reduce_to, args.batch_size)
        else:
            emb.vectors = PCA(emb.vectors, args.reduce_to)
    elif args.method == 'tSVD':
        emb.vectors = T_SVD(emb.vectors, args.reduce_to)
    elif args.method == 'DRA':
        if args.do_in_batches:
            emb.vectors = DRA_batches(emb.vectors, args.reduce_to, args.batch_size)
        else:
            emb.vectors = DRA(emb.vectors, args.reduce_to)
    else:
        raise ValueError(str(args.method) + ' reduction method not supported. '
                         'Supported reduction methods: PCA, tSVD, DRA')

    emb.export(args.output)
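The PCA, T_SVD, DRA and *_batches helpers used above are project-specific and not shown in these snippets. As a rough illustration only, a plain PCA reduction of the vector matrix could be sketched with scikit-learn like this; the project's actual implementation may differ:

# Hypothetical sketch of the PCA(vectors, reduce_to) helper; not the project's code.
import numpy as np
from sklearn.decomposition import PCA as SklearnPCA

def PCA(vectors, reduce_to):
    # Center the data and keep the top `reduce_to` principal components.
    return SklearnPCA(n_components=reduce_to).fit_transform(np.asarray(vectors))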
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'
    emb = load_embedding(path_input,
                         format=format,
                         vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
                         length_normalize=args.length_normalize,
                         normalize_dimensionwise=args.normalize_dimensionwise,
                         to_unicode=True,
                         lower=args.lower,
                         path2='',
                         delete_duplicates=True,
                         method_vgg="delete")
    printTrace('Saving result to ' + str(path_output) + '...')
    num_words = 0
    with open(path_output, 'w+') as file:
        for i_word, word in enumerate(emb.words):
            if i_word % 5000 == 0:
                string = "<" + str(datetime.datetime.now()) + "> " + 'Converting: ' + str(
                    int(100 * i_word / len(emb.words))) + '%'
                print(string, end="\r")
            if args.language is None or any(l in word.split(args.delimiter) for l in args.language):
                print(word.split(args.delimiter)[-1] + ' ' +
                      ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
                      file=file)
                num_words += 1
    print()
    if args.word2vec:
        # Prepend the word2vec header ("<num_words> <dims>") in place with sed.
        excec_com = 'sed -i \'1s/^/' + str(num_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(path_output)
        print(excec_com)
        os.system(excec_com)
    printTrace('Done.')
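Two small helpers recur across these snippets, printTrace and vocab_from_path, and neither is defined here. Minimal sketches of what they plausibly do, assuming word2vec-style text files where the first token of each line is the word:

import datetime

def printTrace(message):
    # Timestamped progress line, matching the "<datetime> message" strings above.
    print("<" + str(datetime.datetime.now()) + "> " + str(message))

def vocab_from_path(path):
    # Collect the first whitespace-separated token of every line as the vocabulary.
    # Note: a word2vec text file starts with a "<num_words> <dims>" header line,
    # which a real implementation would need to skip.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return set(line.split(' ', 1)[0] for line in f if line.strip())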
def kmeans4embedding(embedding_path, output_path, k, metric, batch_size):
    printTrace('Loading embedding ' + str(embedding_path))
    emb = load_embedding(embedding_path, lower=False, length_normalize=False,
                         delete_duplicates=True)
    printTrace('Clustering for embedding ' + str(embedding_path))
    labels = doKmeans(emb.vectors, k, metric, batch_size)
    printTrace('Printing clusters for embedding ' + str(embedding_path))
    with open(output_path, 'w') as file:
        for i_label, label in enumerate(labels):
            print(emb.vocabulary.index_to_word(i_label) + ' ' + str(label), file=file)
    printTrace('Sorting clusters for embedding ' + str(embedding_path))
    # Sort the output by cluster id (second column) and replace the original file.
    excec_com = 'sort -k2 -n ' + str(output_path) + ' > ' + str(output_path) + '_sorted'
    print(excec_com)
    os.system(excec_com)
    excec_com = 'rm ' + str(output_path)
    print(excec_com)
    os.system(excec_com)
    excec_com = 'mv ' + str(output_path) + '_sorted ' + str(output_path)
    print(excec_com)
    os.system(excec_com)
    printTrace('Done, clusters saved in ' + str(output_path))
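doKmeans is also undefined in these snippets. A minimal sketch assuming scikit-learn's MiniBatchKMeans; since scikit-learn's k-means is Euclidean-only, a 'cosine' metric is approximated here by unit-normalizing the rows first (how the project actually handles the metric argument is unknown):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

def doKmeans(vectors, k, metric, batch_size):
    vectors = np.asarray(vectors, dtype=float)
    if metric == 'cosine':
        # Unit-length rows make Euclidean k-means behave like cosine clustering.
        vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    return MiniBatchKMeans(n_clusters=k, batch_size=batch_size).fit_predict(vectors)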
def __init__(
    self,
    embedding_path='/home/iker/Documents/QuestionCluster/TechEmbeddings/embeddings_lower.vec'
):
    self.questions = []
    self.questions_normalized = []
    self.questions_vectors = []
    # defaultdict() with no default_factory behaves like a plain dict.
    self.keywords = collections.defaultdict()
    self.embedding = load_embedding(embedding_path)
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    parser.add_argument('-b', '--batch_size', type=int, default=512)
    parser.add_argument('-dic', '--dictionary_path', type=str,
                        default='DictionaryInductionDataset/es-en.test')
    parser.add_argument('-p', '--add_lang_prefix', action='store_true')
    args = parser.parse_args()

    if args.embedding is not None:
        emb_list = [args.embedding]
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    if not os.path.exists('Results'):
        os.makedirs('Results')

    for emb_i, emb_path in enumerate(emb_list):
        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' +
                   str(len(emb_list)) + ' : ' + str(emb_path))
        emb = load_embedding(emb_path, lower=False, length_normalize=True,
                             delete_duplicates=True)
        top1, top2, top3, top5, top10, coverage = evaluate_dictionary_induction(
            emb, args.dictionary_path, args.batch_size, emb_path, args.add_lang_prefix)
        with open('Results/dictionary_induction', 'a+') as file:
            print(','.join([
                str(emb_path), str(top1), str(top2), str(top3), str(top5),
                str(top10), str(coverage)
            ]), file=file)

    print('Results have been exported in csv format to the Results folder')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-d', '--dataset_path', default='AnalogyDataset/questions-words.txt')
    parser.add_argument('-b', '--batch_size', type=int, default=512)
    args = parser.parse_args()

    print(' >>> loading embedding <<< ')
    emb = load_embedding(args.embedding, lower=False, length_normalize=True,
                         delete_duplicates=True)
    name = args.embedding if args.name_of_embedding is None else args.name_of_embedding

    if not os.path.exists('Results'):
        os.makedirs('Results')

    print('>>> Results deleting oov <<< ')
    results = evaluate_analogy(emb.vocabulary.word_id,
                               emb.vectors,
                               dataset_path=args.dataset_path,
                               lowercase=args.lowercase_dataset,
                               BATCH_SIZE=args.batch_size)
    print_to_csv_analogy(results, name=name,
                         filenameResults='Results/Analogy_Results_delete.csv')

    print()
    print()
    print('>>> Result using mean of all word vectors as OOV <<<')
    # emb.vocabulary.word_id['<<UKN>>'] = len(emb.words)
    # Append the mean vector; its index (len(emb.vectors) - 1) serves as the OOV backoff.
    emb.vectors = np.append(emb.vectors, [np.mean(emb.vectors, axis=0)], axis=0)
    results = evaluate_analogy(emb.vocabulary.word_id,
                               emb.vectors,
                               dataset_path=args.dataset_path,
                               lowercase=args.lowercase_dataset,
                               BATCH_SIZE=args.batch_size,
                               backoff_vector=len(emb.vectors) - 1)
    print_to_csv_analogy(results, name=name,
                         filenameResults='Results/Analogy_Results_mean.csv')

    print('Results have been exported in csv format to the Results folder')
def benchmark():
    global device
    get_files()
    print("Running Benchmark..")
    time = datetime.datetime.now()

    emb = load_embedding('RWSGwn.emb', length_normalize=False, delete_duplicates=True)
    time = print_time('Loading embedding from Disk to RAM step', time)

    emb.length_normalize()
    time = print_time(
        'Embedding length normalization step (' + CPUcolor + 'CPU' + RESETcolor + ')',
        time)

    vocab_to_search = emb.words
    for i in range(100):
        for word in vocab_to_search:
            v = emb.word_to_vector(word)
    time = print_time(
        'Searching for vocabulary step (' + CPUcolor + 'CPU' + RESETcolor + ')', time)

    m = emb.vectors
    M = emb.vectors
    for i_batch, mb in enumerate(batch(m, batch_size)):
        _ = matrix_dot(mb, M)
    time = print_time(
        'Matrix dot product step ' +
        ('(' + CPUcolor + 'CPU' + RESETcolor + ')' if device == 'CPU' else
         '(' + GPUcolor + 'GPU' + RESETcolor + ')'), time)

    for i_batch, mb in enumerate(batch(m, batch_size)):
        _ = cosine_knn(mb, M, 10)
    time = print_time(
        'Searching for nearest neighbors step ' +
        ('(' + CPUcolor + 'CPU' + RESETcolor + ')' if device == 'CPU' else
         '(' + GPUcolor + 'GPU' + RESETcolor + ')'), time)

    emb.export('temp.emb')
    time = print_time('Exporting embedding from RAM to Disk step', time)
    os.remove("temp.emb")
    print()
    print("Benchmark is over.")
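The batch helper that the benchmark (and several scripts below) iterates over is not shown; it is presumably the usual fixed-size chunking generator, something like:

def batch(iterable, n=1):
    # Yield consecutive slices of n rows; the last slice may be shorter.
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]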
def __init__(self, dictname, wordvectdim):
    print('Loading ' + dictname + '... (This might take one or two minutes.)')
    self.wordtoindex = dict()
    self.indextovector = []
    # Index 0 is reserved as a zero vector; real words start at index 1.
    self.indextovector.append(np.zeros(wordvectdim))
    emb = load_embedding(dictname, length_normalize=False, to_unicode=False,
                         lower=False, delete_duplicates=True)
    for i_word, word in enumerate(emb.words):
        self.wordtoindex[word] = i_word + 1
        self.indextovector.append(emb.word_to_vector(word))
    del emb
    # print(self.indextovector.shape)
    self.indextovector = np.array(self.indextovector, dtype='float32')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-l', '--search_words', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)
    args = parser.parse_args()

    emb = load_embedding(args.embedding, vocabulary=None, lower=False,
                         length_normalize=True, normalize_dimensionwise=False,
                         delete_duplicates=True)
    words_2_search = vocab_from_path(args.search_words)

    m = emb.words_to_matrix(words_2_search)
    M = emb.words_to_matrix(emb.words)

    nn = []
    for i_batch, mb in enumerate(batch(m, args.batch_size)):
        string = "<" + str(datetime.datetime.now()) + "> " + 'Calculating nn words ' + str(
            int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")
        result = cosine_knn(mb, M, args.num_nearest_neighbor)
        for i_result, indexes in enumerate(result):
            nn.append(["\"" + emb.words[i] + "\"" for i in indexes])

    # Use a context manager so the output file is flushed and closed properly.
    with open(args.output, 'w+', encoding='utf-8') as file:
        for word, nns in zip(words_2_search, nn):
            print(word + ': ' + ' '.join(nns), file=file)
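cosine_knn is assumed to return, for each row of the query batch mb, the indices of its k most cosine-similar rows in M. Since the embeddings here are loaded with length_normalize=True, cosine similarity reduces to a dot product; a minimal numpy sketch under that assumption:

import numpy as np

def cosine_knn(mb, M, k):
    # With unit-length rows, cosine similarity is a plain matrix product.
    sims = np.asarray(mb).dot(np.asarray(M).T)
    # argpartition finds each row's k largest similarities without a full sort...
    top = np.argpartition(-sims, k, axis=1)[:, :k]
    # ...then only those k candidates are ordered by descending similarity.
    rows = np.arange(sims.shape[0])[:, None]
    return top[rows, np.argsort(-sims[rows, top], axis=1)]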
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'
    emb = load_embedding(path_input,
                         format=format,
                         vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
                         length_normalize=args.length_normalize,
                         normalize_dimensionwise=args.normalize_dimensionwise,
                         to_unicode=True,
                         lower=args.lower,
                         path2='',
                         delete_duplicates=True,
                         method_vgg="delete")
    printTrace('Saving result to ' + str(path_output) + '...')
    emb.export(path=path_output, printHeader=args.word2vec)
    printTrace('Done.')
print('Train: %d | Valid: %d | Test: %d' % (len(train), len(valid), len(test)))
train_engine = DataLoader(data.DataEngine(train, vocabulary, pad_lens),
                          batch_size=args.batch_size,
                          shuffle=True,
                          num_workers=8,
                          pin_memory=use_cuda)
valid_engine = DataLoader(data.DataEngine(valid, vocabulary, pad_lens),
                          batch_size=args.batch_size,
                          shuffle=True,
                          num_workers=8,
                          pin_memory=use_cuda)
test_engine = data.DataEngine(test, vocabulary, pad_lens)

if args.init_embedding:
    w2v = load_embedding(args.embedding_source, vocabulary.to_idx, 300)
else:
    w2v = None

fusion_net = FusionNet(vocab_size=len(vocabulary),
                       word_dim=300,
                       hidden_size=125,
                       rnn_layer=args.rnn_layer,
                       dropout=args.dropout,
                       pretrained_embedding=w2v)
if use_cuda:
    fusion_net = fusion_net.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adamax(fusion_net.parameters())
Joint_path = '../../Embeddings/'

print("====ENGLISH-SPANISH===")

# English words present in both joint embeddings.
words_eng = []
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENIT.emb'))
english_words = list(set.intersection(*words_eng))

# Spanish words present in both joint embeddings.
words_es = []
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ESIT.emb'))
spanish_words = list(set.intersection(*words_es))

emb = load_embedding(Joint_path + 'JOINTC-HYB-ENES.emb',
                     length_normalize=False, delete_duplicates=True)

with open('../../Embeddings/separated/JointENES.vec', 'w') as file:
    # word2vec header: "<num_words> <dims>".
    print(str(len(spanish_words) + len(english_words)) + ' 300', file=file)
    for word in english_words:
        print('en/' + word + ' ' +
              ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]), file=file)
    for word in spanish_words:
        print('es/' + word + ' ' +
              ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]), file=file)
# run the test for SemEval2016
if args.semeval2016 or args.all:
    lr = 1e-3
    weight_decay = 0
    embed_lr = 1e-2
    embed_weight_decay = 1e-4
    data = dataset.load.SemEval(year="2016", leave_dev=True, fraction=0.1,
                                seed=args.seed)
    print("number of train data:{}".format(len(data['train'][0])))
    print("number of dev data:{}".format(len(data['dev'][0])))
    print("number of test data:{}".format(len(data['test'][0])))
    word_list = dataset.load.get_word_list(data['train'][0])
    embedding_matrix, token_to_idx, idx_to_token = embedding.load_embedding(
        args.embedding_path, word_list, isBinary=args.isBinary)
    obj_value, result = run_exp(data,
                                embedding_matrix,
                                token_to_idx,
                                seed=args.seed,
                                weight_decay=weight_decay,
                                lr=lr,
                                max_len=args.max_len,
                                batch_size=50,
                                obj='loss',
                                embed_lr=embed_lr,
                                epoches=100,
                                embed_weight_decay=embed_weight_decay)
    print("{}\t{}\t{}".format(obj_value, result['macro_f1'],
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--embedding", type=str, required=True) parser.add_argument("-c", "--emb_4_generation", type=str, required=True) parser.add_argument("-d", "--dataset", type=str, required=True) parser.add_argument("-b", "--batch_size", type=int, default=1024) parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10) args = parser.parse_args() dims = get_dimensions(args.embedding) if dims != get_dimensions(args.emb_4_generation): raise ValueError( "All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format" ) printTrace("Reading vocab...") vocab_emb = vocab_from_path(args.embedding) vocab_cross = vocab_from_path(args.emb_4_generation) dataset = get_dataset(args.dataset) vocab_to_generate = list( set(np.append((dataset.X[:, 0]), (dataset.X[:, 1])))) vocab_to_generate_set = set(vocab_to_generate) vocab_emb_delete = [x for x in vocab_emb if x not in vocab_to_generate_set] total_vocab = set.union(set(vocab_emb_delete), set(vocab_cross)) interset_vocab = list( set.intersection(set(vocab_emb_delete), set(vocab_cross))) print("Final embedding will have " + str(len(total_vocab)) + " words") print("We will generate " + str(len(vocab_to_generate)) + " words") emb = load_embedding( args.emb_4_generation, vocabulary=None, lower=False, length_normalize=True, normalize_dimensionwise=False, delete_duplicates=True, ) m = emb.words_to_matrix(vocab_to_generate) M = emb.words_to_matrix(interset_vocab) nn = [] for i_batch, mb in enumerate(batch(m, args.batch_size)): string = ("<" + str(datetime.datetime.now()) + "> " + "Using Embedding " + str(args.emb_4_generation) + " to generate vocab for Embedding " + str(args.embedding) + ": " + str(int(100 * (i_batch * args.batch_size) / len(m))) + "%") print(string, end="\r") # print(np.asarray(mb).shape) # print(np.asarray(M).shape) result = cosine_knn(mb, M, args.num_nearest_neighbor) for i_result, indexes in enumerate(result): nn.append([interset_vocab[i] for i in indexes]) del emb printTrace("===> Generating new_vocab <===") emb = load_embedding( args.embedding, vocabulary=vocab_emb_delete, lower=False, length_normalize=False, normalize_dimensionwise=False, delete_duplicates=True, ) new_vectors = [] for i_word, word in enumerate(vocab_to_generate): if i_word % 1000 == 0: string = ("<" + str(datetime.datetime.now()) + "> " + "Generating vocab " + ": " + str(int(100 * i_word / len(vocab_to_generate))) + "%") print(string, end="\r") try: lw = nn[i_word] v = np.zeros([dims], dtype=float) for word_nn in lw: v += emb.word_to_vector(word_nn) except KeyError as r: raise ValueError( "Something went wrong in the word generation process") new_vectors.append(v / args.num_nearest_neighbor) print() del emb printTrace("===> Loading embeddings to compare <===") emb_generated = Embedding(vocabulary=Vocabulary(vocab_to_generate), vectors=new_vectors) emb_original = load_embedding( args.embedding, vocabulary=vocab_to_generate, lower=False, length_normalize=False, normalize_dimensionwise=False, delete_duplicates=True, ) printTrace("===> Evaluate <===") print("Original Embedding: ", end="") print( similarity_emd( emb_original, dataset.X, dataset.y, backoff_vector=None, lower=False, lang1prefix=None, lang2prefix=None, )) print("Generated Embedding: ", end="") print( similarity_emd( emb_generated, dataset.X, dataset.y, backoff_vector=None, lower=False, lang1prefix=None, lang2prefix=None, ))
def main(): """Steps: 1. preprocess data: - tokenization (sentence tokenizer) - separate article and reference summary - chunk into train and test 2. data generation: for each reference summary, do the following mutation operations: deletion, insertion, mutation. According to how much are changed, assign a score. 3. sentence embedding: embed article and summary into sentence vectors. This is the first layer, the embedding layer. Then, do a padding to get the vector to the same and fixed dimension (e.g. summary 20, article 100). FIXME what to do for very long article? Then, fully connected layer directly to the final result. """ # data v1 (x_train, y_train), (x_val, y_val) = prepare_data() # data v2 articles, summaries, scores = load_text_data() # this is pretty time consuming, so save it tokenizer = prepare_tokenizer(articles + summaries) # alternatively, save and load. Note that you must ensure to fit # on the same text. save_tokenizer(tokenizer) tokenizer = load_tokenizer() # this is also slow (x_train, y_train), (x_val, y_val) = prepare_data_v2(articles, summaries, scores, tokenizer) # save and load the data # save_data(x_train, y_train, x_val, y_val) # (x_train, y_train), (x_val, y_val) = load_data() x_train.shape y_train.shape x_val.shape y_val.shape # model v1 model = build_model() # model v2 embedding_layer = load_embedding(tokenizer) model = build_glove_model(embedding_layer) # training op optimizer = tf.train.RMSPropOptimizer(0.001) # optimizer=tf.train.AdamOptimizer(0.01) model.compile( optimizer=optimizer, # loss='binary_crossentropy', loss='mse', # metrics=['accuracy'] metrics=['mae']) model.fit(x_train, y_train, epochs=40, batch_size=128, validation_data=(x_val, y_val), verbose=1) model.summary()
def main():
    '''
    Main function that coordinates the entire process.
    Parses arguments that specify the exercise and the experiment that should be run.
    Initializes the model and the checkpoint managers.
    '''
    parser = argparse.ArgumentParser(
        description='Define configuration of experiments')
    parser.add_argument('--mode', type=str, nargs='+',
                        choices=['train', 'evaluate', 'generate'], required=True)
    parser.add_argument('--experiment', type=str, choices=['a', 'b', 'c'],
                        required=True)
    parser.add_argument('--id', type=str, required=False)
    parser.add_argument('--epochs', type=int, default=EPOCHS, required=False)
    args = parser.parse_args()

    # Setting Experiment Id
    if args.id is None:
        exp_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        print(f"No Experiment Id Set, Creating New: {exp_id}")
    else:
        exp_id = args.id
        print(f"Using Experiment Id: {exp_id}")

    # Setting Directories
    base_dir = f"{OUTPUT_DIR}/exp_{args.experiment}/{exp_id}"
    log_dir = f"{base_dir}/logs"
    submission_dir = f"{base_dir}/submissions"
    if not os.path.exists(submission_dir):
        os.makedirs(submission_dir)
    ckpt_dir = f"{base_dir}/ckpts"
    print(f"Experiment Directory: {base_dir}")

    print(f"Using Tensorflow Version: {tf.__version__}")

    print("Building Vocabulary...")
    build_vocab(input_file=PATH_TRAIN, output_file=PATH_VOCAB,
                top_k=VOCAB_SIZE, special=SPECIAL)
    word2id, id2word = build_vocab_lookup(PATH_VOCAB, "<unk>")

    # Setting Experiment Specific Configurations
    if args.experiment == 'a':
        lstm_hidden_state_size = 512
        word_embeddings = None
    elif args.experiment == 'b':
        lstm_hidden_state_size = 512
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)
    elif args.experiment == 'c':
        lstm_hidden_state_size = 1024
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)
    else:
        raise ValueError(f"Unknown Experiment {args.experiment}")

    print(f'Initializing Model...')
    model = LanguageModel(vocab_size=VOCAB_SIZE,
                          sentence_length=SENTENCE_LENGTH,
                          embedding_size=EMBEDDING_SIZE,
                          hidden_state_size=lstm_hidden_state_size,
                          output_size=LSTM_OUTPUT_SIZE,
                          batch_size=BATCH_SIZE,
                          word_embeddings=word_embeddings,
                          index_to_word_table=id2word)

    print(f'Initializing Optimizer...')
    optimizer = tf.keras.optimizers.Adam()

    ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=model)
    manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=5)
    if manager.latest_checkpoint:
        print(f"Restoring Model from {manager.latest_checkpoint}...")
        ckpt.restore(manager.latest_checkpoint)
        model_loaded = True
    else:
        print("Initializing Model from Scratch")
        model_loaded = False

    if "train" in args.mode:
        print(f"Starting Training...")
        train_summary_writer = tf.summary.create_file_writer(f"{log_dir}/train")
        with train_summary_writer.as_default():
            train(ckpt=ckpt, manager=manager, model=model, optimizer=optimizer,
                  word2id=word2id, id2word=id2word, epochs=args.epochs)
        model_loaded = True

    if "evaluate" in args.mode:
        print(f"Starting Evaluation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to be evaluated'
        test_summary_writer = tf.summary.create_file_writer(f"{log_dir}/evaluate")
        with test_summary_writer.as_default():
            evaluate(model=model,
                     word2id=word2id,
                     id2word=id2word,
                     step=optimizer.iterations,
                     path_submission=f"{submission_dir}/group35.perplexity{args.experiment.upper()}")

    if "generate" in args.mode:
        print(f"Starting Generation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to start generation'
        generate_summary_writer = tf.summary.create_file_writer(f"{log_dir}/generate")
        with generate_summary_writer.as_default():
            generate(word2id, id2word, model=model,
                     path_submission=f"{submission_dir}/group35.continuation")
def concatenate_embeddings_generate(embeddings_path, out_path, vocab=None,
                                    batch_size=1024, k=10):
    printTrace("Reading vocab...")
    # [[vocab_emb1], [vocab_emb_2], ...]
    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        [
            print("Word " + str(w) + " not found in any embedding")
            for w in word_id - union
        ]
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")
    for i_voc, voc in enumerate(vocab_embeddings):
        print("Embedding " + str(i_voc) + " has " + str(len(voc)) + " words.")
        print("We will generate " + str(len(set(word_id) - voc)) +
              " words for the embedding " + str(i_voc))
    print()

    printTrace("Building matrix for word generation...")
    # generation_vocab_matrix[y][x]: the words that embedding y will generate
    # for embedding x (words missing from x that are present in y).
    generation_vocab_matrix = [[x for x in range(len(embeddings_path))]
                               for x in range(len(embeddings_path))]
    nn_vocab = [defaultdict() for x in range(len(embeddings_path))]

    for x, emb1 in enumerate(vocab_embeddings):
        vocab_to_generate = set(word_id) - emb1
        for y, emb2 in enumerate(vocab_embeddings):
            generation_vocab_matrix[y][x] = list(vocab_to_generate.intersection(emb2))
            vocab_to_generate = vocab_to_generate - emb2

    printTrace("===> Calculating nearest neighbors <===")
    for i_emb_path, emb_path in enumerate(embeddings_path):
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )
        for i_g, g in enumerate(generation_vocab_matrix[i_emb_path]):
            if len(g) > 0:
                # print('G: ' + str(g))
                m = emb.words_to_matrix(g)  # generation_vocab_matrix[i_emb_path][i_g]
                # print(len(m))
                # print(generation_vocab_matrix[x][gi])
                interset_vocab = list(
                    set.intersection(vocab_embeddings[i_emb_path],
                                     vocab_embeddings[i_g]))
                M = emb.words_to_matrix(interset_vocab)
                total_words = len(m)
                for i_batch, mb in enumerate(batch(m, batch_size)):
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Using Embedding " + str(i_emb_path) +
                              " to generate vocab for Embedding " + str(i_g) + ": " +
                              str(int(100 * (i_batch * batch_size) / total_words)) + "%")
                    print(string, end="\r")
                    result = cosine_knn(mb, M, k)
                    for i_result, indexes in enumerate(result):
                        nn_vocab[i_g][g[i_result + (batch_size * i_batch)]] = [
                            interset_vocab[i] for i in indexes
                        ]
                print()

    printTrace("===> Calculating meta embedding <===")
    total_words = len(word_id)
    first_emb = True
    if not os.path.exists("tmp"):
        os.makedirs("tmp")
    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )
        total_dims += emb.dims
        string = "<" + str(datetime.datetime.now()) + "> " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError as r:
                try:
                    # OOV for this embedding: average its k nearest neighbors.
                    lw = nn_vocab[x][w]
                    v = np.zeros([emb.dims], dtype=float)
                    for word in lw:
                        v += emb.word_to_vector(word)
                except KeyError as r:
                    raise ValueError(
                        "Something went wrong in the word generation process")
                m = normalize_vector(v / k)
            matrix.append(m)
            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embedding for embedding " + str(x) +
                          ": " + str(int(100 * wi / total_words)) + "%")
                print(string, end="\r")
        print()

        with open("tmp/" + str(x), "w") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / total_words)) + "%")
                    print(string, end="\r")
        print()
        first_emb = False

    printTrace("Concatenation...")
    # Join the per-embedding files column-wise, then prepend the word2vec header.
    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        os.system("rm -rf tmp")
    except:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-c', '--cross_embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)
    args = parser.parse_args()

    dims = get_dimensions(args.embedding)
    if dims != get_dimensions(args.cross_embedding):
        raise ValueError('All the embeddings must have the same number of dimensions '
                         'and the embeddings must be in the word2vec format')

    printTrace('Reading vocab...')
    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.cross_embedding)
    total_vocab = set.union(set(vocab_emb), set(vocab_cross))
    interset_vocab = list(set.intersection(set(vocab_emb), set(vocab_cross)))
    vocab_to_generate = set(vocab_cross) - set(vocab_emb)

    print('Final embedding will have ' + str(len(total_vocab)) + ' words')
    print('We will generate ' + str(len(vocab_to_generate)) + ' words')

    emb = load_embedding(args.cross_embedding, vocabulary=None, lower=False,
                         length_normalize=True, normalize_dimensionwise=False,
                         delete_duplicates=True)

    m = emb.words_to_matrix(vocab_to_generate)
    M = emb.words_to_matrix(interset_vocab)

    nn = []
    for i_batch, mb in enumerate(batch(m, args.batch_size)):
        string = "<" + str(datetime.datetime.now()) + "> " + 'Using Embedding ' + str(
            args.cross_embedding) + ' to generate vocab for Embedding ' + str(
            args.embedding) + ': ' + str(
            int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")
        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)
        result = cosine_knn(mb, M, args.num_nearest_neighbor)
        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])
    del emb

    printTrace('===> Generating new_vocab <===')
    emb = load_embedding(args.embedding, vocabulary=None, lower=False,
                         length_normalize=False, normalize_dimensionwise=False,
                         delete_duplicates=True)

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = "<" + str(datetime.datetime.now()) + "> " + 'Generating vocab ' + \
                args.output + ': ' + str(int(100 * i_word / len(vocab_to_generate))) + '%'
            print(string, end="\r")
        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)
        except KeyError as r:
            raise ValueError('Something went wrong in the word generation process')
        new_vectors.append(v / args.num_nearest_neighbor)
    print()

    printTrace('===> Printing to file <===')
    with open(args.output, 'w') as file:
        print(str(len(emb.words) + len(vocab_to_generate)) + ' ' + str(dims), file=file)
        for w in emb.words:
            print(w + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(w)]),
                  file=file)
        for w_i, w in enumerate(vocab_to_generate):
            print(w + ' ' + ' '.join(['%.6g' % x for x in new_vectors[w_i]]),
                  file=file)
import sys
import argparse

sys.path.insert(0, '../')
from embedding import load_embedding

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', required=True)
parser.add_argument('-o', '--output', required=True)
args = parser.parse_args()

emb = load_embedding(args.input, length_normalize=False, delete_duplicates=True)

with open(args.output, 'w+') as file:
    print(str(len(emb.words)) + ' ' + str(emb.dims), file=file)
    for word in emb.words:
        # Strip the "lang/" prefix from each word before writing it out.
        print(''.join(word.split('/')[1:]) + ' ' +
              ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
              file=file)
logging.basicConfig(format=fmt, level=lvl)
logging.debug("Logging started on %s for %s" %
              (logging.root.name, logging.getLevelName(lvl)))

# nltk.download('punkt')
# nltk.download('stopwords')

API_KEY = 'AIzaSyB-OHHc_t1PlT2TbiQY67-yUuAe7tqotfg'

ruta = os.path.join(os.path.abspath("."), "Consultas\\Embeddings")
ruta = os.path.join(ruta, "esTech_enTech_1.vec")
# EMBEDDING_PATH = '.\Embeddings\esTech_enTech_1.vec'
EMBEDDING_PATH = ruta
embedding = load_embedding(EMBEDDING_PATH)
QM2 = Question_Manager(embedding)
K_CLUSTERS = 3
MAX_COMMENTS = 50


def transformarListaComentarios_Json(ListaDeListas):
    ListaFinal = []
    TemasNumeros = list(range(1, len(ListaDeListas) + 1))
    for indice, Lista in enumerate(ListaDeListas):
        Dicc = {}
        data = []
        Dicc["topic name"] = "Tema #" + str(TemasNumeros[indice])
        for i, element in enumerate(Lista):
import sys
import argparse

sys.path.insert(0, '../')
from embedding import load_embedding

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--embeddings', nargs='+', required=True)
parser.add_argument('-o', '--output', required=True)
args = parser.parse_args()

embs = []
total_words = 0
dims = 0

for emb in args.embeddings:
    embs.append(load_embedding(emb, length_normalize=False, delete_duplicates=True))

for emb in embs:
    total_words += len(emb.words)

dims = embs[0].dims
for emb in embs:
    if emb.dims != dims:
        raise ValueError(
            'All the embeddings must have the same number of dimensions and '
            'the embeddings must be in the word2vec format')

with open(args.output, 'w+', encoding='utf-8') as file:
    print(str(total_words) + ' ' + str(dims), file=file)
def concatenate_embeddings(
        embeddings_path,
        out_path,
        vocab,
):
    printTrace("===> Calculating meta embedding (No OOV) <===")

    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        [
            print("Word " + str(w) + " not found in any embedding")
            for w in word_id - union
        ]
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    first_emb = True
    if not os.path.exists("tmp_conc"):
        os.makedirs("tmp_conc")
    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )
        total_dims += emb.dims
        string = "<" + str(datetime.datetime.now()) + "> " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            # Words missing from this embedding keep a zero vector (no OOV generation).
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError as r:
                pass
            matrix.append(m)
            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embedding for embedding " + str(x) +
                          ": " + str(int(100 * wi / len(word_id))) + "%")
                print(string, end="\r")
        print()

        with open("tmp_conc/" + str(x), "w+", encoding="utf-8") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / len(word_id))) + "%")
                    print(string, end="\r")
        print()
        first_emb = False

    printTrace("Concatenation...")
    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp_conc/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        # The temporary directory is "tmp_conc" in the working directory, not
        # "/tmp_conc"; the original absolute path could never match what was created.
        shutil.rmtree("tmp_conc")
    except OSError:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    # parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-lg', '--language', nargs='+', default=['en'])
    parser.add_argument('-p', '--add_lang_prefix', action='store_true')
    parser.add_argument('-v', '--vocab', type=str, default=None)
    args = parser.parse_args()

    if args.embedding is not None:
        emb_list = [args.embedding]
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    for emb_i, emb_path in enumerate(emb_list):
        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' +
                   str(len(emb_list)) + ' : ' + str(emb_path))
        emb = load_embedding(emb_path,
                             vocabulary=(None if args.vocab is None else
                                         vocab_from_path(args.vocab)),
                             lower=False,
                             length_normalize=False,
                             delete_duplicates=True)

        for lang in args.language:
            lang1prefix = None
            lang2prefix = None
            if args.add_lang_prefix:
                if lang == 'en':
                    lang1prefix = 'en'
                    lang2prefix = 'en'
                elif lang == 'es':
                    lang1prefix = 'es'
                    lang2prefix = 'es'
                elif lang == 'enes':
                    lang1prefix = 'en'
                    lang2prefix = 'es'
                else:
                    logging.warning('Language not supported, could not add prefix')

            if not os.path.exists('Results_' + lang):
                os.makedirs('Results_' + lang)

            print('>>> Results deleting oov <<< ')
            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=None,
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                printRes=False,
                returnRes=True)
            export_to_csv(txtResults=a,
                          txtCov=b,
                          name=emb_path,
                          filenameResults='Results_' + lang + '/Sim_Results_delete.csv',
                          filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

            print('>>> Result using mean of all word vectors as OOV <<<')
            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=np.mean(emb.vectors, axis=0),
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                printRes=False,
                returnRes=True)
            export_to_csv(txtResults=a,
                          txtCov=b,
                          name=emb_path,
                          filenameResults='Results_' + lang + '/Sim_Results_mean.csv',
                          filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

    print('Results have been exported in csv format to the Results folder')
import argparse
import os
import sys

sys.path.insert(0, '../')
from embedding import load_embedding

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', required=True)
parser.add_argument('-o', '--output', required=True)
parser.add_argument('-p', '--prefix', required=True)
args = parser.parse_args()

emb = load_embedding(args.input,
                     vocabulary=None,
                     length_normalize=False,
                     normalize_dimensionwise=False,
                     to_unicode=True,
                     lower=False,
                     delete_duplicates=True)

n_words = 0
with open(args.output, 'w') as file:
    # Keep only words whose "lang/" prefix matches, stripping the prefix.
    for word in emb.words:
        if word.split('/')[0] == args.prefix:
            print(''.join(word.split('/')[1:]) + ' ' +
                  ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
                  file=file)
            n_words += 1

# Prepend the word2vec header ("<num_words> <dims>") in place with sed.
excec_com = 'sed -i \'1s/^/' + str(n_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(args.output)
print(excec_com)
os.system(excec_com)
def inference(sentence):
    print("input sentence:")
    print(sentence)
    sentences = []
    words = sentence.split(' ')
    sentences.append(words)
    sentences_embedding = embedding(sentences, batch_size, single_sentence_length)
    print("input embedding:")
    print(sentences_embedding)
    output = mod_inference(sentences_embedding)
    print("output vector:")
    print(output)
    return output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--embedding",
                        dest="embedding",
                        help="Read embedding from the path",
                        metavar="FILE",
                        required=True)
    parser.add_argument('--seed', nargs='?', dest="seed", const=1,
                        type=int)  # set default random seed to 1
    args = parser.parse_args()
    load_embedding(args.embedding, args.seed)
    inference(
        "in his first stab at the form , jacquot takes a slightly anarchic "
        "approach that works only sporadically .")