def init(wvec, cvec, output="", only_letters=False, vocab_limit=None, pairs=False, batch_size=1000):
    print >> stderr, "Vectors: {}, only_letters: {}".format(wvec, only_letters)
    print >> stderr, "Loading vectors from {}".format(wvec)
    tic = time()
    wvectors = load_vectors(wvec, binary=False)
    print >> stderr, "Vectors loaded in %d sec." % (time() - tic)
    print >> stderr, "Vectors shape is: ", wvectors.syn0norm.shape

    print >> stderr, "Loading vectors from {}".format(cvec)
    tic = time()
    cvectors = load_vectors(cvec, binary=False)
    print >> stderr, "Vectors loaded in %d sec." % (time() - tic)
    print >> stderr, "Vectors shape is: ", cvectors.syn0norm.shape

    vocab_size = len(wvectors.vocab)
    print("Vocabulary size: %i" % vocab_size)

    # Limit the number of words for which to collect contexts
    if vocab_limit and vocab_limit < vocab_size:
        vocab_size = vocab_limit
    words = wvectors.index2word[:vocab_size]
    print("Collect activated contexts for %i most frequent words" % vocab_size)

    with codecs.open(output, 'wb') if output else stdout as out:
        process(out, wvectors, cvectors, words, only_letters=only_letters,
                batch_size=batch_size, pairs=pairs)
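# load_vectors itself is not shown in these snippets. Given the syn0norm /
# vocab / index2word attribute access above, a minimal sketch of what it
# plausibly looks like -- an assumption based on the pre-4.0 gensim API, not
# the repository's actual implementation:
from gensim.models import KeyedVectors

def load_vectors(path, binary=False):
    vectors = KeyedVectors.load_word2vec_format(path, binary=binary)
    vectors.init_sims(replace=True)  # populates syn0norm with unit-length rows
    return vectors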
def init(fvec, output="", only_letters=False, vocab_limit=None, pairs=False, batch_size=1000, word_freqs=None):
    print >> stderr, "Vectors: {}, only_letters: {}".format(fvec, only_letters)
    print >> stderr, "Loading vectors from {}".format(fvec)
    tic = time()
    vectors = load_vectors(fvec, binary=True)
    print >> stderr, "Vectors loaded in %d sec." % (time() - tic)
    print >> stderr, "Vectors shape is: ", vectors.syn0norm.shape

    vocab_size = len(vectors.vocab)
    print("Vocabulary size: %i" % vocab_size)

    # Limit the number of words for which to collect neighbours
    if vocab_limit and vocab_limit < vocab_size:
        vocab_size = vocab_limit
    words = vectors.index2word[:vocab_size]
    print("Collect neighbours for %i most frequent words" % vocab_size)

    freq = None
    if word_freqs:
        freq_dict = load_freq(word_freqs)
        freq = order_freq(vectors, freq_dict)
        print "freqs loaded. Length ", len(freq), freq[:10]

    with codecs.open(output, 'wb') if output else stdout as out:
        process(out, vectors, words, only_letters=only_letters,
                batch_size=batch_size, pairs=pairs, freq=freq)
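# load_freq and order_freq are likewise undefined here. Hypothetical helpers
# inferred from the call sites (freq is sliceable and aligned with the vector
# vocabulary); the real implementations and file format may differ:
import codecs

def load_freq(path):
    # Read "word<TAB>count" lines into a dict (assumed format)
    freq = {}
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word, count = line.rstrip('\n').split('\t')
            freq[word] = int(count)
    return freq

def order_freq(vectors, freq_dict):
    # Frequencies in vectors.index2word order, 0 for unseen words
    return [freq_dict.get(w, 0) for w in vectors.index2word]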
def main():
    parser = argparse.ArgumentParser(description="Running Simlex test")
    parser.add_argument("--vocab_file_pattern", type=str, default=None,
                        help="vocab path file or file pattern in case of multiple files", required=True)
    parser.add_argument("--vector_file_pattern", type=str, default=None,
                        help="vector path file or file pattern in case of multiple files", required=True)
    parser.add_argument("--output_file", type=str, default=None,
                        help="file to write output to", required=True)
    args = parser.parse_args()

    vocab_files = glob.glob(str(args.vocab_file_pattern))
    vector_files = glob.glob(str(args.vector_file_pattern))

    with open(os.path.join(ROOT_DIR, f'simlex/{args.output_file}'), 'w') as f:
        for voc, vec in zip(vocab_files, vector_files):
            # drop the 4-character prefix from the vocab file name
            file_name = os.path.splitext(os.path.basename(voc))[0][4:]
            vocab = load_vocab(voc)
            vectors = load_vectors(vec)
            simlex_score = eval_simlex(simlex_pairs, vocab, vectors)
            f.write('{}: {}'.format(file_name, simlex_score))
            f.write('\n')
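# eval_simlex is not shown. A sketch of the standard SimLex evaluation it
# presumably performs (an assumption, including the (word, word, rating)
# layout of simlex_pairs): Spearman correlation between human ratings and
# cosine similarities, skipping out-of-vocabulary pairs.
import numpy as np
from scipy.stats import spearmanr

def eval_simlex(simlex_pairs, vocab, vectors):
    gold, pred = [], []
    for w1, w2, score in simlex_pairs:
        if w1 in vocab and w2 in vocab:
            v1, v2 = vectors[vocab[w1]], vectors[vocab[w2]]
            pred.append(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
            gold.append(float(score))
    return spearmanr(gold, pred).correlation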
def build_embedding_layer(word2index, emb_type='glove', embedding_dim=300, max_len=40, trainable=True):
    vocab_size = len(word2index) + 1
    if 'glove' in emb_type:
        word2vec_map = utils.load_vectors(filename='glove.6B.%dd.txt' % embedding_dim)
        emb_layer = pretrained_embedding_layer(word2vec_map, word2index,
                                               embedding_dim, vocab_size, trainable=trainable)
    elif 'emoji' in emb_type:
        emoji2vec_map = utils.load_vectors(filename='emoji_embeddings_%dd.txt' % embedding_dim)
        emb_layer = pretrained_embedding_layer(emoji2vec_map, word2index,
                                               embedding_dim, vocab_size, trainable=trainable)
    elif 'random' in emb_type:
        words = word2index.keys()
        random2vec_map = utils.build_random_word2vec(words, embedding_dim=embedding_dim, variance=1)
        emb_layer = pretrained_embedding_layer(random2vec_map, word2index,
                                               embedding_dim, vocab_size, trainable=trainable)
    else:
        emb_layer = Embedding(vocab_size, embedding_dim, input_length=max_len, trainable=trainable)
        emb_layer.build((None, ))
    return emb_layer
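# pretrained_embedding_layer is referenced but not defined in this snippet.
# A common Keras sketch of it (an assumption, not necessarily the project's
# own implementation): fill an embedding matrix from the vector map, then
# load it into a Keras Embedding layer.
import numpy as np
from keras.layers import Embedding

def pretrained_embedding_layer(word2vec_map, word2index, embedding_dim, vocab_size, trainable=False):
    # Row 0 stays zero for padding; each word's row holds its pretrained vector
    emb_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in word2index.items():
        if word in word2vec_map:
            emb_matrix[index] = word2vec_map[word]
    emb_layer = Embedding(vocab_size, embedding_dim, trainable=trainable)
    emb_layer.build((None,))
    emb_layer.set_weights([emb_matrix])
    return emb_layer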
def main():
    parser = argparse.ArgumentParser(
        description='Calculates semantic similarity for each pair of words in the specified input file '
                    'and saves the result to the output file. Uses the specified word2vec word vectors file.')
    parser.add_argument('vectors', help='word2vec word vectors file.', default='')
    parser.add_argument('input',
                        help='Input file in csv format (with comma as separator). Each line is a pair of words '
                             'for which similarity is calculated.', default='')
    parser.add_argument('-output', help='Output file in csv format (with comma as separator).', default='')
    parser.add_argument('-column',
                        help="Name of the output column. If the column doesn't exist, it will be added; otherwise "
                             "data in the existing column will be overwritten.", default='res')
    parser.add_argument('-morph', help="Enable morphology hack.", action='store_true')
    args = parser.parse_args()

    fvec = args.vectors
    fin = args.input
    fout = fin + '-' + os.path.basename(fvec) if args.output == '' else args.output
    cout = args.column
    morph_hack = args.morph

    print >> stderr, "Loading vectors from {}".format(fvec)
    vectors = load_vectors(fvec)

    # For words with underscore (e.g. берег_v, берег_s, северо_запад, вода_и_медные_трубы)
    prefix2word = {}
    for x in vectors.vocab.iterkeys():
        if '_' not in x:
            continue
        prefix = x.split('_')[0]
        if prefix not in prefix2word:
            prefix2word[prefix] = [x]
        else:
            prefix2word[prefix].append(x)
    # for e in prefix2word.iteritems():
    #     if len(e[1]) > 1:
    #         print >> stderr, ("%s: %s" % (e[0], ':'.join(e[1]))).encode('utf-8')

    print >> stderr, "Calculating similarity for {}; writing result to {}".format(fin, fout)
    with open(fin, 'r') as input_file, open(fout, "w") as output_file, open(fout + '.log', "w") as log_file:
        inp = csv.DictReader(input_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, quotechar="'")
        out_fieldnames = inp.fieldnames if cout in inp.fieldnames else inp.fieldnames + [cout]
        out = csv.DictWriter(output_file, out_fieldnames, delimiter=',',
                             quoting=csv.QUOTE_MINIMAL, quotechar="'")
        out.writeheader()
        for linenum, ex in enumerate(inp):
            if linenum % 1000 == 0:
                print 'Lines processed: {}'.format(linenum)
            ex[cout] = sim(vectors, prefix2word, ex['word1'].decode('utf-8'),
                           ex['word2'].decode('utf-8'), log_file, morph_hack)
            out.writerow(ex)
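# sim() is not defined here. A minimal sketch of what it might do, given the
# prefix2word fallback built above (an assumption; the real function also
# handles the morphology hack and writes to log_file):
def sim(vectors, prefix2word, w1, w2, log_file, morph_hack=False):
    def candidates(w):
        # use the word itself if known, else its tagged variants (берег_v, ...)
        return [w] if w in vectors.vocab else prefix2word.get(w, [])
    c1, c2 = candidates(w1), candidates(w2)
    if not c1 or not c2:
        return 0.0
    # best similarity over all variant pairs of the two words
    return max(vectors.similarity(a, b) for a in c1 for b in c2)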
def main():
    parser = argparse.ArgumentParser(description="Running K-means test")
    parser.add_argument("--vocab_file_pattern", type=str, default=None,
                        help="vocab path file or file pattern in case of multiple files", required=True)
    parser.add_argument("--vector_file_pattern", type=str, default=None,
                        help="vector path file or file pattern in case of multiple files", required=True)
    parser.add_argument("--protocol_type", nargs='?', choices=['RT', 'BRD'],
                        help="Whether to run test for Reichstagsprotokolle (RT) or Bundestagsprotokolle (BRD)",
                        required=True)
    args = parser.parse_args()

    vocab_files = glob.glob(str(args.vocab_file_pattern))
    vector_files = glob.glob(str(args.vector_file_pattern))

    for voc, vec in zip(vocab_files, vector_files):
        file_name = os.path.splitext(os.path.basename(voc))[0]
        vocab = load_vocab(voc)
        vectors = load_vectors(vec)
        with open(os.path.join(ROOT_DIR, f'kmeans/{file_name}.txt'), 'w') as f:
            for test in weat_tests:
                f.write(f'K-means score {test.__name__}: ')
                # call the test once and unpack both target sets
                targets_1, targets_2 = test('sentiment', args.protocol_type)[:2]
                k_means_score = eval_k_means(targets_1, targets_2, vectors, vocab)
                f.write(str(k_means_score))
                f.write('\n')
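# eval_k_means is undefined in this snippet. A hypothetical sketch following
# the usual embedding-bias recipe (cluster the two target sets with k=2 and
# report how well the clusters align with the sets); the project's version
# and the vectors/vocab layout are assumptions:
import numpy as np
from sklearn.cluster import KMeans

def eval_k_means(targets_1, targets_2, vectors, vocab):
    words = [w for w in list(targets_1) + list(targets_2) if w in vocab]
    X = np.array([vectors[vocab[w]] for w in words])
    y_true = np.array([0 if w in targets_1 else 1 for w in words])
    y_pred = KMeans(n_clusters=2, n_init=10).fit_predict(X)
    acc = float((y_pred == y_true).mean())
    return max(acc, 1.0 - acc)  # invariant to cluster label permutation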
def main():
    parser = argparse.ArgumentParser(
        description='Reads words from a vector model. Writes to stdout each word plus similar words '
                    'and their distances to the original word.')
    parser.add_argument('vectors', help='word2vec word vectors file.', default='')
    parser.add_argument('-output', help='Output file in one-pair-per-line format, gzipped.', default='')
    parser.add_argument('-only_letters',
                        help='Skip words containing non-letter symbols from stdin / similar words.',
                        action="store_true")
    parser.add_argument("-vocab_limit",
                        help="Collect neighbours only for the specified number of most frequent words. "
                             "By default all words are used.", default=None, type=int)
    parser.add_argument('-pairs',
                        help="Use pairs format: 2 words and a distance in each line. Otherwise each line is "
                             "a word and all its neighbours with distances.", action="store_true")
    parser.add_argument('-batch-size', help='Batch size for finding neighbours.', default="1000")
    parser.add_argument('-word_freqs',
                        help="Weight similar words by frequency. Pass a frequency file as the parameter.",
                        default=None)
    args = parser.parse_args()

    fvec = args.vectors
    batch_size = int(args.batch_size)

    print >> stderr, "Vectors: {}, only_letters: {}".format(args.vectors, args.only_letters)
    print >> stderr, "Loading vectors from {}".format(fvec)
    tic = time()
    vectors = load_vectors(fvec)
    print >> stderr, "Vectors loaded in %d sec." % (time() - tic)
    print >> stderr, "Vectors shape is: ", vectors.syn0norm.shape

    vocab_size = len(vectors.vocab)
    print("Vocabulary size: %i" % vocab_size)

    # Limit the number of words for which to collect neighbours
    if args.vocab_limit and args.vocab_limit < vocab_size:
        vocab_size = args.vocab_limit
    print("Collect neighbours for %i most frequent words" % vocab_size)

    freq = None
    if args.word_freqs:
        freq_dict = load_freq(args.word_freqs)
        freq = order_freq(vectors, freq_dict)
        print "freqs loaded. Length ", len(freq), freq[:10]

    with gzip.open(args.output, 'wb') if args.output else stdout as out:
        process(out, vectors, only_letters=args.only_letters, vocab_size=vocab_size,
                batch_size=batch_size, pairs=args.pairs, freq=freq)
print(' Loading word synsets and building synset vocabulary...')
word_synsets, synsets = utils.load_word_synsets(
    pos=setup.POS, include_named_entities=setup.INCLUDE_NAMED_ENTITIES)
print(' Loading word synsets and building synset vocabulary... Done! [{} synsets]\n'.format(len(synsets)))

print(' Loading synset weights...')
synset_weights = utils.load_synset_weights(synsets)
print(' Loading synset weights... Done!\n')

print(' Loading NASARI vectors...')
vectors = utils.load_vectors(
    nasari_vectors.value, synsets,
    normalize=setup.NORMALIZE_SCORES,
    min_length=setup.MIN_VECTOR_LENGTH,
    max_length=setup.MAX_VECTOR_LENGTH,
    skip_dims=1,
    component_separator='_',
    int_synset=False)
print(' Loading NASARI vectors... Done! [{} vectors]\n'.format(len(vectors)))

utils.evaluate(
    'NASARI_{}'.format(args.nasari_language),
    word_pairs, word_synsets, synset_weights, vectors,
    setup.SIMILARITY_FUNCTION, setup.SCORING_FUNCTION,
    verbose=args.verbose)
def get_emoji2vec():
    # Load the emoji data - both true and false descriptions
    pos_emojis = read_csv(emoji_positive, sep='\t', engine='python',
                          encoding='utf_8', names=['description', 'emoji'])
    neg_emojis = read_csv(emoji_negative, sep='\t', engine='python',
                          encoding='utf_8', names=['description', 'emoji'])
    print('Number of true emoji descriptions: %d' % len(pos_emojis))
    print('Number of false emoji descriptions: %d' % len(neg_emojis))

    # Set the labels to 1 (for true descriptions) and 0 (for false descriptions)
    pos_emojis['label'] = 1
    neg_emojis['label'] = 0

    # Concatenate and shuffle negative and positive examples of emojis
    all_emojis = concat([pos_emojis, neg_emojis]).sample(frac=1, random_state=144803)

    # Group all emojis in positive examples by descriptions
    emoji_grouping = pos_emojis.groupby('emoji')['description'].apply(lambda x: ', '.join(x))
    grouped_by_description = DataFrame({
        'emoji': emoji_grouping.index,
        'description': emoji_grouping.values
    })

    # Build an emoji vocabulary and map each emoji to an index (beginning from 1)
    emojis = grouped_by_description['emoji'].values
    emoji_to_index = {emoji: index + 1 for emoji, index in zip(emojis, range(len(emojis)))}
    index_to_emoji = {index: emoji for emoji, index in emoji_to_index.items()}
    emoji_vocab_size = len(emoji_to_index)
    print('Total number of unique emojis: %d' % emoji_vocab_size)

    # Build a word vocabulary and map each word to an index (beginning from 1)
    descriptions = all_emojis['description'].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(descriptions.tolist())
    word_sequences = tokenizer.texts_to_sequences(descriptions.tolist())
    word_to_index = tokenizer.word_index
    index_to_word = {index: word for word, index in word_to_index.items()}
    word_vocab_size = len(word_to_index)
    print('Total number of unique words found in emoji descriptions: %d' % word_vocab_size)

    # Load GLoVe word embeddings
    print("Loading GLoVe...")
    word2vec_map = utils.load_vectors(glove_filename)

    # Prepare the word-embedding matrix
    embedding_matrix = utils.get_embedding_matrix(word2vec_map, word_to_index,
                                                  embedding_dim, init_unk=False)
    print('Number of non-existent word-embeddings: %d'
          % np.sum(np.sum(embedding_matrix, axis=1) == 0))

    # Prepare training data
    train_emoji = np.array([emoji_to_index[emoji] for emoji in all_emojis['emoji'].values])
    train_words = pad_sequences(word_sequences, maxlen=maximum_length)
    labels = np.array([[0, 1] if label == 0 else [1, 0] for label in all_emojis['label'].values])
    print('Shape of emoji data:', train_emoji.shape)
    print('Shape of emoji description data:', train_words.shape)
    print('Shape of label tensor:', labels.shape)
    print('Number of emojis:', emoji_vocab_size)

    # Build the emoji DNN model
    print("Building the emoji2vec model...")
    emoji_model, word_model, model = emoji2vec_model(embedding_matrix, emoji_vocab_size, word_vocab_size)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
    print(model.summary())

    # Train a model if one hasn't been trained yet
    if not os.path.exists(emoji2vec_weights):
        print("Training the emoji2vec model...")
        callbacks = [ModelCheckpoint(emoji2vec_weights, monitor='val_categorical_accuracy',
                                     save_best_only=True)]
        history = model.fit([train_emoji, train_words], labels, epochs=50,
                            validation_split=0.1, verbose=1, callbacks=callbacks)

        # Plot accuracy and loss
        utils.plot_training_statistics(
            history, path + "/plots/emoji2vec/emoji2vec_%dd" % embedding_dim,
            also_plot_validation=True, acc_mode='categorical_accuracy', loss_mode='loss')

    # Load the pre-trained weights and get the embeddings
    print("Loading the trained weights of the emoji2vec model...")
    model.load_weights(emoji2vec_weights)
    weights = emoji_model.layers[0].get_weights()[0]

    # Get the emoji2vec mapping
    emoji2vec = {}
    for e, w in zip(grouped_by_description['emoji'], weights[1:]):
        emoji2vec[e] = w

    # Get the emoji embeddings and save them to file
    if not os.path.exists(emoji2vec_embeddings):
        embeddings = DataFrame(weights[1:])
        embeddings = concat([grouped_by_description['emoji'], embeddings], axis=1)
        embeddings.to_csv(emoji2vec_embeddings, sep=' ', header=False, index=False)

    # Get the t-SNE representation
    if not os.path.exists(emoji2vec_visualization):
        tsne = TSNE(n_components=2, perplexity=30, init='pca', n_iter=5000)
        # Following are the exact t-SNE settings used in the emoji visualization in the original paper
        # tsne = TSNE(perplexity=50, n_components=2, init='random', n_iter=300000,
        #             early_exaggeration=1.0, n_iter_without_progress=1000)
        trans = tsne.fit_transform(weights)

        # Save the obtained emoji visualization
        visualization = DataFrame(trans[1:], columns=['x', 'y'])
        visualization['emoji'] = grouped_by_description['emoji'].values
        visualization.to_csv(emoji2vec_visualization)

        # Visualize the embeddings as a t-SNE figure
        visualization.plot('x', 'y', kind='scatter', grid=True)
        plt.savefig(path + '/plots/emoji2vec/tsne_%dd.pdf' % embedding_dim)

    return emoji2vec
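# In these emoji2vec snippets, utils.load_vectors reads a GloVe-style text
# file into a {word: vector} map. A hedged sketch of that variant (an
# assumption about the file format, not the project's actual code):
import numpy as np

def load_vectors(filename):
    word2vec = {}
    with open(filename, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')  # token followed by its components
            word2vec[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return word2vec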
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--saved_model', help='path to saved model',
                        default='results/CELoss_both_tenfold0_s256_cnn3_lstm2_d5_lr1e-3_wd1e-5_f11_p5_sum_b2000_h128_e300_v3000.pt')
    parser.add_argument("-c", "--config", help="path to config file", default='conf/mustc.yaml')
    parser.add_argument('-d', '--datasplit', help='optional path to datasplit yaml file to override path specified in config')
    parser.add_argument('-hid', '--hidden_size', help='hidden size for LSTM -- optional, overrides the one in the config')
    parser.add_argument('-f', '--frame_filter_size', help='width of CNN filters -- optional, overrides the one in the config')
    parser.add_argument('-pad', '--frame_pad_size', help='width of CNN padding -- optional, overrides the one in the config')
    parser.add_argument('-cnn', '--cnn_layers', help='number of CNN layers -- optional, overrides the one in the config')
    parser.add_argument('-l', '--lstm_layers', help='number of LSTM layers -- optional, overrides the one in the config')
    parser.add_argument('-dr', '--dropout', help='dropout -- optional, overrides the one in the config')
    parser.add_argument('-wd', '--weight_decay', help='weight decay -- optional, overrides the one in the config')
    parser.add_argument('-lr', '--learning_rate', help='learning rate -- optional, overrides the one in the config')
    parser.add_argument('-flat', '--flatten_method', help='method for flattening tokens -- optional, overrides the one in the config')
    parser.add_argument('-b', '--bottleneck_feats', help='number of bottleneck feats -- optional, overrides the one in the config')
    parser.add_argument('-e', '--embedding_dim', help='embedding dimension -- optional, overrides the one in the config')
    parser.add_argument('-v', '--vocab_size', help='vocab size -- optional, overrides the one in the config')
    parser.add_argument('-s', '--stopword_baseline', action='store_true', default=False)
    parser.add_argument('-o', '--output_file', help='name of output file')
    parser.add_argument('-rs', '--seed', help='random seed -- optional, overrides the one in the config')
    args = parser.parse_args()

    with open(args.config, 'r') as f:
        cfg = yaml.load(f, yaml.FullLoader)

    if args.stopword_baseline:
        print('WARNING: STOPWORD BASELINE')

    cfg2arg = {'datasplit': args.datasplit,
               'frame_filter_size': args.frame_filter_size,
               'frame_pad_size': args.frame_pad_size,
               'cnn_layers': args.cnn_layers,
               'lstm_layers': args.lstm_layers,
               'dropout': args.dropout,
               'weight_decay': args.weight_decay,
               'learning_rate': args.learning_rate,
               'flatten_method': args.flatten_method,
               'bottleneck_feats': args.bottleneck_feats,
               'hidden_size': args.hidden_size,
               'embedding_dim': args.embedding_dim,
               'vocab_size': args.vocab_size,
               'seed': args.seed}

    int_args = ['frame_filter_size', 'frame_pad_size', 'cnn_layers', 'lstm_layers',
                'bottleneck_feats', 'hidden_size', 'embedding_dim', 'vocab_size', 'seed']
    float_args = ['dropout', 'weight_decay', 'learning_rate']

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if args.datasplit:
        datasplit = args.datasplit
    else:
        datasplit = cfg['datasplit']

    # Command-line overrides take precedence over the config values
    for arg in cfg2arg:
        if cfg2arg[arg]:
            if arg in int_args:
                cfg[arg] = int(cfg2arg[arg])
            elif arg in float_args:
                cfg[arg] = float(cfg2arg[arg])
            else:
                cfg[arg] = cfg2arg[arg]

    seed = cfg['seed']

    with open(cfg['all_data'], 'rb') as f:
        data_dict = pickle.load(f)

    # Load text data:
    with open(cfg['datasplit'].replace('yaml', 'vocab'), 'rb') as f:
        vocab_dict = pickle.load(f)

    print(f'Original vocab size: {len(vocab_dict["w2i"])}')

    set_seeds(seed)

    def truncate_dicts(vocab_dict, vocab_size):
        i2w = {}
        w2i = {}
        for i in range(vocab_size + 2):
            if i in vocab_dict['i2w']:
                w = vocab_dict['i2w'][i]
                i2w[i] = w
                w2i[w] = i
            else:
                if cfg['inputs'] == 'text' or cfg['inputs'] == 'both':
                    print("WARNING: vocab size is not smaller than actual vocab")
        return w2i, i2w

    w2i, i2w = truncate_dicts(vocab_dict, cfg['vocab_size'])

    if cfg['use_pretrained']:
        print('using pretrained')
        if cfg['embedding_dim'] == 100:
            glove_path = cfg['glove_path_100']
        elif cfg['embedding_dim'] == 300:
            glove_path = cfg['glove_path_300']
        i2vec = load_vectors(glove_path, w2i)
        weights_matrix = np.zeros((cfg['vocab_size'] + 2, cfg['embedding_dim']))
        for i in i2w:
            try:
                weights_matrix[i] = i2vec[i]
            except (KeyError, IndexError):
                # back off to a random vector for words without a pretrained embedding
                weights_matrix[i] = np.random.normal(scale=0.6, size=(cfg['embedding_dim'],))
        weights_matrix = torch.tensor(weights_matrix)
    else:
        weights_matrix = None

    overwrite_speech = cfg['overwrite_speech'] if 'overwrite_speech' in cfg else False
    scramble_speech = cfg['scramble_speech'] if 'scramble_speech' in cfg else False
    stopwords_only = cfg['stopwords_only'] if 'stopwords_only' in cfg else False
    binary_vocab = cfg['binary_vocab'] if 'binary_vocab' in cfg else False
    ablate_feat = cfg['ablate_feat'] if 'ablate_feat' in cfg else None

    model = SpeechEncoder(seq_len=cfg['frame_pad_len'],
                          batch_size=cfg['train_params']['batch_size'],
                          lstm_layers=cfg['lstm_layers'],
                          bidirectional=cfg['bidirectional'],
                          num_classes=cfg['num_classes'],
                          dropout=cfg['dropout'],
                          include_lstm=cfg['include_lstm'],
                          tok_level_pred=cfg['tok_level_pred'],
                          feat_dim=cfg['feat_dim'],
                          postlstm_context=cfg['postlstm_context'],
                          device=device,
                          tok_seq_len=cfg['tok_pad_len'],
                          flatten_method=cfg['flatten_method'],
                          frame_filter_size=cfg['frame_filter_size'],
                          frame_pad_size=cfg['frame_pad_size'],
                          cnn_layers=cfg['cnn_layers'],
                          inputs=cfg['inputs'],
                          embedding_dim=cfg['embedding_dim'],
                          vocab_size=cfg['vocab_size'],
                          bottleneck_feats=cfg['bottleneck_feats'],
                          use_pretrained=cfg['use_pretrained'],
                          weights_matrix=weights_matrix)

    model.load_state_dict(torch.load(args.saved_model))
    model.to(device)

    # testset = BurncDataset(cfg, data_dict, w2i, cfg['vocab_size'], mode='test', datasplit=datasplit,
    #                        overwrite_speech=overwrite_speech, stopwords_only=stopwords_only,
    #                        binary_vocab=binary_vocab, ablate_feat=ablate_feat)
    devset = BurncDataset(cfg, data_dict, w2i, cfg['vocab_size'], mode='dev', datasplit=datasplit,
                          overwrite_speech=overwrite_speech, stopwords_only=stopwords_only,
                          binary_vocab=binary_vocab, ablate_feat=ablate_feat)

    ### Stopword baseline here:
    with open(cfg['datasplit'].replace('yaml', 'stop'), 'rb') as f:
        stopword_list = pickle.load(f)

    acc = evaluate(cfg, devset, cfg['eval_params'], model, device,
                   tok_level_pred=cfg['tok_level_pred'], noisy=True, print_predictions=True,
                   vocab_dict=vocab_dict, stopword_baseline=args.stopword_baseline,
                   stopword_list=stopword_list, bootstrap_resample=True)

    datasplit_name = os.path.split(datasplit)[-1].split('.')[0]
    out_path = os.path.join(os.path.dirname(args.saved_model), f'{args.output_file}_{datasplit_name}.tsv')
    with open(out_path, 'w') as f:
        f.write(f'\tepochs\ttrain_losses\ttrain_accs\tdev_accs\n')
        f.write(f'0\t0\t0\t0\t{acc[0]}')
    print(f'wrote results to {out_path}')
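# In this script load_vectors takes the vocabulary and returns an
# index-to-vector map. A hypothetical reconstruction from the call site
# (weights_matrix[i] = i2vec[i], KeyError for missing words); the actual
# helper may differ:
import numpy as np

def load_vectors(glove_path, w2i):
    i2vec = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if parts[0] in w2i:
                i2vec[w2i[parts[0]]] = np.asarray(parts[1:], dtype='float32')
    return i2vec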
import numpy as np
import pandas as pd
from utils import load_vectors, rnd_labeled_data_generator, test_data_generator, conseq_labeled_data_generator

# Load the vectors

# In[3]:

vectors = {
    "auto": "../data/auto_vectors.csv",
    "mus": "../data/mus_vectors.csv"
}

# In[4]:

print("Loading vectors")
manually = True
auto_df = load_vectors(vectors["auto"], manually=manually, verbose=False)
if not manually:
    # parse the serialized vectors back into arrays
    auto_df["vectors"] = auto_df["vectors"].apply(lambda a: np.array(eval(a)))

# In[8]:

auto_df.groupby("overall").count()

# Classification with 5 categories, not binary

# In[8]:

auto_df["target"] = pd.get_dummies(auto_df["overall"]).values.tolist()
auto_df["target"] = auto_df["target"].apply(np.array)

# In[9]:
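# A quick illustration of the one-hot target construction above, using
# hypothetical ratings:
#
#   pd.get_dummies(pd.Series([1, 5, 3])).astype(int).values.tolist()
#   # -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]  (one column per distinct rating)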
def analogy_task():
    try:
        emoji2vec_str = utils.load_vectors(filename=emoji2vec_embeddings)

        # Convert to unicode all emoji entries in the dictionary of emoji embeddings
        emoji2vec = {}
        for k, v in emoji2vec_str.items():
            unicode_emoji = utils.convert_emoji_to_unicode(k)
            emoji2vec[unicode_emoji] = v

        # Get some intuition about whether the model is good by seeing
        # what analogies it can make based on what it learnt
        utils.make_analogy("👑", "🚹", "🚺", emoji2vec)  # Crown - Man + Woman
        utils.make_analogy("👑", "👦", "👧", emoji2vec)  # Crown - Boy + Girl
        utils.make_analogy("💵", "🇺🇸", "🇬🇧", emoji2vec)  # Dollar - USA + UK
        utils.make_analogy("💵", "🇺🇸", "🇪🇺", emoji2vec)  # Dollar - USA + EU
        utils.make_analogy("👪", "👦", "👧", emoji2vec)  # Family - Boy + Girl
        utils.make_analogy("🕶", "☀️", "⛈", emoji2vec)  # Sunglasses - Sun + Cloud
        utils.make_analogy("☂", "⛈️", "☀", emoji2vec)  # Umbrella - Clouds + Sun
        utils.make_analogy("🍣", "🏯️", "🏰", emoji2vec)  # Sushi - Japanese Castle + European Castle
        utils.make_analogy("👹", "🏯️", "🏰", emoji2vec)  # Japanese Ogre - Japanese Castle + European Castle
        utils.make_analogy("🍣", "🗼️", "🗽", emoji2vec)  # Sushi - Japanese Tower + Statue of Liberty
        utils.make_analogy("🍣", "🗾️", "🗽", emoji2vec)  # Sushi - Map of Japan + Statue of Liberty
        utils.make_analogy("🍣", "🏯️", "🗽", emoji2vec)  # Sushi - Japanese Castle + Statue of Liberty
        utils.make_analogy("🐅", "🐈️", "🐕", emoji2vec)  # Tiger - Cat + Dog
        utils.make_analogy("🐆", "🐈️", "🐕", emoji2vec)  # Leopard - Cat + Dog
        utils.make_analogy("🐭", "🐈️", "🐕", emoji2vec)  # Mouse - Cat + Dog
        utils.make_analogy("🌅", "🌞️", "🌙", emoji2vec)  # Sunrise - Sun + Moon
        utils.make_analogy("🌅", "🌞️", "🌑", emoji2vec)  # Sunrise - Sun + New Moon
        utils.make_analogy("🌃", "🌙️", "🌞", emoji2vec)  # Night with stars - Moon + Sun with Face
        utils.make_analogy("🌃", "🌑️", "☀", emoji2vec)  # Night with stars - New Moon + Sun
        utils.make_analogy("🌃", "🌙️️", "☀", emoji2vec)  # Night with stars - Moon + Sun
        utils.make_analogy("😴", "💤️", "🏃", emoji2vec)  # Sleeping face - sleeping symbol + running
        utils.make_analogy("😴", "🛌️", "🏃", emoji2vec)  # Sleeping face - sleeping accommodation + running
        utils.make_analogy("😴", "🛏", "🏃", emoji2vec)  # Sleeping face - bed + active symbol (running)
        utils.make_analogy("🏦", "💰", "🏫", emoji2vec)  # Bank - Money + School
        utils.make_analogy("🏦", "💰", "🏥", emoji2vec)  # Bank - Money + Hospital
        utils.make_analogy("💉", "🏥", "🏦", emoji2vec)  # Syringe - Hospital + Bank
        utils.make_analogy("💊", "🏥", "🏦", emoji2vec)  # Pill - Hospital + Bank
        utils.make_analogy("💒", "💍", "👰", emoji2vec)  # Wedding - Ring + Bride
        utils.make_analogy("💒", "💑", "💔", emoji2vec)  # Wedding - Couple + Broken Heart
        utils.make_analogy("💒", "❤", "💔", emoji2vec)  # Wedding - Heart + Broken Heart
        utils.make_analogy("😀", "💰", "🤑", emoji2vec)  # Grinning face - Money + Money Face
        utils.make_analogy("😠", "💰", "🤑", emoji2vec)  # Angry face - Money + Money Face
    except IOError:
        print("Emoji embeddings not found at the provided embeddings file %s. "
              "You have to train them before proceeding to make analogies." % emoji2vec_embeddings)
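# make_analogy lives in utils and is not shown. A plausible sketch
# (an assumption): nearest neighbour to a - b + c by cosine similarity
# over the emoji embedding dictionary.
import numpy as np

def make_analogy(a, b, c, emb):
    target = emb[a] - emb[b] + emb[c]
    target = target / np.linalg.norm(target)
    best, best_sim = None, -1.0
    for e, v in emb.items():
        if e in (a, b, c):
            continue  # exclude the query emojis themselves
        s = float(np.dot(target, v) / np.linalg.norm(v))
        if s > best_sim:
            best, best_sim = e, s
    print('%s - %s + %s = %s (cos %.3f)' % (a, b, c, best, best_sim))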
def train_emoji2vec():
    # Load the true emoji data
    pos_emojis = read_csv(emoji_positive, sep="\t", engine="python",
                          encoding="utf_8", names=["description", "emoji"])
    pos_emojis["label"] = 0

    # Load the false emoji data (negative examples)
    neg_emojis = read_csv(emoji_negative, sep="\t", engine="python",
                          encoding="utf_8", names=["description", "emoji"])
    neg_emojis["label"] = 1

    print("There are %d true emoji descriptions and %d false emoji descriptions."
          % (len(pos_emojis), len(neg_emojis)))

    # Group all the positive emoji examples by their description
    emoji_grouping = pos_emojis.groupby("emoji")["description"].apply(lambda x: ", ".join(x))
    grouped_by_description = DataFrame({
        "emoji": emoji_grouping.index,
        "description": emoji_grouping.values
    })

    # Build an emoji vocabulary and map each emoji to an index (beginning from 1)
    emojis = grouped_by_description["emoji"].values
    emoji_to_index = {emoji: index + 1 for emoji, index in zip(emojis, range(len(emojis)))}
    emoji_vocab_size = len(emoji_to_index) + 1
    print("There are %d unique emojis." % (emoji_vocab_size - 1))

    # Concatenate and shuffle negative and positive examples of emojis
    all_emojis = concat([pos_emojis, neg_emojis]).sample(frac=1, random_state=150493)

    # Build a word vocabulary and map each word to an index (beginning from 1)
    descriptions = all_emojis["description"].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(descriptions.tolist())
    word_sequences = tokenizer.texts_to_sequences(descriptions.tolist())
    word_to_index = tokenizer.word_index
    word_vocab_size = len(word_to_index) + 1
    print("There are %d unique words in the descriptions." % (word_vocab_size - 1))

    # Load GLoVe word embeddings
    word_emb = utils.load_vectors(glove_filename)

    # Prepare the word-embedding matrix
    embedding_matrix = utils.get_embedding_matrix(word_emb, word_to_index,
                                                  embedding_dim, init_unk=False)

    # Prepare training data
    train_emoji = np.array([emoji_to_index[e] for e in all_emojis["emoji"].values])
    print("The emoji tensor shape is ", train_emoji.shape)

    if use_lstm:
        train_words = pad_sequences(word_sequences, maxlen=seq_length,
                                    padding='post', truncating='post', value=0.)
    else:
        train_words = sum_emb(word_sequences, embedding_matrix)
    print("The descriptions tensor shape is ", train_words.shape)

    labels = to_categorical(np.asarray([label for label in all_emojis["label"].values]))
    print("The label tensor shape is ", labels.shape)

    # Build the emoji DNN model
    if use_lstm:
        model = emoji2vec_lstm_model(embedding_matrix, emoji_vocab_size, word_vocab_size, seq_length)
    else:
        model = emoji2vec_model(emoji_vocab_size)

    my_optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.99, decay=0.01)
    model.compile(loss="categorical_crossentropy", optimizer=my_optimizer,
                  metrics=["categorical_accuracy", utils.f1_score])
    print(model.summary())
    plot_model(model, to_file=path + '/plots/emoji2vec_' + str(embedding_dim) + 'd_model_summary.png',
               show_shapes=False, show_layer_names=True)

    # Prepare the callbacks and fit the model
    save_best = ModelCheckpoint(monitor='val_categorical_accuracy', save_best_only=True,
                                filepath=emoji2vec_weights)
    reduceLR = ReduceLROnPlateau(monitor='val_categorical_accuracy', factor=0.1,
                                 patience=3, verbose=1)
    early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=15, verbose=1)
    history = model.fit([train_emoji, train_words], labels, epochs=epochs, batch_size=batch_size,
                        validation_split=0.1, verbose=1,
                        callbacks=[save_best, reduceLR, early_stopping])

    # Plot accuracy and loss
    utils.plot_training_statistics(history, "/plots/emoji2vec_%dd" % embedding_dim,
                                   plot_validation=True, acc_mode="categorical_accuracy",
                                   loss_mode="loss")

    # Get the weights of the trained emoji model
    weights = [layer.get_weights()[0] for layer in model.layers if layer.name == 'emoji_emb']
    weights = weights[0]

    # Get the emoji embeddings and save them to file
    embeddings = DataFrame(weights[1:])
    embeddings = concat([grouped_by_description["emoji"], embeddings], axis=1)
    embeddings.to_csv(emoji2vec_embeddings, sep=" ", header=False, index=False)

    # Get the t-SNE representation
    tsne = TSNE(n_components=2, perplexity=30, init="pca", n_iter=5000)
    trans = tsne.fit_transform(weights)

    # Save the obtained emoji visualization
    visualization = DataFrame(trans[1:], columns=["x", "y"])
    visualization["emoji"] = grouped_by_description["emoji"].values
    visualization.to_csv(emoji2vec_visualization)

    # Visualize the embeddings as a t-SNE figure
    visualization.plot("x", "y", kind="scatter", grid=True)
    plt.savefig(path + "/plots/tsne_%dd.png" % embedding_dim)
def main():
    parser = argparse.ArgumentParser(description="Running BAT or ECT")
    parser.add_argument("--test_type", nargs='?', choices=['ECT', 'BAT'],
                        help="Specify BAT or ECT depending on which test shall be run", required=True)
    parser.add_argument("--protocol_type", nargs='?', choices=['RT', 'BRD'],
                        help="Whether to run test for Reichstagsprotokolle (RT) or Bundestagsprotokolle (BRD)",
                        required=True)
    parser.add_argument("--output_file", type=str, default=None,
                        help="File to store the results", required=True)
    parser.add_argument("--vocab_file", type=str, default=None, help="path to vocab file", required=True)
    parser.add_argument("--vector_file", type=str, default=None, help="path to vector file", required=True)
    args = parser.parse_args()

    if args.test_type not in ['ECT', 'BAT']:
        parser.print_help()
        sys.exit(2)

    vocab = load_vocab(str(vocab_path / args.vocab_file))
    vectors = load_vectors(str(models_path / args.vector_file))

    results = {}
    for test in weat_tests:
        results[test.__name__] = {}
        for dim in DIMENSIONS:
            weat_terms = test(dim, args.protocol_type)
            if args.test_type == 'BAT':
                result = run_bat(vectors, vocab, weat_terms)
            elif args.test_type == 'ECT':
                result = run_ect(vectors, vocab, weat_terms)
            logging.info(f'{test.__name__} - {dim}: {result}')
            results[test.__name__][dim] = result

    if args.test_type == 'BAT':
        res_df = pd.DataFrame(results).T.round(3)
    elif args.test_type == 'ECT':
        res_df = pd.DataFrame(index=pd.MultiIndex.from_product([DIMENSIONS, ['corr', 'p']]),
                              columns=results.keys()).T
        for k1, v1 in results.items():
            for k2, v2 in v1.items():
                res_df.loc[k1, (k2, 'corr')] = results[k1][k2].correlation
                res_df.loc[k1, (k2, 'p')] = results[k1][k2].pvalue

    res_df.to_csv(f'{str(ROOT_DIR)}/{args.output_file}.csv', index=True, header=True)
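# run_ect is not shown. Consistent with the .correlation/.pvalue access above,
# a sketch of the Embedding Coherence Test (an assumption, including the
# (targets, attributes_a, attributes_b) layout of weat_terms):
import numpy as np
from scipy.stats import spearmanr

def run_ect(vectors, vocab, weat_terms):
    targets, attrs_a, attrs_b = weat_terms
    mean_a = np.mean([vectors[vocab[w]] for w in attrs_a if w in vocab], axis=0)
    mean_b = np.mean([vectors[vocab[w]] for w in attrs_b if w in vocab], axis=0)
    cos = lambda u, v: np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
    sims_a = [cos(vectors[vocab[w]], mean_a) for w in targets if w in vocab]
    sims_b = [cos(vectors[vocab[w]], mean_b) for w in targets if w in vocab]
    # high rank correlation = targets relate to both attribute sets coherently
    return spearmanr(sims_a, sims_b)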
import extract_baseline_features
import extract_ml_features2 as extract_features
import utils, classifiers
import data_processing as data_proc

# Settings for the upcoming ML model
pragmatic = True
lexical = True
pos_grams = True
sentiment = True
topic = True
similarity = True
pos_ngram_list = [1]
ngram_list = [1]
embedding_dim = 100
word2vec_map = utils.load_vectors(filename='glove.6B.%dd.txt' % embedding_dim)

# Set the values for the portion of data
n_train = 3000
n_test = 500


def baseline(tweets_train, train_labels, tweets_test, test_labels):
    # Import the subjectivity lexicon
    subj_dict = data_proc.get_subj_lexicon()

    types_of_features = ['1', '2', '3', 'ngrams']
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using feature type " + t)
        if t == '1':
## Script that trains an instance of a CATS (or TLT) model

import tensorflow as tf
import model
import serializer
import config
import utils
import numpy as np
import get_data
import pickle
import os

dirname = os.path.dirname(os.path.realpath(__file__))
print("Dirname: " + dirname)

print("Loading word embeddings...")
embs = utils.load_vectors(os.path.join(dirname, config.vecs_path_en))
vocab = utils.load_vocab(os.path.join(dirname, config.vocab_path_en))
print("Loaded.")

print("Defining estimator...")
rconf = tf.estimator.RunConfig(save_checkpoints_steps=config.SAVE_CHECKPOINT_STEPS,
                               save_checkpoints_secs=None,
                               model_dir=os.path.join(dirname, config.MODEL_HOME))
print("Defined.")

params = {"padding_value": vocab["<PAD>"],
          "wembs": embs,
          "vocab": vocab,
          "coherence_hinge_margin": 1,
          "learning_rate": 0.0001}

estimator = tf.estimator.Estimator(model_fn=model.model_fn, config=rconf, params=params)

print("Training the model...")
res = estimator.train(input_fn=lambda: get_data.get_data(os.path.join(dirname, config.tfrec_train),
                                                          is_train=True, epochs=config.EPOCHS))
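# utils.load_vocab is not included in the listing. A minimal sketch under the
# assumption of a one-token-per-line vocabulary file mapped to line indices:
def load_vocab(path):
    vocab = {}
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            vocab[line.rstrip('\n')] = i
    return vocab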
print(' Loading word pairs... Done!\n')

print(' Loading word synsets and building synset vocabulary...')
word_synsets, synsets = utils.load_word_synsets(
    pos=setup.POS, include_named_entities=setup.INCLUDE_NAMED_ENTITIES)
print(' Loading word synsets and building synset vocabulary... Done! [{} synsets]\n'.format(len(synsets)))

print(' Loading synset weights...')
synset_weights = utils.load_synset_weights(synsets)
print(' Loading synset weights... Done!\n')

print(' Loading Conception vectors...')
vectors = utils.load_vectors(
    conception_vectors.value, synsets,
    normalize=setup.NORMALIZE_SCORES,
    min_length=setup.MIN_VECTOR_LENGTH,
    max_length=setup.MAX_VECTOR_LENGTH)
print(' Loading Conception vectors... Done! [{} vectors]\n'.format(len(vectors)))

utils.evaluate(
    'Conception_{}'.format(args.conception_type),
    word_pairs, word_synsets, synset_weights, vectors,
    setup.SIMILARITY_FUNCTION, setup.SCORING_FUNCTION,
    verbose=args.verbose)
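# utils.evaluate is not shown. The per-pair scoring step it presumably
# performs (an assumption): score a word pair by the best similarity over all
# synset pairs of the two words, i.e. the closest-senses strategy.
def pair_score(w1, w2, word_synsets, vectors, similarity):
    best = 0.0
    for s1 in word_synsets.get(w1, []):
        for s2 in word_synsets.get(w2, []):
            if s1 in vectors and s2 in vectors:
                best = max(best, similarity(vectors[s1], vectors[s2]))
    return best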