def init(wvec, cvec, output="", only_letters=False, vocab_limit=None, pairs=False, batch_size=1000):

    print >> stderr, "Vectors: {}, only_letters: {}".format(wvec, only_letters)
    print >> stderr, "Loading vectors from {}".format(wvec)
    tic = time()
    wvectors = load_vectors(wvec, binary=False)
    print >> stderr, "Vectors loaded in %d sec." % (time()-tic)
    print >> stderr, "Vectors shape is: ", wvectors.syn0norm.shape
    
    print >> stderr, "Loading vectors from {}".format(cvec)
    tic = time()
    cvectors = load_vectors(cvec, binary=False)
    print >> stderr, "Vectors loaded in %d sec." % (time()-tic)
    print >> stderr, "Vectors shape is: ", cvectors.syn0norm.shape
    

    vocab_size = len(wvectors.vocab)
    print("Vocabulary size: %i" % vocab_size)
    
    # Limit the number of words for which to collect contexts
    if vocab_limit and vocab_limit < vocab_size:
        vocab_size = vocab_limit
    words = wvectors.index2word[:vocab_size]
    
    print("Collect activated contexts for %i most frequent words" % vocab_size)
    

    with codecs.open(output, 'wb') if output else stdout as out:
        process(out, wvectors, cvectors, words, only_letters=only_letters, batch_size=batch_size, pairs=pairs)
def init(fvec, output="", only_letters=False, vocab_limit=None, pairs=False, batch_size=1000, word_freqs=None):

    print >> stderr, "Vectors: {}, only_letters: {}".format(fvec, only_letters)
    print >> stderr, "Loading vectors from {}".format(fvec)
    tic = time()
    vectors = load_vectors(fvec, binary=True)
    print >> stderr, "Vectors loaded in %d sec." % (time()-tic)
    print >> stderr, "Vectors shape is: ", vectors.syn0norm.shape

    vocab_size = len(vectors.vocab)
    print("Vocabulary size: %i" % vocab_size)
    
    # Limit the number of words for which to collect neighbours
    if vocab_limit and vocab_limit < vocab_size:
        vocab_size = vocab_limit
    words = vectors.index2word[:vocab_size]
    
    print("Collect neighbours for %i most frequent words" % vocab_size)
    
    freq=None
    if word_freqs:
        freq_dict = load_freq(word_freqs)
        freq = order_freq(vectors, freq_dict)
        print "freqs loaded. Length ", len(freq), freq[:10]

    with codecs.open(output, 'wb') if output else stdout as out:
        process(out, vectors, words, only_letters=only_letters, batch_size=batch_size, pairs=pairs, freq=freq)
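Both init variants above assume that load_vectors returns a gensim-style KeyedVectors object exposing vocab, index2word and syn0norm. A minimal sketch of such a loader, assuming the older gensim (<= 3.8) API; the projects' actual helper may differ:

# Sketch only (assumption): a gensim 3.x-backed load_vectors.
from gensim.models import KeyedVectors

def load_vectors(path, binary=False):
    # Read a word2vec-format file and precompute the L2-normalised matrix
    # so that attributes such as syn0norm are populated.
    vectors = KeyedVectors.load_word2vec_format(path, binary=binary)
    vectors.init_sims()
    return vectors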
Example #3
def main():
    parser = argparse.ArgumentParser(description="Running Simlex test")
    parser.add_argument(
        "--vocab_file_pattern",
        type=str,
        default=None,
        help="vocab path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument(
        "--vector_file_pattern",
        type=str,
        default=None,
        help="vector path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument("--output_file",
                        type=str,
                        default=None,
                        help="file to write output to",
                        required=True)

    args = parser.parse_args()
    vocab_files = glob.glob(str(args.vocab_file_pattern))
    vector_files = glob.glob(str(args.vector_file_pattern))

    with open(os.path.join(ROOT_DIR, f'simlex/{args.output_file}'), 'w') as f:
        for voc, vec in zip(vocab_files, vector_files):
            file_name = os.path.splitext(os.path.basename(voc))[0][4:]
            vocab = load_vocab(voc)
            vectors = load_vectors(vec)
            simlex_score = eval_simlex(simlex_pairs, vocab, vectors)
            f.write('{}: {}'.format(file_name, simlex_score))
            f.write('\n')
        f.close()
Example #4
def build_embedding_layer(word2index,
                          emb_type='glove',
                          embedding_dim=300,
                          max_len=40,
                          trainable=True):
    vocab_size = len(word2index) + 1
    if 'glove' in emb_type:
        word2vec_map = utils.load_vectors(filename='glove.6B.%dd.txt' %
                                          embedding_dim)
        emb_layer = pretrained_embedding_layer(word2vec_map,
                                               word2index,
                                               embedding_dim,
                                               vocab_size,
                                               trainable=trainable)
    elif 'emoji' in emb_type:
        emoji2vec_map = utils.load_vectors(
            filename='emoji_embeddings_%dd.txt' % embedding_dim)
        emb_layer = pretrained_embedding_layer(emoji2vec_map,
                                               word2index,
                                               embedding_dim,
                                               vocab_size,
                                               trainable=trainable)
    elif 'random' in emb_type:
        words = word2index.keys()
        random2vec_map = utils.build_random_word2vec(
            words, embedding_dim=embedding_dim, variance=1)
        emb_layer = pretrained_embedding_layer(random2vec_map,
                                               word2index,
                                               embedding_dim,
                                               vocab_size,
                                               trainable=trainable)
    else:
        emb_layer = Embedding(vocab_size,
                              embedding_dim,
                              input_length=max_len,
                              trainable=trainable)
        emb_layer.build((None, ))
    return emb_layer
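build_embedding_layer above (like the emoji2vec examples further down) treats utils.load_vectors as returning a plain {word: vector} mapping read from a GloVe-style text file. A hedged sketch of such a helper, under that assumption:

# Sketch only (assumption): parse a GloVe-style text file, one word followed by
# its float components per line, into a {word: numpy array} map.
import numpy as np

def load_vectors(filename):
    word2vec_map = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word2vec_map[parts[0]] = np.asarray(parts[1:], dtype="float32")
    return word2vec_map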
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Calculates semantic similarity for each pair of words in the specified input file and saves result '
                    'to the output file. Uses specified word2vec word vectors file.')
    parser.add_argument('vectors', help='word2vec word vectors file.', default='')
    parser.add_argument('input', help='Input file in csv format (with comma as separator). Each line is a pair of words, '
                                      'for which similarity is calculated.', default='')
    parser.add_argument('-output', help='Output file in csv format (with comma as separator).', default='')
    parser.add_argument('-column', help="Name of the output column. If column doesn't exist, it will be added, otherwise "
                                        "data in existing column will be modified.", default='res')
    parser.add_argument('-morph', help="Enable morphology hack.", action='store_true')
    args = parser.parse_args()

    fvec = args.vectors
    fin = args.input
    fout = fin + '-' + os.path.basename(fvec) if args.output == '' else args.output
    cout = args.column
    morph_hack = args.morph

    print >> stderr, "Loading vectors from {}".format(fvec)
    vectors = load_vectors(fvec)

    # for words with underscore (ex: берег_v, берег_s, северо_запад, вода_и_медные_трубы)
    prefix2word = {}
    for x in vectors.vocab.iterkeys():
        if '_' not in x:
            continue
        prefix = x.split('_')[0]
        if prefix not in prefix2word:
            prefix2word[prefix] = [x]
        else:
            prefix2word[prefix].append(x)

    # for e in prefix2word.iteritems():
    #     if len(e[1]) > 1:
    #         print >> stderr, ("%s: %s" % (e[0], ':'.join(e[1]))).encode('utf-8')

    print >> stderr, "Calculating similarity for {}; writing result to {}".format(fin, fout)
    with open(fin, 'r') as input_file, open(fout, "w") as output_file, open(fout+'.log', "w") as log_file:
        inp = csv.DictReader(input_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, quotechar="'")
        # inp_fieldnames = inp.fieldnames
        out_fieldnames = inp.fieldnames if cout in inp.fieldnames else inp.fieldnames + [cout]
        out = csv.DictWriter(output_file, out_fieldnames, delimiter=',', quoting=csv.QUOTE_MINIMAL, quotechar="'")
        out.writeheader()
        for linenum, ex in enumerate(inp):
            if linenum % 1000 == 0:
                print 'Lines processed: {}'.format(linenum)

            ex[cout] = sim(vectors, prefix2word, ex['word1'].decode('utf-8'), ex['word2'].decode('utf-8'), log_file, morph_hack)
            out.writerow(ex)
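The similarity script above reads its input with csv.DictReader (comma separator, a single quote as quotechar) and expects word1 and word2 columns. A toy, hypothetical input file written with Python 3's csv module, only to illustrate the expected layout:

# Hypothetical example input for the similarity script; column names come from
# the code above, the words themselves are arbitrary.
import csv

with open("pairs.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, ["word1", "word2"], delimiter=",", quotechar="'")
    writer.writeheader()
    writer.writerow({"word1": "берег", "word2": "река"})
    writer.writerow({"word1": "кошка", "word2": "собака"})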
def main():
    parser = argparse.ArgumentParser(description="Running K-means test")
    parser.add_argument(
        "--vocab_file_pattern",
        type=str,
        default=None,
        help="vocab path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument(
        "--vector_file_pattern",
        type=str,
        default=None,
        help="vector path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument(
        "--protocol_type",
        nargs='?',
        choices=['RT', 'BRD'],
        help=
        "Whether to run test for Reichstagsprotokolle (RT) or Bundestagsprotokolle (BRD)",
        required=True)
    args = parser.parse_args()

    vocab_files = glob.glob(str(args.vocab_file_pattern))
    vector_files = glob.glob(str(args.vector_file_pattern))
    for voc, vec in zip(vocab_files, vector_files):
        file_name = os.path.splitext(os.path.basename(voc))[0]
        vocab = load_vocab(voc)
        vectors = load_vectors(vec)
        with open(os.path.join(ROOT_DIR, f'kmeans/{file_name}.txt'), 'w') as f:
            for test in weat_tests:
                f.write(f'K-means score {test.__name__}: ')
                targets_1 = test(
                    'sentiment',
                    args.protocol_type,
                )[0]
                targets_2 = test('sentiment', args.protocol_type)[1]
                k_means_score = eval_k_means(targets_1, targets_2, vectors,
                                             vocab)
                f.write(str(k_means_score))
                f.write('\n')
        f.close()
def main():
    parser = argparse.ArgumentParser(
        description='Reads words from vector model. Writes to stdout word + similar words and their distances to the original word.')
    parser.add_argument('vectors', help='word2vec word vectors file.', default='')
    parser.add_argument('-output', help='Output file in one-pair-per-line format, gzipped', default='')
    parser.add_argument('-only_letters', help='Skip words containing non-letter symbols from stdin / similar words.', action="store_true")
    parser.add_argument("-vocab_limit", help="Collect neighbours only for specified number of most frequent words. By default use all words.", default=None, type=int)
    parser.add_argument('-pairs', help="Use pairs format: 2 words and distance in each line. Otherwise each line is a word and all its neighbours with distances.", action="store_true")
    parser.add_argument('-batch-size', help='Batch size for finding neighbours.', default="1000")
    parser.add_argument('-word_freqs', help="Weight similar words by frequency. Pass frequency file as parameter", default=None)

    args = parser.parse_args()
     
    fvec = args.vectors
    batch_size = int(args.batch_size)

    print >> stderr, "Vectors: {}, only_letters: {}".format(args.vectors, args.only_letters)
    print >> stderr, "Loading vectors from {}".format(fvec)
    tic = time()
    vectors = load_vectors(fvec)
    print >> stderr, "Vectors loaded in %d sec." % (time()-tic)
    print >> stderr, "Vectors shape is: ", vectors.syn0norm.shape

    vocab_size = len(vectors.vocab)
    print("Vocabulary size: %i" % vocab_size)
    
    # Limit the number of words for which to collect neighbours
    if args.vocab_limit and args.vocab_limit < vocab_size:
        vocab_size = args.vocab_limit
    
    print("Collect neighbours for %i most frequent words" % vocab_size)
    
    freq=None
    if args.word_freqs:
        freq_dict = load_freq(args.word_freqs)
        freq = order_freq(vectors, freq_dict)
        print "freqs loaded. Length ", len(freq), freq[:10]

    with gzip.open(args.output, 'wb') if args.output else stdout as out:
        process(out, vectors, only_letters=args.only_letters, vocab_size=vocab_size, batch_size=batch_size, pairs=args.pairs, freq=freq)
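The -word_freqs branch above relies on two helpers, load_freq and order_freq, that are not shown in this example. A hypothetical sketch consistent with how freq is used afterwards (len(), slicing, one value per vocabulary entry); the real helpers and file format may differ:

# Hypothetical helpers (assumed word<TAB>count file format).
def load_freq(path):
    freq_dict = {}
    with open(path) as f:
        for line in f:
            word, count = line.rstrip("\n").split("\t")
            freq_dict[word] = float(count)
    return freq_dict

def order_freq(vectors, freq_dict):
    # Align counts with the model's index order (most frequent words first).
    return [freq_dict.get(word, 0.0) for word in vectors.index2word]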
Example #8
def init(fvec,
         output="",
         only_letters=False,
         vocab_limit=None,
         pairs=False,
         batch_size=1000,
         word_freqs=None):

    print >> stderr, "Vectors: {}, only_letters: {}".format(fvec, only_letters)
    print >> stderr, "Loading vectors from {}".format(fvec)
    tic = time()
    vectors = load_vectors(fvec, binary=True)
    print >> stderr, "Vectors loaded in %d sec." % (time() - tic)
    print >> stderr, "Vectors shape is: ", vectors.syn0norm.shape

    vocab_size = len(vectors.vocab)
    print("Vocabulary size: %i" % vocab_size)

    # Limit the number of words for which to collect neighbours
    if vocab_limit and vocab_limit < vocab_size:
        vocab_size = vocab_limit
    words = vectors.index2word[:vocab_size]

    print("Collect neighbours for %i most frequent words" % vocab_size)

    freq = None
    if word_freqs:
        freq_dict = load_freq(word_freqs)
        freq = order_freq(vectors, freq_dict)
        print "freqs loaded. Length ", len(freq), freq[:10]

    with codecs.open(output, 'wb') if output else stdout as out:
        process(out,
                vectors,
                words,
                only_letters=only_letters,
                batch_size=batch_size,
                pairs=pairs,
                freq=freq)
Example #9
    print('  Loading word synsets and building synset vocabulary...')
    word_synsets, synsets = utils.load_word_synsets(
        pos=setup.POS,
        include_named_entities=setup.INCLUDE_NAMED_ENTITIES)
    print('  Loading word synsets and building synset vocabulary... Done! [{} synsets]\n'.format(len(synsets)))

    print('  Loading synset weights...')
    synset_weights = utils.load_synset_weights(synsets)
    print('  Loading synset weights... Done!\n')

    print('  Loading NASARI vectors...')
    vectors = utils.load_vectors(
        nasari_vectors.value,
        synsets,
        normalize=setup.NORMALIZE_SCORES,
        min_length=setup.MIN_VECTOR_LENGTH,
        max_length=setup.MAX_VECTOR_LENGTH,
        skip_dims=1,
        component_separator='_',
        int_synset=False)
    print('  Loading NASARI vectors... Done! [{} vectors]\n'.format(len(vectors)))

    utils.evaluate(
        'NASARI_{}'.format(args.nasari_language),
        word_pairs,
        word_synsets,
        synset_weights,
        vectors,
        setup.SIMILARITY_FUNCTION,
        setup.SCORING_FUNCTION,
        verbose=args.verbose)
Example #10
def get_emoji2vec():
    # Load the emoji data - both true and false descriptions
    pos_emojis = read_csv(emoji_positive,
                          sep='\t',
                          engine='python',
                          encoding='utf_8',
                          names=['description', 'emoji'])
    neg_emojis = read_csv(emoji_negative,
                          sep='\t',
                          engine='python',
                          encoding='utf_8',
                          names=['description', 'emoji'])

    print('Number of true emoji descriptions: %d' % len(pos_emojis))
    print('Number of false emoji descriptions: %d' % len(neg_emojis))

    # Set the labels to 1 (for true descriptions) and 0 (for false descriptions)
    pos_emojis['label'] = 1
    neg_emojis['label'] = 0

    # Concatenate and shuffle negative and positive examples of emojis
    all_emojis = concat([pos_emojis, neg_emojis]).sample(frac=1,
                                                         random_state=144803)

    # Group all emojis in positive examples by descriptions
    emoji_grouping = pos_emojis.groupby('emoji')['description'].apply(
        lambda x: ', '.join(x))
    grouped_by_description = DataFrame({
        'emoji': emoji_grouping.index,
        'description': emoji_grouping.values
    })

    # Build an emoji vocabulary and map each emoji to an index (beginning from 1)
    emojis = grouped_by_description['emoji'].values
    emoji_to_index = {
        emoji: index + 1
        for emoji, index in zip(emojis, range(len(emojis)))
    }
    index_to_emoji = {index: emoji for emoji, index in emoji_to_index.items()}
    emoji_vocab_size = len(emoji_to_index)
    print('Total number of unique emojis: %d' % emoji_vocab_size)

    # Build a word vocabulary and map each word to an index (beginning from 1)
    descriptions = all_emojis['description'].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(descriptions.tolist())
    word_sequences = tokenizer.texts_to_sequences(descriptions.tolist())
    word_to_index = tokenizer.word_index
    index_to_word = {index: word for word, index in word_to_index.items()}
    word_vocab_size = len(word_to_index)
    print('Total number of unique words found in emoji descriptions: %d' %
          word_vocab_size)

    # Load GloVe word embeddings
    print("Loading GloVe...")
    word2vec_map = utils.load_vectors(glove_filename)

    # Prepare the word-embedding matrix
    embedding_matrix = utils.get_embedding_matrix(word2vec_map,
                                                  word_to_index,
                                                  embedding_dim,
                                                  init_unk=False)
    print('Number of non-existent word-embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))

    # Prepare training data
    train_emoji = np.array(
        [emoji_to_index[emoji] for emoji in all_emojis['emoji'].values])
    train_words = pad_sequences(word_sequences, maxlen=maximum_length)
    labels = np.array([[0, 1] if label == 0 else [1, 0]
                       for label in all_emojis['label'].values])

    print('Shape of emoji data:', train_emoji.shape)
    print('Shape of emoji description data:', train_words.shape)
    print('Shape of label tensor:', labels.shape)
    print('Number of emojis:', emoji_vocab_size)

    # Build the emoji DNN model
    print("Building the emoji2vec model...")
    emoji_model, word_model, model = emoji2vec_model(embedding_matrix,
                                                     emoji_vocab_size,
                                                     word_vocab_size)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])
    print(model.summary())

    # Train a model if one hasn't been trained yet
    if not os.path.exists(emoji2vec_weights):
        print("Training the emoji2vec model...")
        callbacks = [
            ModelCheckpoint(emoji2vec_weights,
                            monitor='val_categorical_accuracy',
                            save_best_only=True)
        ]
        history = model.fit([train_emoji, train_words],
                            labels,
                            epochs=50,
                            validation_split=0.1,
                            verbose=1,
                            callbacks=callbacks)
        # Plot accuracy and loss
        utils.plot_training_statistics(
            history,
            path + "/plots/emoji2vec/emoji2vec_%dd" % embedding_dim,
            also_plot_validation=True,
            acc_mode='categorical_accuracy',
            loss_mode='loss')

    # Load the pre-trained weights and get the embeddings
    print("Loading the trained weights of the emoji2vec model...")
    model.load_weights(emoji2vec_weights)
    weights = emoji_model.layers[0].get_weights()[0]

    # Get the emoji2vec mapping
    emoji2vec = {}
    for e, w in zip(grouped_by_description['emoji'], weights[1:]):
        emoji2vec[e] = w

    # Get the emoji embeddings and save them to file
    if not os.path.exists(emoji2vec_embeddings):
        embeddings = DataFrame(weights[1:])
        embeddings = concat([grouped_by_description['emoji'], embeddings],
                            axis=1)
        embeddings.to_csv(emoji2vec_embeddings,
                          sep=' ',
                          header=False,
                          index=False)

    # Get the t-SNE representation
    if not os.path.exists(emoji2vec_visualization):
        tsne = TSNE(n_components=2, perplexity=30, init='pca', n_iter=5000)
        # Following are the exact tsne settings used in the emoji visualization in the original paper
        # tsne = TSNE(perplexity=50, n_components=2, init='random', n_iter=300000, early_exaggeration=1.0,
        #             n_iter_without_progress=1000)
        trans = tsne.fit_transform(weights)

        # Save the obtained emoji visualization
        visualization = DataFrame(trans[1:], columns=['x', 'y'])
        visualization['emoji'] = grouped_by_description['emoji'].values
        visualization.to_csv(emoji2vec_visualization)

        # Visualize the embeddings as a tsne figure
        visualization.plot('x', 'y', kind='scatter', grid=True)
        plt.savefig(path + '/plots/emoji2vec/tsne_%dd.pdf' % embedding_dim)

    return emoji2vec
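A minimal usage sketch for get_emoji2vec above: it returns an {emoji: embedding} dictionary, so the learned vectors can be inspected directly (the lookups below are illustrative only):

# Illustrative only: inspect the learned emoji embeddings.
emoji2vec = get_emoji2vec()
print("Emojis with embeddings: %d" % len(emoji2vec))
some_emoji = next(iter(emoji2vec))
print(some_emoji, emoji2vec[some_emoji][:5])  # first five dimensions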
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--saved_model', help='path to saved model',default='results/CELoss_both_tenfold0_s256_cnn3_lstm2_d5_lr1e-3_wd1e-5_f11_p5_sum_b2000_h128_e300_v3000.pt')
    parser.add_argument("-c", "--config", help="path to config file", default='conf/mustc.yaml')
    parser.add_argument('-d', '--datasplit',
                        help='optional path to datasplit yaml file to override path specified in config')

    parser.add_argument('-hid', '--hidden_size',
                        help='hidden size for LSTM -- optional, overrides the one in the config')
    parser.add_argument('-f', '--frame_filter_size',
                        help='width of CNN filters -- optional, overrides the one in the config')
    parser.add_argument('-pad', '--frame_pad_size',
                        help='width of CNN padding -- optional, overrides the one in the config')
    parser.add_argument('-cnn', '--cnn_layers',
                        help='number of CNN layers -- optional, overrides the one in the config')
    parser.add_argument('-l', '--lstm_layers',
                        help='number of LSTM layers -- optional, overrides the one in the config')
    parser.add_argument('-dr', '--dropout', help='dropout -- optional, overrides the one in the config')
    parser.add_argument('-wd', '--weight_decay', help='weight decay -- optional, overrides the one in the config')
    parser.add_argument('-lr', '--learning_rate', help='learning rate -- optional, overrides the one in the config')
    parser.add_argument('-flat', '--flatten_method',
                        help='method for flattening tokens -- optional, overrides the one in the config')
    parser.add_argument('-b', '--bottleneck_feats',
                        help='number of bottleneck feats -- optional, overrides the one in the config')
    parser.add_argument('-e', '--embedding_dim',
                        help='embedding dimension -- optional, overrides the one in the config')
    parser.add_argument('-v', '--vocab_size',
                        help='vocab size -- optional, overrides the one in the config')
    parser.add_argument('-s', '--stopword_baseline', action='store_true', default=False)
    parser.add_argument('-o', '--output_file', help='name of output file')
    parser.add_argument('-rs', '--seed', help='random seed -- optional, overrides the one in the config')

    args = parser.parse_args()
    with open(args.config, 'r') as f:
        cfg = yaml.load(f, yaml.FullLoader)

    if args.stopword_baseline:
        print('WARNING: STOPWORD BASELINE')

    cfg2arg = {'datasplit': args.datasplit,
               'frame_filter_size': args.frame_filter_size,
               'frame_pad_size': args.frame_pad_size,
               'cnn_layers': args.cnn_layers,
               'lstm_layers': args.lstm_layers,
               'dropout': args.dropout,
               'weight_decay': args.weight_decay,
               'learning_rate': args.learning_rate,
               'flatten_method': args.flatten_method,
               'bottleneck_feats': args.bottleneck_feats,
               'hidden_size': args.hidden_size,
               'embedding_dim': args.embedding_dim,
               'vocab_size': args.vocab_size,
               'seed': args.seed
               }

    int_args = ['frame_filter_size', 'frame_pad_size', 'cnn_layers', 'lstm_layers', 'bottleneck_feats', 'hidden_size',
                'embedding_dim', 'vocab_size','seed']
    float_args = ['dropout', 'weight_decay', 'learning_rate']


    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if args.datasplit:
        datasplit = args.datasplit
    else:
        datasplit = cfg['datasplit']

    for arg in cfg2arg:
        if cfg2arg[arg]:
            if arg in int_args:
                cfg[arg] = int(cfg2arg[arg])
            elif arg in float_args:
                cfg[arg] = float(cfg2arg[arg])
            else:
                cfg[arg] = cfg2arg[arg]

    seed = cfg['seed']

    with open(cfg['all_data'], 'rb') as f:
        data_dict = pickle.load(f)


    #import pdb;pdb.set_trace()

    # Load text data:
    with open(cfg['datasplit'].replace('yaml', 'vocab'), 'rb') as f:
        vocab_dict = pickle.load(f)
    print(f'Original vocab size: {len(vocab_dict["w2i"])}')

    set_seeds(seed)

    def truncate_dicts(vocab_dict, vocab_size):
        i2w = {}
        w2i = {}
        for i in range(vocab_size + 2):
            if i in vocab_dict['i2w']:
                w = vocab_dict['i2w'][i]
                i2w[i] = w
                w2i[w] = i
            else:
                if cfg['inputs']=='text' or cfg['inputs']=='both':
                    print("WARNING: vocab size is not smaller than actual vocab")
        return w2i, i2w

    w2i, i2w = truncate_dicts(vocab_dict, cfg['vocab_size'])

    if cfg['use_pretrained']:
        print('using pretrained')
        if cfg['embedding_dim']==100:
            glove_path = cfg['glove_path_100']
        elif cfg['embedding_dim'] == 300:
            glove_path = cfg['glove_path_300']

        i2vec = load_vectors(glove_path, w2i)

        weights_matrix = np.zeros((cfg['vocab_size'] + 2, cfg['embedding_dim']))
        for i in i2w:
            try:
                weights_matrix[i] = i2vec[i]

            except:
                weights_matrix[i] = np.random.normal(scale=0.6, size=(cfg['embedding_dim'],))
        weights_matrix = torch.tensor(weights_matrix)
    else:
        weights_matrix = None


    if 'overwrite_speech' in cfg:
        overwrite_speech = cfg['overwrite_speech']
    else:
        overwrite_speech = False

    if 'scramble_speech' in cfg:
        scramble_speech = cfg['scramble_speech']
    else:
        scramble_speech = False

    if 'stopwords_only' in cfg:
        stopwords_only = cfg['stopwords_only']
    else:
        stopwords_only = False

    if 'binary_vocab' in cfg:
        binary_vocab = cfg['binary_vocab']
    else:
        binary_vocab = False

    if 'ablate_feat' in cfg:
        ablate_feat = cfg['ablate_feat']
    else:
        ablate_feat = None

    model = SpeechEncoder(seq_len=cfg['frame_pad_len'],
                          batch_size=cfg['train_params']['batch_size'],
                          lstm_layers=cfg['lstm_layers'],
                          bidirectional=cfg['bidirectional'],
                          num_classes=cfg['num_classes'],
                          dropout=cfg['dropout'],
                          include_lstm=cfg['include_lstm'],
                          tok_level_pred=cfg['tok_level_pred'],
                          feat_dim=cfg['feat_dim'],
                          postlstm_context=cfg['postlstm_context'],
                          device=device,
                          tok_seq_len=cfg['tok_pad_len'],
                          flatten_method=cfg['flatten_method'],
                          frame_filter_size=cfg['frame_filter_size'],
                          frame_pad_size=cfg['frame_pad_size'],
                          cnn_layers=cfg['cnn_layers'],
                          inputs=cfg['inputs'],
                          embedding_dim=cfg['embedding_dim'],
                          vocab_size=cfg['vocab_size'],
                          bottleneck_feats=cfg['bottleneck_feats'],
                          use_pretrained=cfg['use_pretrained'],
                          weights_matrix=weights_matrix)



    model.load_state_dict(torch.load(args.saved_model))

    model.to(device)

    #testset = BurncDataset(cfg, data_dict, w2i, cfg['vocab_size'], mode='test',datasplit=datasplit,
    #                       overwrite_speech=overwrite_speech,stopwords_only=stopwords_only,binary_vocab=binary_vocab,
    #                       ablate_feat=ablate_feat)

    devset = BurncDataset(cfg, data_dict, w2i, cfg['vocab_size'], mode='dev',datasplit=datasplit,
                           overwrite_speech=overwrite_speech,stopwords_only=stopwords_only,binary_vocab=binary_vocab,
                           ablate_feat=ablate_feat)


    ### Stopword baseline here:
    with open(cfg['datasplit'].replace('yaml', 'stop'), 'rb') as f:
        stopword_list = pickle.load(f)

    acc = evaluate(cfg,devset, cfg['eval_params'], model, device,tok_level_pred=cfg['tok_level_pred'],noisy=True,
                   print_predictions=True,vocab_dict=vocab_dict,stopword_baseline=args.stopword_baseline,stopword_list=stopword_list,bootstrap_resample=True)

    
    datasplit_name = os.path.split(datasplit)[-1].split('.')[0]

    out_path = os.path.join(os.path.dirname(args.saved_model),f'{args.output_file}_{datasplit_name}.tsv')
    with open(out_path,'w') as f:
        f.write(f'\tepochs\ttrain_losses\ttrain_accs\tdev_accs\n')
        f.write(f'0\t0\t0\t0\t{acc[0]}')
        print(f'wrote results to {out_path}')
Example #12
import numpy as np
import pandas as pd
from utils import load_vectors, rnd_labeled_data_generator, test_data_generator, conseq_labeled_data_generator

# Load the vectors

# In[3]:

vectors = {
    "auto": "../data/auto_vectors.csv",
    "mus": "../data/mus_vectors.csv"
}

# In[4]:
print("Loading vectors")
manually = True
auto_df = load_vectors(vectors["auto"], manually=manually, verbose=False)
if not manually:
    auto_df["vectors"].apply(lambda a: np.array(eval(a)))

# In[8]:

auto_df.groupby("overall").count()

# Classification with 5 categories, not binary

# In[8]:

auto_df["target"] = pd.get_dummies(auto_df["overall"]).values.tolist()
auto_df["target"] = auto_df["target"].apply(np.array)

# In[9]:
Example #13
def analogy_task():
    try:
        emoji2vec_str = utils.load_vectors(filename=emoji2vec_embeddings)
        # Convert to unicode all emoji entries in the dictionary of emoji embeddings
        emoji2vec = {}
        for k, v in emoji2vec_str.items():
            unicode_emoji = utils.convert_emoji_to_unicode(k)
            emoji2vec[unicode_emoji] = v
        # Get some intuition whether the model is good by seeing what analogies it can make based on what it learnt
        utils.make_analogy("👑", "🚹", "🚺", emoji2vec)  # Crown - Man + Woman
        utils.make_analogy("👑", "👦", "👧", emoji2vec)  # Crown - Boy + Girl
        utils.make_analogy("💵", "🇺🇸", "🇬🇧", emoji2vec)
        utils.make_analogy("💵", "🇺🇸", "🇪🇺", emoji2vec)
        utils.make_analogy("👪", "👦", "👧", emoji2vec)
        utils.make_analogy("🕶", "☀️", "⛈",
                           emoji2vec)  # Sunglasses - Sun + Cloud
        utils.make_analogy("☂", "⛈️", "☀",
                           emoji2vec)  # Umbrella - Clouds + Sun
        utils.make_analogy(
            "🍣", "🏯️", "🏰",
            emoji2vec)  # Sushi - Japanese Castle + European Castle
        utils.make_analogy(
            "👹", "🏯️", "🏰",
            emoji2vec)  # Japanese Ogre - Japanese Castle + European Castle
        utils.make_analogy(
            "🍣", "🗼️", "🗽",
            emoji2vec)  # Sushi - Japanese Tower + Statue of Liberty
        utils.make_analogy(
            "🍣", "🗾️", "🗽",
            emoji2vec)  # Sushi - Map of Japan + Statue of Liberty
        utils.make_analogy(
            "🍣", "🏯️", "🗽",
            emoji2vec)  # Sushi - Japanese Castle + Statue of Liberty
        utils.make_analogy("🐅", "🐈️", "🐕", emoji2vec)  # Jaguar - Cat + Dog
        utils.make_analogy("🐆", "🐈️", "🐕", emoji2vec)  # Leopard - Cat + Dog
        utils.make_analogy("🐭", "🐈️", "🐕", emoji2vec)  # Mouse - Cat + Dog
        utils.make_analogy("🌅", "🌞️", "🌙", emoji2vec)  # Sunrise - Sun + Moon
        utils.make_analogy("🌅", "🌞️", "🌑", emoji2vec)  # Sunrise - Sun + Moon
        utils.make_analogy(
            "🌃", "🌙️", "🌞",
            emoji2vec)  # Night with stars - Moon + Sun With Face
        utils.make_analogy(
            "🌃", "🌑️", "☀",
            emoji2vec)  # Night with stars - Moon + Sun With Face
        utils.make_analogy(
            "🌃", "🌙️️", "☀",
            emoji2vec)  # Night with stars - Moon + Sun With Face
        utils.make_analogy(
            "😴", "💤️", "🏃",
            emoji2vec)  # Sleeping face - sleeping symbol + running
        utils.make_analogy(
            "😴", "🛌️", "🏃",
            emoji2vec)  # Sleeping face - sleeping accommodation + running
        utils.make_analogy(
            "😴", "🛏", "🏃",
            emoji2vec)  # Sleeping face - bed + active symbol (running)
        utils.make_analogy("🏦", "💰", "🏫", emoji2vec)  # Money - Bank + School
        utils.make_analogy("🏦", "💰", "🏥", emoji2vec)  # Money - Bank + Hospital
        utils.make_analogy("💉", "🏥", "🏦",
                           emoji2vec)  # Syringe - Hospital + Bank
        utils.make_analogy("💊", "🏥", "🏦", emoji2vec)  # Pill - Hospital + Bank
        utils.make_analogy("💒", "💍", "👰", emoji2vec)  # Wedding - Ring + Bride
        utils.make_analogy("💒", "💑", "💔",
                           emoji2vec)  # Wedding - Couple + Broken Heart
        utils.make_analogy("💒", "❤", "💔",
                           emoji2vec)  # Wedding - Heart + Broken Heart
        utils.make_analogy("😀", "💰", "🤑",
                           emoji2vec)  # Grinning person - Money + Money Face
        utils.make_analogy("😠", "💰", "🤑",
                           emoji2vec)  # Angry person - Money + Money Face
    except IOError:
        print("Emoji embeddings not found at the provided embeddings file %s. "
              "You have to train them before proceeding to make analogies." %
              emoji2vec_embeddings)
Example #14
def train_emoji2vec():
    # Load the true emoji data
    pos_emojis = read_csv(emoji_positive,
                          sep="\t",
                          engine="python",
                          encoding="utf_8",
                          names=["description", "emoji"])
    pos_emojis["label"] = 0

    # Load the false emoji data (negative examples)
    neg_emojis = read_csv(emoji_negative,
                          sep="\t",
                          engine="python",
                          encoding="utf_8",
                          names=["description", "emoji"])
    neg_emojis["label"] = 1

    print(
        "There are %d true emoji descriptions and %d false emoji descriptions."
        % (len(pos_emojis), len(neg_emojis)))

    # Group all the positive emoji examples by their description
    emoji_grouping = pos_emojis.groupby("emoji")["description"].apply(
        lambda x: ", ".join(x))
    grouped_by_description = DataFrame({
        "emoji": emoji_grouping.index,
        "description": emoji_grouping.values
    })

    # Build an emoji vocabulary and map each emoji to an index (beginning from 1)
    emojis = grouped_by_description["emoji"].values
    emoji_to_index = {
        emoji: index + 1
        for emoji, index in zip(emojis, range(len(emojis)))
    }
    emoji_vocab_size = len(emoji_to_index) + 1
    print("There are %d unique emojis." % (emoji_vocab_size - 1))

    # Concatenate and shuffle negative and positive examples of emojis
    all_emojis = concat([pos_emojis, neg_emojis]).sample(frac=1,
                                                         random_state=150493)

    # Build a word vocabulary and map each word to an index (beginning from 1)
    descriptions = all_emojis["description"].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(descriptions.tolist())
    word_sequences = tokenizer.texts_to_sequences(descriptions.tolist())
    word_to_index = tokenizer.word_index
    word_vocab_size = len(word_to_index) + 1
    print("There are %d unique words in the descriptions." %
          (word_vocab_size - 1))

    # Load GloVe word embeddings
    word_emb = utils.load_vectors(glove_filename)

    # Prepare the word-embedding matrix
    embedding_matrix = utils.get_embedding_matrix(word_emb,
                                                  word_to_index,
                                                  embedding_dim,
                                                  init_unk=False)

    # Prepare training data
    train_emoji = np.array(
        [emoji_to_index[e] for e in all_emojis["emoji"].values])
    print("The emoji tensor shape is ", train_emoji.shape)

    if use_lstm:
        train_words = pad_sequences(word_sequences,
                                    maxlen=seq_length,
                                    padding='post',
                                    truncating='post',
                                    value=0.)
    else:
        train_words = sum_emb(word_sequences, embedding_matrix)
    print("The descriptions tensor shape is ", train_words.shape)

    labels = to_categorical(
        np.asarray([label for label in all_emojis["label"].values]))
    print("The label tensor shape is ", labels.shape)

    # Build the emoji DNN model
    if use_lstm:
        model = emoji2vec_lstm_model(embedding_matrix, emoji_vocab_size,
                                     word_vocab_size, seq_length)
    else:
        model = emoji2vec_model(emoji_vocab_size)
    my_optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.99, decay=0.01)
    model.compile(loss="categorical_crossentropy",
                  optimizer=my_optimizer,
                  metrics=["categorical_accuracy", utils.f1_score])
    print(model.summary())

    plot_model(model,
               to_file=path + '/plots/emoji2vec_' + str(embedding_dim) +
               'd_model_summary.png',
               show_shapes=False,
               show_layer_names=True)

    # Prepare the callbacks and fit the model
    save_best = ModelCheckpoint(monitor='val_categorical_accuracy',
                                save_best_only=True,
                                filepath=emoji2vec_weights)
    reduceLR = ReduceLROnPlateau(monitor='val_categorical_accuracy',
                                 factor=0.1,
                                 patience=3,
                                 verbose=1)
    early_stopping = EarlyStopping(monitor='val_categorical_accuracy',
                                   patience=15,
                                   verbose=1)
    history = model.fit([train_emoji, train_words],
                        labels,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_split=0.1,
                        verbose=1,
                        callbacks=[save_best, reduceLR, early_stopping])

    # Plot accuracy and loss
    utils.plot_training_statistics(history,
                                   "/plots/emoji2vec_%dd" % embedding_dim,
                                   plot_validation=True,
                                   acc_mode="categorical_accuracy",
                                   loss_mode="loss")

    # Get the weights of the trained emoji model
    weights = [
        layer.get_weights()[0] for layer in model.layers
        if layer.name == 'emoji_emb'
    ]
    weights = weights[0]

    # Get the emoji embeddings and save them to file
    embeddings = DataFrame(weights[1:])
    embeddings = concat([grouped_by_description["emoji"], embeddings], axis=1)
    embeddings.to_csv(emoji2vec_embeddings, sep=" ", header=False, index=False)

    # Get the t-SNE representation
    tsne = TSNE(n_components=2, perplexity=30, init="pca", n_iter=5000)
    trans = tsne.fit_transform(weights)

    # Save the obtained emoji visualization
    visualization = DataFrame(trans[1:], columns=["x", "y"])
    visualization["emoji"] = grouped_by_description["emoji"].values
    visualization.to_csv(emoji2vec_visualization)

    # Visualize the embeddings as a t-sne figure
    visualization.plot("x", "y", kind="scatter", grid=True)
    plt.savefig(path + "/plots/tsne_%dd.png" % embedding_dim)
def main():
    parser = argparse.ArgumentParser(description="Running BAT or ECT")
    parser.add_argument(
        "--test_type",
        nargs='?',
        choices=['ECT', 'BAT'],
        help="Specify BAT or ECT depending on which test shall be run",
        required=True)
    parser.add_argument(
        "--protocol_type",
        nargs='?',
        choices=['RT', 'BRD'],
        help=
        "Whether to run test for Reichstagsprotokolle (RT) or Bundestagsprotokolle (BRD)",
        required=True)
    parser.add_argument("--output_file",
                        type=str,
                        default=None,
                        help="File to store the results)",
                        required=True)
    parser.add_argument("--vocab_file",
                        type=str,
                        default=None,
                        help="path to vocab file",
                        required=True)
    parser.add_argument("--vector_file",
                        type=str,
                        default=None,
                        help="path to vector file",
                        required=True)

    args = parser.parse_args()

    if args.test_type not in ['ECT', 'BAT']:
        parser.print_help()
        sys.exit(2)

    vocab = load_vocab(str(vocab_path / args.vocab_file))
    vectors = load_vectors(str(models_path / args.vector_file))
    results = {}
    for test in weat_tests:
        results[test.__name__] = {}
        for dim in DIMENSIONS:
            weat_terms = test(dim, args.protocol_type)
            if args.test_type == 'BAT':
                result = run_bat(vectors, vocab, weat_terms)
                logging.info(f'{test.__name__} - {dim}: {result}')
                results[test.__name__][dim] = result
            elif args.test_type == 'ECT':
                result = run_ect(vectors, vocab, weat_terms)
                logging.info(f'{test.__name__} - {dim}: {result}')
                results[test.__name__][dim] = result

    if args.test_type == 'BAT':
        res_df = pd.DataFrame(results).T.round(3)

    elif args.test_type == 'ECT':
        res_df = pd.DataFrame(index=pd.MultiIndex.from_product(
            [DIMENSIONS, ['corr', 'p']]),
                              columns=results.keys()).T
        for k1, v1 in results.items():
            for k2, v2 in v1.items():
                res_df.loc[k1, (k2, 'corr')] = results[k1][k2].correlation
                res_df.loc[k1, (k2, 'p')] = results[k1][k2].pvalue

    res_df.to_csv(f'{str(ROOT_DIR)}/{args.output_file}.csv',
                  index=True,
                  header=True)
Example #16
import extract_baseline_features
import extract_ml_features2 as extract_features
import utils, classifiers
import data_processing as data_proc

# Settings for the upcoming ML model
pragmatic = True
lexical = True
pos_grams = True
sentiment = True
topic = True
similarity = True
pos_ngram_list = [1]
ngram_list = [1]
embedding_dim = 100
word2vec_map = utils.load_vectors(filename='glove.6B.%dd.txt' % embedding_dim)

# Set the values for the portion of data
n_train = 3000
n_test = 500


def baseline(tweets_train, train_labels, tweets_test, test_labels):
    # Import the subjectivity lexicon
    subj_dict = data_proc.get_subj_lexicon()

    types_of_features = ['1', '2', '3', 'ngrams']
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using feature type " + t)
        if t == '1':
Example #17
def main():
    parser = argparse.ArgumentParser(
        description=
        'Calculates semantic similarity for each pair of words in the specified input file and saves result '
        'to the output file. Uses specified word2vec word vectors file.')
    parser.add_argument('vectors',
                        help='word2vec word vectors file.',
                        default='')
    parser.add_argument(
        'input',
        help=
        'Input file in csv format (with comma as separator). Each line is a pair of words, '
        'for which similarity is calculated.',
        default='')
    parser.add_argument(
        '-output',
        help='Output file in csv format (with comma as separator).',
        default='')
    parser.add_argument(
        '-column',
        help=
        "Name of the output column. If column doesn't exist, it will be added, otherwise "
        "data in existing column will be modified.",
        default='res')
    parser.add_argument('-morph',
                        help="Enable morphology hack.",
                        action='store_true')
    args = parser.parse_args()

    fvec = args.vectors
    fin = args.input
    fout = fin + '-' + os.path.basename(
        fvec) if args.output == '' else args.output
    cout = args.column
    morph_hack = args.morph

    print >> stderr, "Loading vectors from {}".format(fvec)
    vectors = load_vectors(fvec)

    # for words with underscore (ex: берег_v, берег_s, северо_запад, вода_и_медные_трубы)
    prefix2word = {}
    for x in vectors.vocab.iterkeys():
        if '_' not in x:
            continue
        prefix = x.split('_')[0]
        if prefix not in prefix2word:
            prefix2word[prefix] = [x]
        else:
            prefix2word[prefix].append(x)

    # for e in prefix2word.iteritems():
    #     if len(e[1]) > 1:
    #         print >> stderr, ("%s: %s" % (e[0], ':'.join(e[1]))).encode('utf-8')

    print >> stderr, "Calculating similarity for {}; writing result to {}".format(
        fin, fout)
    with open(fin, 'r') as input_file, open(fout, "w") as output_file, open(
            fout + '.log', "w") as log_file:
        inp = csv.DictReader(input_file,
                             delimiter=',',
                             quoting=csv.QUOTE_MINIMAL,
                             quotechar="'")
        # inp_fieldnames = inp.fieldnames
        out_fieldnames = inp.fieldnames if cout in inp.fieldnames else inp.fieldnames + [
            cout
        ]
        out = csv.DictWriter(output_file,
                             out_fieldnames,
                             delimiter=',',
                             quoting=csv.QUOTE_MINIMAL,
                             quotechar="'")
        out.writeheader()
        for linenum, ex in enumerate(inp):
            if linenum % 1000 == 0:
                print 'Lines processed: {}'.format(linenum)

            ex[cout] = sim(vectors, prefix2word, ex['word1'].decode('utf-8'),
                           ex['word2'].decode('utf-8'), log_file, morph_hack)
            out.writerow(ex)
Example #18
## Script that trains an instance of a CATS (or TLT) model

import tensorflow as tf
import model
import serializer
import config
import utils
import numpy as np
import get_data
import pickle
import os

dirname = os.path.dirname(os.path.realpath(__file__))
print("Dirname: " + dirname)

print("Loading word embeddings...")
embs = utils.load_vectors(os.path.join(dirname, config.vecs_path_en))
vocab = utils.load_vocab(os.path.join(dirname, config.vocab_path_en))
print("Loaded.")

print("Defining estimator...")
rconf = tf.estimator.RunConfig(save_checkpoints_steps=config.SAVE_CHECKPOINT_STEPS, 
                               save_checkpoints_secs=None, 
                               model_dir=os.path.join(dirname, config.MODEL_HOME))
print("Defined.")

params = {"padding_value" : vocab["<PAD>"], "wembs" : embs, "vocab" : vocab, "coherence_hinge_margin" : 1, "learning_rate" : 0.0001}
estimator = tf.estimator.Estimator(model_fn=model.model_fn, config=rconf, params=params)

print("Training the model...")
res = estimator.train(input_fn=lambda : get_data.get_data(os.path.join(dirname, config.tfrec_train), is_train = True, epochs = config.EPOCHS))
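The training script above only calls estimator.train. A hedged follow-up sketch showing how the same estimator could be evaluated on a held-out TFRecord split; the config.tfrec_dev key is an assumption, not necessarily present in the project's config:

# Sketch only: evaluate on a dev split, assuming config.tfrec_dev exists.
eval_res = estimator.evaluate(
    input_fn=lambda: get_data.get_data(os.path.join(dirname, config.tfrec_dev),
                                       is_train=False, epochs=1))
print("Evaluation metrics: " + str(eval_res))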
Example #19
def main():
    parser = argparse.ArgumentParser(
        description=
        'Reads words from vector model. Writes to stdout word + similar words and their distances to the original word.'
    )
    parser.add_argument('vectors',
                        help='word2vec word vectors file.',
                        default='')
    parser.add_argument('-output',
                        help='Output file in one-pair-per-line format, gzipped',
                        default='')
    parser.add_argument(
        '-only_letters',
        help=
        'Skip words containing non-letter symbols from stdin / similar words.',
        action="store_true")
    parser.add_argument(
        "-vocab_limit",
        help=
        "Collect neighbours only for specified number of most frequent words. By default use all words.",
        default=None,
        type=int)
    parser.add_argument(
        '-pairs',
        help=
        "Use pairs format: 2 words and distance in each line. Otherwise echo line is a word and all it's neighbours with distances.",
        action="store_true")
    parser.add_argument('-batch-size',
                        help='Batch size for finding neighbours.',
                        default="1000")
    parser.add_argument(
        '-word_freqs',
        help=
        "Weight similar words by frequency. Pass frequency file as parameter",
        default=None)

    args = parser.parse_args()

    fvec = args.vectors
    batch_size = int(args.batch_size)

    print >> stderr, "Vectors: {}, only_letters: {}".format(
        args.vectors, args.only_letters)
    print >> stderr, "Loading vectors from {}".format(fvec)
    tic = time()
    vectors = load_vectors(fvec)
    print >> stderr, "Vectors loaded in %d sec." % (time() - tic)
    print >> stderr, "Vectors shape is: ", vectors.syn0norm.shape

    vocab_size = len(vectors.vocab)
    print("Vocabulary size: %i" % vocab_size)

    # Limit the number of words for which to collect neighbours
    if args.vocab_limit and args.vocab_limit < vocab_size:
        vocab_size = args.vocab_limit

    print("Collect neighbours for %i most frequent words" % vocab_size)

    freq = None
    if args.word_freqs:
        freq_dict = load_freq(args.word_freqs)
        freq = order_freq(vectors, freq_dict)
        print "freqs loaded. Length ", len(freq), freq[:10]

    with gzip.open(args.output, 'wb') if args.output else stdout as out:
        process(out,
                vectors,
                only_letters=args.only_letters,
                vocab_size=vocab_size,
                batch_size=batch_size,
                pairs=args.pairs,
                freq=freq)
    print('  Loading word pairs... Done!\n')
    
    print('  Loading word synsets and building synset vocabulary...')
    word_synsets, synsets = utils.load_word_synsets(
        pos=setup.POS,
        include_named_entities=setup.INCLUDE_NAMED_ENTITIES)
    print('  Loading word synsets and building synset vocabulary... Done! [{} synsets]\n'.format(len(synsets)))

    print('  Loading synset weights...')
    synset_weights = utils.load_synset_weights(synsets)
    print('  Loading synset weights... Done!\n')

    print('  Loading Conception vectors...')
    vectors = utils.load_vectors(
        conception_vectors.value,
        synsets,
        normalize=setup.NORMALIZE_SCORES,
        min_length=setup.MIN_VECTOR_LENGTH,
        max_length=setup.MAX_VECTOR_LENGTH)
    print('  Loading Conception vectors... Done! [{} vectors]\n'.format(len(vectors)))

    utils.evaluate(
        'Conception_{}'.format(args.conception_type),
        word_pairs,
        word_synsets,
        synset_weights,
        vectors,
        setup.SIMILARITY_FUNCTION,
        setup.SCORING_FUNCTION,
        verbose=args.verbose)