Example 1
0
        # NOTE(review): excerpt of a larger Python 2 preprocessing routine.
        # word2dfs, Alphabet, add_to_vocab, load_bin_vec, load_data,
        # compute_overlap_features, train, dev, test, outdir, questions and
        # answers are all defined outside this fragment.
        # Debug output: a sample of the word -> document-frequency mapping.
        print word2dfs.items()[:10]
        #########

        # Build the word-index vocabulary; the first id (0) is reserved for
        # the unknown-word token.
        alphabet = Alphabet(start_feature_id=0)
        alphabet.add('UNKNOWN_WORD_IDX')

        # Register every token from both answers and questions.
        add_to_vocab(answers, alphabet)
        add_to_vocab(questions, alphabet)

        basename = os.path.basename(train)
        # Persist the vocabulary. NOTE(review): opened in text mode ('w');
        # works for cPickle's default protocol 0 on Python 2 — confirm no
        # binary protocol is expected by the loader.
        cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
        print "alphabet", len(alphabet)

        embeddings_location = 'embeddings/aquaint+wiki.txt.gz.ndim=50.bin'

        # Load pre-trained 50-dim word vectors, restricted to vocab words.
        word2vec = load_bin_vec(embeddings_location, alphabet.keys())

        # Next unassigned feature id doubles as the padding/dummy word index.
        dummy_word_idx = alphabet.fid

        # Longest question/answer in tokens — presumably used to size padded
        # matrices downstream (TODO confirm against the truncated code below).
        q_max_sent_length = max(map(lambda x: len(x), questions))
        a_max_sent_length = max(map(lambda x: len(x), answers))
        print 'q_max_sent_length', q_max_sent_length
        print 'a_max_sent_length', a_max_sent_length

        # Convert the train, dev and test sets (the loop includes train,
        # despite what an earlier comment suggested).
        for fname in [train, dev, test]:
            print fname
            # qids, questions, answers, labels = load_data(fname, stoplist)
            qids, questions, answers, labels = load_data(fname)

            # NOTE(review): this call is truncated in the excerpt — the
            # remaining arguments are outside the visible fragment.
            overlap_feats = compute_overlap_features(questions,
Example 2
0
    # NOTE(review): excerpt of a larger Python 2 routine. df_train, df_test,
    # Alphabet, add_to_vocab, know_dict, rng, ndim, random_word_count, np,
    # Pickle and vocab_index_file are defined outside this fragment.
    stopwords = []

    # Vocabulary with index 0 reserved for the unknown-word token.
    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX_0')

    vocab_dict = {}

    # Collect every question/answer token from both splits into the alphabet.
    for crous in [df_train, df_test]:
        add_to_vocab(crous['question'], alphabet)
        add_to_vocab(crous['answer'], alphabet)

    print alphabet.fid
    # Build the embedding matrix row by row: words found in know_dict get
    # their pre-trained vector; unseen words get a uniform random vector in
    # [-0.25, 0.25]. temp_vec accumulates the sum of all rows for centering.
    temp_vec = 0
    vocab_array = np.zeros((alphabet.fid, ndim), dtype='float32')
    for index in alphabet.keys():  # 'index' is actually a word/token key
        vec = know_dict.get(index, None)
        if vec is None:
            vec = rng.uniform(-0.25, 0.25, ndim)
            vec = list(vec)
            vec = np.array(vec, dtype='float32')
            random_word_count[0] += 1  # tally of randomly initialised words
        if alphabet[index] == 0:
            # The unknown-word slot (row 0) is forced to all-zeros.
            vec = np.zeros(ndim)
        temp_vec += vec
        vocab_array[alphabet[index]] = vec
    # Mean-centre the matrix: subtract the average row vector from every row.
    # NOTE(review): this also shifts the deliberately-zeroed row 0 away from
    # zero — confirm that is intended.
    temp_vec /= len(vocab_array)
    for index, _ in enumerate(vocab_array):
        vocab_array[index] -= temp_vec

    # Persist the vocabulary index in binary mode.
    Pickle.dump(alphabet, open(vocab_index_file, 'wb'))