Beispiel #1
0
def main():
    outdir = "preprocessed_data"
    out_file = 'vocal_wembext.pickle'
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_multilingual300M', ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    alphabet.add('DUMMY_WORD_IDX')
    dummy_word_idx = alphabet.get('DUMMY_WORD_IDX')

    for token in word2vec.keys():
        alphabet.add(token)

    print 'Alphabet before purge:', len(alphabet)
    cPickle.dump(alphabet, open(os.path.join(outdir, out_file), 'wb'))