# Python 2 preprocessing script: build the word vocabulary, load the
# pre-trained embeddings, and convert the train/dev/test splits.
import os
import cPickle

print word2dfs.items()[:10]

#########
alphabet = Alphabet(start_feature_id=0)
alphabet.add('UNKNOWN_WORD_IDX')
add_to_vocab(answers, alphabet)
add_to_vocab(questions, alphabet)

basename = os.path.basename(train)
cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
print "alphabet", len(alphabet)

embeddings_location = 'embeddings/aquaint+wiki.txt.gz.ndim=50.bin'
word2vec = load_bin_vec(embeddings_location, alphabet.keys())

# The next free feature id is reserved for the padding/dummy word.
dummy_word_idx = alphabet.fid

q_max_sent_length = max(map(lambda x: len(x), questions))
a_max_sent_length = max(map(lambda x: len(x), answers))
print 'q_max_sent_length', q_max_sent_length
print 'a_max_sent_length', a_max_sent_length

# Convert the train, dev and test sets.
for fname in [train, dev, test]:
    print fname
    # qids, questions, answers, labels = load_data(fname, stoplist)
    qids, questions, answers, labels = load_data(fname)
    # The original snippet is truncated after the first argument; the
    # call presumably also takes the answers (and possibly word2dfs):
    overlap_feats = compute_overlap_features(questions, answers)
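# The snippet above relies on helpers defined elsewhere in the project
# (Alphabet, add_to_vocab, load_bin_vec, load_data, ...). Here is a
# minimal sketch of what Alphabet and add_to_vocab might look like,
# assuming Alphabet is a dict-style word -> integer-id mapping with a
# running feature-id counter `fid`; this is an illustration, not the
# project's actual implementation.
class Alphabet(dict):
    def __init__(self, start_feature_id=0):
        self.fid = start_feature_id

    def add(self, item):
        # Assign the next free id to an unseen item; return the id.
        idx = self.get(item)
        if idx is None:
            idx = self.fid
            self[item] = idx
            self.fid += 1
        return idx


def add_to_vocab(sentences, alphabet):
    # Each sentence is assumed to be a list of tokens.
    for sentence in sentences:
        for token in sentence:
            alphabet.add(token)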
# Python 2: build an embedding matrix aligned with the alphabet ids,
# mean-centre it, and persist the alphabet.
import cPickle

import numpy as np

stopwords = []
alphabet = Alphabet(start_feature_id=0)
alphabet.add('UNKNOWN_WORD_IDX_0')
vocab_dict = {}
for corpus in [df_train, df_test]:
    add_to_vocab(corpus['question'], alphabet)
    add_to_vocab(corpus['answer'], alphabet)
print alphabet.fid

temp_vec = 0
vocab_array = np.zeros((alphabet.fid, ndim), dtype='float32')
for word in alphabet.keys():
    vec = know_dict.get(word, None)
    if vec is None:
        # Out-of-vocabulary word: sample a random embedding and count it.
        vec = list(rng.uniform(-0.25, 0.25, ndim))
        random_word_count[0] += 1
    vec = np.array(vec, dtype='float32')
    # Keep the reserved id 0 (UNKNOWN_WORD_IDX_0) as the zero vector.
    if alphabet[word] == 0:
        vec = np.zeros(ndim)
    temp_vec += vec
    vocab_array[alphabet[word]] = vec

# Subtract the mean vector from every embedding (mean-centring).
temp_vec /= len(vocab_array)
for index, _ in enumerate(vocab_array):
    vocab_array[index] -= temp_vec

cPickle.dump(alphabet, open(vocab_index_file, 'wb'))
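# A small usage sketch (hypothetical tokens): once the alphabet and the
# mean-centred matrix exist, a tokenised sentence can be mapped to ids
# and then to its embedding rows via plain NumPy fancy indexing.
unknown_idx = alphabet['UNKNOWN_WORD_IDX_0']
tokens = ['what', 'is', 'the', 'capital']
ids = [alphabet.get(tok, unknown_idx) for tok in tokens]
sent_embeddings = vocab_array[ids]  # shape: (len(tokens), ndim)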