def testIntVocabulary(self):
    """Integer categories get ids in first-seen order; repeats reuse their id."""
    v = categorical_vocabulary.CategoricalVocabulary()
    # First occurrence of each category mints the next id (0 is reserved).
    self.assertEqual(v.get(1), 1)
    self.assertEqual(v.get(3), 2)
    self.assertEqual(v.get(2), 3)
    # Seen-again category keeps its original id.
    self.assertEqual(v.get(3), 2)
    # This vocab doesn't handle nan specially.
    self.assertEqual(v.get(float('nan')), 4)
    # Length counts the reserved id 0 plus the four categories above.
    self.assertEqual(len(v), 5)
def testCountsTrim(self):
    """trim() drops categories whose counts fall outside [min, max)."""
    v = categorical_vocabulary.CategoricalVocabulary()
    # Register categories, then accumulate counts for them.
    v.get('c')
    v.add('c', 5)
    v.get('a')
    v.add('a', 10)
    # not in vocab yet, skips.
    v.add('b', 5)
    v.add('d', 12)
    # Keep only categories with count in the [7, 11) window, then freeze.
    v.trim(7, 11)
    v.freeze()
    # Trimmed-out categories map to the reserved id 0 once frozen.
    self.assertEqual(v.get('b'), 0)
    self.assertEqual(v.get('c'), 0)
    self.assertEqual(len(v), 2)
    self.assertEqual(v.get('a'), 1)
def testWordVocabulary(self):
    """String categories get stable first-seen ids on repeated lookups."""
    v = categorical_vocabulary.CategoricalVocabulary()
    # New words receive consecutive ids starting at 1.
    self.assertEqual(v.get('a'), 1)
    self.assertEqual(v.get('b'), 2)
    # Looking the same words up again returns the same ids.
    self.assertEqual(v.get('a'), 1)
    self.assertEqual(v.get('b'), 2)
qi_test = dh5.get_queries(dset='test') dt_test = dh5.get_doc_ids(dset='test') print("Loading queries and docs {}".format(time() - t0)) print '%d train examples' % len(qi_train) print '%d valid examples' % len(qi_valid) print '%d test examples' % len(qi_test) #print 'qi_train',qi_train #print 'dt_train',dt_train # Build vocabulary t0 = time() word2vec_vocab = pkl.load(open(cfg['data']['pretrained_embedding_path'], "rb")) #374557*500 dim_emb_orig = word2vec_vocab.values()[0].shape[0] print("Loading word2vec vocabulary in {}".format(time() - t0)) categorical_voc = categorical_vocabulary.CategoricalVocabulary() for key in word2vec_vocab: categorical_voc.add(key) cfg['data']['vocab_size'] = len(word2vec_vocab.keys()) + 1 vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length=cfg['data']['max_words_input'], vocabulary=categorical_voc) print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) W = np.array(word2vec_vocab.values(), dtype='f') if cfg['data']['embedding_dim'] < dim_emb_orig: pca = PCA(n_components=cfg['data']['embedding_dim'], copy=False, whiten=True) W = pca.fit_transform(W) W0 = np.random.rand(