Example No. 1
 def testIntVocabulary(self):
     """Integer categories get ids in first-seen order; lookups are stable."""
     vocab = categorical_vocabulary.CategoricalVocabulary()
     # Repeated lookups of an already-seen category return its original id.
     for category, expected_id in [(1, 1), (3, 2), (2, 3), (3, 2)]:
         self.assertEqual(vocab.get(category), expected_id)
     # This vocab doesn't handle nan specially: it just gets the next id.
     self.assertEqual(vocab.get(float('nan')), 4)
     self.assertEqual(len(vocab), 5)
Example No. 2
 def testCountsTrim(self):
     """trim() drops categories outside the count window; frozen lookups of
     unknown categories return 0."""
     vocab = categorical_vocabulary.CategoricalVocabulary()
     # Register 'c' and 'a' (get() assigns them ids), then add counts.
     vocab.get('c')
     vocab.add('c', 5)
     vocab.get('a')
     vocab.add('a', 10)
     # 'b' and 'd' were never retrieved, so these adds are skipped.
     vocab.add('b', 5)
     vocab.add('d', 12)
     # Trim to the count window (7, 11), then freeze the mapping.
     vocab.trim(7, 11)
     vocab.freeze()
     # Both the skipped and the trimmed category now map to 0 (unknown).
     for unknown in ('b', 'c'):
         self.assertEqual(vocab.get(unknown), 0)
     self.assertEqual(len(vocab), 2)
     self.assertEqual(vocab.get('a'), 1)
Example No. 3
 def testWordVocabulary(self):
     """String categories get stable ids (starting at 1) in first-seen order."""
     vocab = categorical_vocabulary.CategoricalVocabulary()
     for word, expected_id in [('a', 1), ('b', 2), ('a', 1), ('b', 2)]:
         self.assertEqual(vocab.get(word), expected_id)
Example No. 4
# NOTE(review): this chunk is Python 2 (bare `print` statements,
# `dict.values()[0]` indexing). `dh5`, `t0`, `qi_train`, `qi_valid`,
# `cfg`, `pkl`, `time`, `categorical_vocabulary`, `learn`, `np` and
# `PCA` are presumably defined/imported earlier in the file — confirm.
qi_test = dh5.get_queries(dset='test')
dt_test = dh5.get_doc_ids(dset='test')
print("Loading queries and docs {}".format(time() - t0))
print '%d train examples' % len(qi_train)
print '%d valid examples' % len(qi_valid)
print '%d test examples' % len(qi_test)
#print 'qi_train',qi_train
#print 'dt_train',dt_train

# Build vocabulary
t0 = time()
# Load the pretrained word -> vector mapping from a pickle file.
word2vec_vocab = pkl.load(open(cfg['data']['pretrained_embedding_path'],
                               "rb"))  #374557*500
# Original embedding dimensionality, read from an arbitrary entry.
dim_emb_orig = word2vec_vocab.values()[0].shape[0]
print("Loading word2vec vocabulary in {}".format(time() - t0))
# Register every pretrained word so the VocabularyProcessor maps
# tokens to the same indices as the embedding matrix rows.
categorical_voc = categorical_vocabulary.CategoricalVocabulary()
for key in word2vec_vocab:
    categorical_voc.add(key)
# +1 presumably reserves index 0 for unknown/padding — confirm.
cfg['data']['vocab_size'] = len(word2vec_vocab.keys()) + 1
vocab_processor = learn.preprocessing.VocabularyProcessor(
    max_document_length=cfg['data']['max_words_input'],
    vocabulary=categorical_voc)
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))

# Stack the pretrained vectors into the embedding matrix W (float32).
W = np.array(word2vec_vocab.values(), dtype='f')
# Optionally reduce the embedding dimension with whitened PCA.
if cfg['data']['embedding_dim'] < dim_emb_orig:
    pca = PCA(n_components=cfg['data']['embedding_dim'],
              copy=False,
              whiten=True)
    W = pca.fit_transform(W)
# NOTE(review): the next statement is truncated in this chunk.
W0 = np.random.rand(