import os
import gzip
import cPickle
from os.path import join

import ngrams


def make_testing_data(model, relationships=None, vocabulary=None, s2w=None, pct=0.98):
    if relationships is None:
        rel_path = os.path.join(model.other_params['base_dir'], 'relationships.pkl.gz')
        with gzip.open(rel_path) as f:
            relationships = cPickle.load(f)
    if vocabulary is None:
        reader = ngrams.NgramReader(model.other_params['ngram_filename'],
                                    model.other_params['vocab_size'])
        vocabulary = reader.word_array
    if s2w is None:
        s2w = SynsetToWord(vocabulary)
    # hold out the last (1 - pct) fraction of the relationship data for testing;
    # the cast must wrap the product so the slice index is an int
    training_size = int(relationships.N * pct)

    def usable_row(symbolic_row):
        sa, sb, rel = symbolic_row
        return s2w.usable(sa) and s2w.usable(sb)

    return filter(usable_row,
                  map(relationships.indices_to_symbolic,
                      relationships.data[training_size:]))
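# Usage sketch (the model path is hypothetical; assumes load_model unpickles
# a model whose other_params recorded base_dir, ngram_filename, vocab_size):
#   model = load_model('/path/to/model.pkl.gz')
#   testing_data = make_testing_data(model, pct=0.98)
#   # each element is a (synset_a, synset_b, relation) triple in which both
#   # synsets have at least one word in the model's vocabulary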
def get_vocab_container(model):
    # fall back to defaults for models saved before these params were recorded
    ngram_filename = model.other_params.get('ngram_filename', DEFAULT_NGRAM_FILENAME)
    vocab_size = model.other_params.get('vocab_size', 50000)
    return ngrams.NgramReader(ngram_filename, vocab_size=vocab_size)
def get_vocab(model):
    # prefer the vocabulary pickled alongside the model, if one exists;
    # otherwise rebuild an NgramReader exactly as get_vocab_container does
    try:
        base_dir = model.other_params['base_dir']
        vocabulary_path = join(base_dir, 'vocabulary.pkl.gz')
        return load_model(vocabulary_path)
    except (KeyError, IOError):
        return get_vocab_container(model)
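# Usage sketch: both helpers recover the vocabulary a trained model was built
# with; get_vocab prefers a pickled vocabulary.pkl.gz under the model's
# base_dir and degrades to the raw NgramReader (paths here are hypothetical):
#   model = load_model('/path/to/model.pkl.gz')
#   vocab = get_vocab(model)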
        # self.word_token_to_index = dict((word, index)
        #                                 for index, word in enumerate(vocabulary))
        self.words_by_synset = {
            synset: [self.vocabulary.symbol_to_index[word]
                     for word in words_from_synset(synset)
                     if word in self.vocabulary]
            for synset in self.synsets
        }

    # def all_words_in_relations(self, rels):
    #     """
    #     rels: wordnet_rels.Relationships
    #     """
    #     words = set()
    #     for row in rels.data:
    #         syn_a, syn_b, rel = rels.indices_to_symbolic(row)
    #         words.update(self.words_by_synset[syn_a])
    #         words.update(self.words_by_synset[syn_b])
    #     return words

    def usable(self, syn):
        # a synset is usable iff at least one of its words is in the vocabulary
        return bool(self.words_by_synset[syn])

if __name__ == "__main__":
    rels = Relationships()
    import ngrams
    reader = ngrams.NgramReader('/cl/nldata/books_google_ngrams_eng/5grams_size3.hd5',
                                vocab_size=50000)
    s2w = SynsetToWord(reader.word_array)
    # indices_in_relationships = s2w.all_words_in_relations(rels)
    # print '%d words in vocabulary covered by relationships (out of %d)' % (
    #     len(indices_in_relationships), len(reader.word_array))
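    # Quick coverage check (an illustrative sketch, not part of the original
    # demo; assumes SynsetToWord indexes standard NLTK WordNet synsets):
    # from nltk.corpus import wordnet
    # print 'dog.n.01 usable:', s2w.usable(wordnet.synset('dog.n.01'))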