import gzip
import os

import cPickle

# ngrams and SynsetToWord are project-local modules (see the class fragment
# and the __main__ block at the bottom of this page).
import ngrams


def make_testing_data(model,
                      relationships=None,
                      vocabulary=None,
                      s2w=None,
                      pct=0.98):
    """Return the held-out tail of the relationship data, keeping only rows
    whose synsets map to at least one in-vocabulary word."""
    if relationships is None:
        # The relationships were pickled alongside the trained model.
        rel_path = os.path.join(model.other_params['base_dir'],
                                'relationships.pkl.gz')
        with gzip.open(rel_path) as f:
            relationships = cPickle.load(f)

    if vocabulary is None:
        # Rebuild the vocabulary from the ngram file recorded at training time.
        reader = ngrams.NgramReader(model.other_params['ngram_filename'],
                                    model.other_params['vocab_size'])
        vocabulary = reader.word_array

    if s2w is None:
        # SynsetToWord maps WordNet synsets to vocabulary indices (defined
        # further down this page).
        s2w = SynsetToWord(vocabulary)

    # Use the first `pct` of the data for training; the rest is held out.
    training_size = int(relationships.N * pct)

    def usable_row(symbolic_row):
        # Keep a row only if both of its synsets map to at least one
        # in-vocabulary word.
        sa, sb, rel = symbolic_row
        return s2w.usable(sa) and s2w.usable(sb)

    # Map index rows to symbolic (synset, synset, relation) triples and keep
    # only the usable ones; in Python 2, filter returns a list.
    testing_data = filter(
        usable_row,
        map(relationships.indices_to_symbolic,
            relationships.data[training_size:]))
    return testing_data
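
A minimal usage sketch, assuming `model` was produced by this project's training code so that `model.other_params` records `base_dir`, `ngram_filename`, and `vocab_size`; the model path is hypothetical, and `load_model` is the project-local loader used in Example #3:

model = load_model('/path/to/model.pkl.gz')  # hypothetical path
testing_data = make_testing_data(model, pct=0.98)
print '%d usable held-out triples' % len(testing_data)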
Example #2
def get_vocab_container(model):
    # Fall back to module defaults when the model predates these parameters.
    ngram_filename = model.other_params.get('ngram_filename',
                                            DEFAULT_NGRAM_FILENAME)
    vocab_size = model.other_params.get('vocab_size', 50000)
    return ngrams.NgramReader(ngram_filename, vocab_size=vocab_size)
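
DEFAULT_NGRAM_FILENAME is a module-level constant that never appears on this page; a usage sketch, assuming it points at the same ngram file as the __main__ block below:

# assumed value, mirroring the path used in the __main__ block below
DEFAULT_NGRAM_FILENAME = '/cl/nldata/books_google_ngrams_eng/5grams_size3.hd5'

reader = get_vocab_container(model)  # NgramReader for the model's ngram file
vocabulary = reader.word_array       # word types, as in Example #1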
Example #3
def get_vocab(model):
    # Prefer the vocabulary pickled alongside the model; fall back to
    # rebuilding an NgramReader from the training parameters.
    try:
        base_dir = model.other_params['base_dir']
        vocabulary_path = os.path.join(base_dir, 'vocabulary.pkl.gz')
        return load_model(vocabulary_path)  # project-local gzip-pickle loader
    except (KeyError, IOError):
        # Missing parameter or missing pickled vocabulary: rebuild instead.
        ngram_filename = model.other_params.get('ngram_filename',
                                                DEFAULT_NGRAM_FILENAME)
        vocab_size = model.other_params.get('vocab_size', 50000)
        return ngrams.NgramReader(ngram_filename, vocab_size=vocab_size)
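
The object returned here is what SynsetToWord (excerpted below) consumes; a sketch of the handoff, with the word_array attribute carried over from Example #1 as an assumption for the NgramReader case:

vocab_container = get_vocab(model)
s2w = SynsetToWord(vocab_container.word_array)  # word_array per Example #1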

# The SynsetToWord fragment below begins mid-method on the original page; the
# class and __init__ headers here are reconstructed, and the WordNet synset
# enumeration is an assumption.
class SynsetToWord(object):
    def __init__(self, vocabulary):
        # vocabulary must support `word in vocabulary` and expose
        # symbol_to_index (word -> integer index).
        self.vocabulary = vocabulary
        from nltk.corpus import wordnet
        self.synsets = list(wordnet.all_synsets())  # assumed source of synsets
        # self.word_token_to_index = dict((word, index)
        #                            for index, word in enumerate(vocabulary))
        # words_from_synset is a project-local helper (not shown on this page).
        self.words_by_synset = {
            synset: [self.vocabulary.symbol_to_index[word]
                     for word in words_from_synset(synset)
                     if word in self.vocabulary]
            for synset in self.synsets
        }

    # def all_words_in_relations(self, rels):
    #     """
    #     rels: wordnet_rels.Relationships
    #     """
    #     words = set()
    #     for row in rels.data:
    #         syn_a, syn_b, rel = rels.indices_to_symbolic(row)
    #         words.update(self.words_by_synset[syn_a])
    #         words.update(self.words_by_synset[syn_b])
    #     return words

    def usable(self, syn):
        # A synset is usable if at least one of its words is in-vocabulary.
        return bool(self.words_by_synset[syn])
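
A quick sketch of the lookup, assuming a vocabulary object that exposes symbol_to_index and membership tests as the __init__ above requires; the synset name is hypothetical:

from nltk.corpus import wordnet

s2w = SynsetToWord(vocab_container)  # needs symbol_to_index + __contains__
print s2w.usable(wordnet.synset('dog.n.01'))  # True iff some lemma is in vocab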

if __name__ == "__main__":
    rels = Relationships()
    import ngrams
    reader = ngrams.NgramReader('/cl/nldata/books_google_ngrams_eng/5grams_size3.hd5', vocab_size=50000)
    s2w = SynsetToWord(reader.word_array)
    # indices_in_relationships = s3w.all_words_in_relations(rels)
    # print '%d words in vocabulary covered by relationships (out of %d)' % (len(indices_in_relationships) , len(reader.word_array))