Example #1
0
    def _create_trigrams_for_parseset_n(self, parseset_index):
        """Rebuild the word trigram collection for one parse set.

        Parses ``parseset<parseset_index>.xml`` from the test resources
        directory (resolved relative to this file), flattens the words of all
        its sentences into a single list, empties the
        ``wordTrigrams<parseset_index>`` collection of ``self.db`` and refills
        it with one document per trigram produced by ``WordNGramGenerator(3)``.
        Documents are written in batches of ``self.BULK_INSERT_SIZE``.
        Progress figures are printed along the way.

        :param parseset_index: value substituted into both the parse set file
            name and the name of the target collection.
        """
        # Single-argument parenthesised prints emit exactly the same text
        # under the Python 2 print statement and the Python 3 function.
        print(
            "Parsing parse set {} and generating trigrams with occurrence counts"
            .format(parseset_index))

        parseset_path = os.path.join(
            os.path.dirname(__file__),
            '../../testresources/parsesets/parseset{}.xml'.format(
                parseset_index))
        dom = parse(parseset_path)
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])

        print("Found {} sentences".format(len(parseset.sentences)))
        words = [
            word for sentence in parseset.sentences for word in sentence.words
        ]
        print("Found {} words".format(len(words)))

        # Count with a generator instead of len(filter(...)): filter() only
        # returns a list on Python 2, and no intermediate list is needed.
        parsable_word_count = sum(
            1 for word in words
            if not isinstance(word, UnparsableWordBinding))
        print("Found {} parsable words".format(parsable_word_count))

        generator = WordNGramGenerator(3)

        collection = self.db['wordTrigrams{}'.format(parseset_index)]

        # Start from an empty collection so that results of earlier runs do
        # not accumulate.
        collection.remove({})

        pending = []
        for trigram in generator.iter_ngrams(words):
            pending.append({
                'item_0': trigram[0],
                'item_1': trigram[1],
                'item_2': trigram[2],
            })
            if len(pending) >= self.BULK_INSERT_SIZE:
                collection.insert(pending)
                pending = []

        # Flush the remainder only when there is one. The buffer is empty when
        # the trigram count is zero or an exact multiple of BULK_INSERT_SIZE,
        # and an empty bulk insert is rejected by the driver.
        # NOTE(review): assumes self.db is a PyMongo database, whose legacy
        # Collection.insert raises InvalidOperation for an empty list - confirm.
        if pending:
            collection.insert(pending)

        trigram_count = collection.count()
        print("Generated {} trigrams".format(trigram_count))