def _create_trigrams_for_parseset_n(self, parseset_index):
    """Rebuild the word-trigram collection for one parse set.

    Parses ``../../testresources/parsesets/parseset<parseset_index>.xml``
    (resolved relative to this file), builds a ``ParseSetBinding`` from its
    first ``<parseset>`` element, flattens all words of all sentences and
    feeds them to a ``WordNGramGenerator(3)``. Every generated trigram is
    stored as one document with the keys ``item_0``, ``item_1`` and
    ``item_2`` in the ``wordTrigrams<parseset_index>`` collection of
    ``self.db``. The collection is emptied first, and documents are written
    in batches of ``self.BULK_INSERT_SIZE``. Progress figures are printed
    to stdout.

    :param parseset_index: value substituted into both the parse set file
        name and the target collection name.
    """
    print("Parsing parse set {} and generating trigrams with occurrence counts".format(
        parseset_index))

    parseset_path = os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index))
    dom = parse(parseset_path)
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])

    print("Found {} sentences".format(len(parseset.sentences)))

    words = [word for sentence in parseset.sentences for word in sentence.words]
    print("Found {} words".format(len(words)))

    # Count without materialising a filtered list; unlike len(filter(...))
    # this also works where filter() returns an iterator.
    parsable_word_count = sum(
        1 for word in words if not isinstance(word, UnparsableWordBinding))
    print("Found {} parsable words".format(parsable_word_count))

    generator = WordNGramGenerator(3)

    collection = self.db['wordTrigrams{}'.format(parseset_index)]

    # delete everything in the collection
    collection.remove({})

    bulk_insert_buffer = []
    for trigram in generator.iter_ngrams(words):
        bulk_insert_buffer.append({
            'item_0': trigram[0],
            'item_1': trigram[1],
            'item_2': trigram[2],
        })
        # The buffer is reset after each flush, so reaching the batch size
        # is the only way its length can become a multiple of it.
        if len(bulk_insert_buffer) >= self.BULK_INSERT_SIZE:
            collection.insert(bulk_insert_buffer)
            bulk_insert_buffer = []

    # Flush the remainder. An empty bulk insert is rejected by the driver,
    # so skip it when the trigram count was an exact multiple of the batch
    # size (or when no trigrams were generated at all).
    if bulk_insert_buffer:
        collection.insert(bulk_insert_buffer)

    trigram_count = collection.count()
    print("Generated {} trigrams".format(trigram_count))