def _create_unigrams_for_parseset_n(self, parseset_index):
    print "Parsing parse set {} and generating unigrams with occurrence counts".format(parseset_index)

    # parse the test parse set XML and bind it to Python objects
    dom = parse(os.path.join(os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    print "Found {} sentences".format(len(parseset.sentences))

    words = [word for sentence in parseset.sentences for word in sentence.words]
    print "Found {} words".format(len(words))
    print "Found {} parsable words".format(
        len(filter(lambda word: not isinstance(word, UnparsableWordBinding), words)))

    generator = WordNGramGenerator(1)
    collection = self.db['wordUnigrams{}'.format(parseset_index)]

    # delete everything in the collection
    collection.remove({})

    # insert unigrams in batches of BULK_INSERT_SIZE
    bulk_insert_buffer = []
    for unigram in generator.iter_ngrams(words):
        entity = {'item_0': unigram}
        bulk_insert_buffer.append(entity)
        if len(bulk_insert_buffer) == self.BULK_INSERT_SIZE:
            collection.insert(bulk_insert_buffer)
            bulk_insert_buffer = []

    # flush the remaining entities; skip the insert if the buffer is empty
    if bulk_insert_buffer:
        collection.insert(bulk_insert_buffer)

    self._inspect_unigrams_for_parseset_n(parseset_index)
def _create_trigrams_for_parseset_n(self, parseset_index):
    print "Parsing parse set {} and generating trigrams with occurrence counts".format(parseset_index)

    # parse the test parse set XML and bind it to Python objects
    dom = parse(os.path.join(os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    print "Found {} sentences".format(len(parseset.sentences))

    words = [word for sentence in parseset.sentences for word in sentence.words]
    print "Found {} words".format(len(words))
    print "Found {} parsable words".format(
        len(filter(lambda word: not isinstance(word, UnparsableWordBinding), words)))

    generator = WordNGramGenerator(3)
    collection = self.db['wordTrigrams{}'.format(parseset_index)]

    # delete everything in the collection
    collection.remove({})

    # insert trigrams in batches of BULK_INSERT_SIZE
    bulk_insert_buffer = []
    for trigram in generator.iter_ngrams(words):
        entity = {
            'item_0': trigram[0],
            'item_1': trigram[1],
            'item_2': trigram[2]
        }
        bulk_insert_buffer.append(entity)
        if len(bulk_insert_buffer) == self.BULK_INSERT_SIZE:
            collection.insert(bulk_insert_buffer)
            bulk_insert_buffer = []

    # flush the remaining entities; skip the insert if the buffer is empty
    if bulk_insert_buffer:
        collection.insert(bulk_insert_buffer)

    trigram_count = collection.count()
    print "Generated {} trigrams".format(trigram_count)
def _create_mock_container(self, word):
    if isinstance(word, UnparsableWordBinding):
        print u'Previous word is unparsable, skipped : {}'.format(word.str)
        return None

    surface_str, surface_syntactic_category = word.str, word.syntactic_category
    stem_str, stem_syntactic_category, stem_secondary_syntactic_category = WordNGramGenerator._get_stem(word)
    lemma_root_str, lemma_root_syntactic_category = word.root.lemma_root, word.root.syntactic_category

    # append the secondary syntactic category, when present, to the primary one
    if word.secondary_syntactic_category:
        surface_syntactic_category += u'_' + word.secondary_syntactic_category
    if stem_secondary_syntactic_category:
        stem_syntactic_category += u'_' + stem_secondary_syntactic_category
    if word.root.secondary_syntactic_category:
        lemma_root_syntactic_category += u'_' + word.root.secondary_syntactic_category

    return MockMorphemeContainerBuilder.builder(word.format(), surface_str, surface_syntactic_category) \
        .stem(stem_str, stem_syntactic_category) \
        .lexeme(lemma_root_str, lemma_root_syntactic_category) \
        .build()