def _create_parseset_n(self, set_number):
    source_file_path = os.path.join(
        os.path.dirname(__file__),
        '../../testresources/simpleparsesets/simpleparseset{}.txt'.format(set_number))
    destination_file_path = os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(set_number))

    line_index = 0
    sentences = []
    with codecs.open(source_file_path, mode='r', encoding='utf-8') as src:
        entries_for_sentence = []
        for line in src:
            print u'Processing line {}'.format(line_index)
            line_index += 1
            # lines read from a file keep their trailing newline, so strip
            # before testing for blank lines
            if not line.strip():
                continue
            elif line.startswith(END_OF_SENTENCE_MARKER):
                sentence_binding = self.parseset_creator.create_sentence_binding_from_morpheme_containers(entries_for_sentence)
                sentences.append(sentence_binding)
                entries_for_sentence = []
            elif line.startswith("#"):
                continue
            else:
                word_part = line[:line.find('=')].strip()
                parse_result_part = line[line.find('=') + 1:].strip()
                parse_result_matching_simple_parseset = self._find_parse_result_matching_simple_parseset(word_part, parse_result_part)
                entries_for_sentence.append((word_part, parse_result_matching_simple_parseset))

    parseset_binding = ParseSetBinding()
    parseset_binding.sentences = sentences
    parseset_dom = parseset_binding.to_dom()
    parseset_dom.setAttribute("xmlns", xmlbindings.NAMESPACE)

    with codecs.open(destination_file_path, mode='w', encoding='utf-8') as output:
        output.write(PARSESET_HEADER)
        output.write('\n')
        output.write(parseset_dom.toprettyxml())
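# A hedged sketch of the simpleparseset input this method expects, inferred from
# the branches above: END_OF_SENTENCE_MARKER closes a sentence, '#' lines are
# comments, and every other line is 'surface = parse result'. Note the marker
# check precedes the comment check, so the marker may itself begin with '#'.
# The marker text and parse-result notation below are illustrative assumptions,
# not taken from the source:
#
#   # sentence 1
#   kitabı = kitap+Noun+A3sg+Pnon+Acc
#   okudum = oku+Verb+Pos+Past+A1sg
#   #END#OF#SENTENCE#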
def _test_calculate_with_parseset_n(self, parseset_index, leading_context_size, following_context_size):
    start_time = datetime.datetime.today()

    self.calculator = self.create_calculator(parseset_index)

    dom = parse(os.path.join(
        os.path.dirname(__file__),
        '../../../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    self.parse_set_word_list = []
    for sentence in parseset.sentences:
        self.parse_set_word_list.extend(sentence.words)

    self._test_generate_likelihoods(leading_context_size, following_context_size)

    end_time = datetime.datetime.today()
    print u'Done in {} seconds for {} words'.format(end_time - start_time, len(self.parse_set_word_list) - 1)
    print u'Average in {} seconds'.format((end_time - start_time) / (len(self.parse_set_word_list) - 1))
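# Note: end_time - start_time is a datetime.timedelta, so the two messages above
# actually print values like '0:00:12.345678' rather than a bare seconds count;
# the same applies to the contextful variant of this method further down. A
# minimal sketch of printing true seconds instead (total_seconds() is available
# from Python 2.7 on):
#
#   elapsed = (end_time - start_time).total_seconds()
#   print u'Done in {} seconds for {} words'.format(elapsed, len(self.parse_set_word_list) - 1)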
def _create_unigrams_for_parseset_n(self, parseset_index):
    print "Parsing parse set {} and generating unigrams with occurrence counts".format(parseset_index)

    dom = parse(os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    print "Found {} sentences".format(len(parseset.sentences))

    words = [word for sentence in parseset.sentences for word in sentence.words]
    print "Found {} words".format(len(words))
    print "Found {} parsable words".format(
        len(filter(lambda word: not isinstance(word, UnparsableWordBinding), words)))

    generator = WordNGramGenerator(1)

    collection = self.db['wordUnigrams{}'.format(parseset_index)]
    # delete everything in the collection
    collection.remove({})

    bulk_insert_buffer = []
    for unigram in generator.iter_ngrams(words):
        entity = {'item_0': unigram}
        bulk_insert_buffer.append(entity)
        # flush the buffer in chunks of BULK_INSERT_SIZE
        if len(bulk_insert_buffer) % self.BULK_INSERT_SIZE == 0:
            collection.insert(bulk_insert_buffer)
            bulk_insert_buffer = []

    # flush the remainder; guard against an empty list, which pymongo rejects
    if bulk_insert_buffer:
        collection.insert(bulk_insert_buffer)

    self._inspect_unigrams_for_parseset_n(parseset_index)
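# A minimal usage sketch for the collection written above, assuming the same
# pymongo 2.x API as the insert/remove calls; reading occurrence counts back via
# find().count() is an assumption, not taken from the source:
#
#   collection = self.db['wordUnigrams{}'.format(parseset_index)]
#   total = collection.count()
#   occurrences_of_one_unigram = collection.find({'item_0': some_unigram}).count()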
def setUpClass(cls):
    dom = parse(os.path.join(os.path.dirname(__file__), 'concordance_sample_parseset.xml'))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    word_list = []
    for sentence in parseset.sentences:
        word_list.extend(sentence.words)
    cls.word_list = word_list
def setUpClass(cls):
    super(StatisticalParserTest, cls).setUpClass()
    all_roots = []

    lexemes = LexiconLoader.load_from_file(
        os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    parseset_index = "001"
    dom = parse(os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    parse_set_word_list = []
    for sentence in parseset.sentences:
        parse_set_word_list.extend(sentence.words)

    complete_word_concordance_index = CompleteWordConcordanceIndex(parse_set_word_list)

    cls.parser = StatisticalParser(contextless_parser, complete_word_concordance_index)
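# The suffix graph above is composed as a decorator chain: BasicSuffixGraph
# supplies the core morphotactics and each wrapper layers its own transitions on
# top before initialize() builds the combined graph. A smaller chain assembled
# the same way, as a sketch using only constructors already present in this
# setup (whether a partial chain suffices for a given test is an assumption):
#
#   numeral_graph = NumeralSuffixGraph(BasicSuffixGraph())
#   numeral_graph.initialize()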
def _validate_concordances_for_parse_set_n(self, parseset_index):
    dom = parse(os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    word_list = []
    for sentence in parseset.sentences:
        word_list.extend(sentence.words)

    self._validate_complete_word_concordance_indexes(word_list)
    self._validate_root_concordance_indexes(word_list)
    self._validate_lemma_concordance_indexes(word_list)
    self._validate_transition_word_concordance_indexes(word_list)
    self._validate_transition_matched_word_concordance_indexes(word_list)
def _test_calculate_with_parseset_n(self, parseset_index, leading_context_size, following_context_size):
    start_time = datetime.datetime.today()

    self.contextful_morphological_parser = self.create_contextful_morphological_parser(parseset_index)

    dom = parse(os.path.join(
        os.path.dirname(__file__),
        '../../../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    self.parse_set_word_list = []
    for sentence in parseset.sentences:
        self.parse_set_word_list.extend(sentence.words)

    self._test_generate_likelihoods(leading_context_size, following_context_size)

    end_time = datetime.datetime.today()
    print u'Done in {} seconds for {} words'.format(end_time - start_time, len(self.parse_set_word_list) - 1)
    print u'Average in {} seconds'.format((end_time - start_time) / (len(self.parse_set_word_list) - 1))
def _create_trigrams_for_parseset_n(self, parseset_index):
    print "Parsing parse set {} and generating trigrams with occurrence counts".format(parseset_index)

    dom = parse(os.path.join(
        os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    print "Found {} sentences".format(len(parseset.sentences))

    words = [word for sentence in parseset.sentences for word in sentence.words]
    print "Found {} words".format(len(words))
    print "Found {} parsable words".format(
        len(filter(lambda word: not isinstance(word, UnparsableWordBinding), words)))

    generator = WordNGramGenerator(3)

    collection = self.db['wordTrigrams{}'.format(parseset_index)]
    # delete everything in the collection
    collection.remove({})

    bulk_insert_buffer = []
    for trigram in generator.iter_ngrams(words):
        entity = {
            'item_0': trigram[0],
            'item_1': trigram[1],
            'item_2': trigram[2]
        }
        bulk_insert_buffer.append(entity)
        # flush the buffer in chunks of BULK_INSERT_SIZE
        if len(bulk_insert_buffer) % self.BULK_INSERT_SIZE == 0:
            collection.insert(bulk_insert_buffer)
            bulk_insert_buffer = []

    # flush the remainder; guard against an empty list, which pymongo rejects
    if bulk_insert_buffer:
        collection.insert(bulk_insert_buffer)

    trigram_count = collection.count()
    print "Generated {} trigrams".format(trigram_count)
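# Each stored trigram document has the shape {'item_0': w0, 'item_1': w1, 'item_2': w2},
# so context counts can be read back by matching a prefix of the tuple. A sketch,
# again assuming the pymongo 2.x API used above (the query itself is an
# assumption, not taken from the source):
#
#   followers_of_pair = collection.find({'item_0': w0, 'item_1': w1}).count()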
""" import os import unittest from xml.dom.minidom import parse from hamcrest import * from trnltk.morphology.contextless.parser.parser import ContextlessMorphologicalParser from trnltk.morphology.contextless.parser.rootfinder import WordRootFinder from trnltk.statistics.morphemecontainerstats import MorphemeContainerContextlessProbabilityGenerator from trnltk.morphology.lexicon.lexiconloader import LexiconLoader from trnltk.morphology.lexicon.rootgenerator import RootGenerator, RootMapGenerator from trnltk.morphology.morphotactics.basicsuffixgraph import BasicSuffixGraph from trnltk.parseset.xmlbindings import ParseSetBinding from trnltk.statistics.suffixtransitionstats import SuffixTransitionProbabilityGenerator dom = parse(os.path.join(os.path.dirname(__file__), '../../morphology/contextful/likelihoodmetrics/wordformcollocation/test/morphology_contextless_statistics_sample_parseset.xml')) parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0]) parse_set_word_list = [] for sentence in parseset.sentences: parse_set_word_list.extend(sentence.words) class MorphemeContainerContextlessProbabilityGeneratorWithContainersTest(unittest.TestCase): @classmethod def setUpClass(cls): super(MorphemeContainerContextlessProbabilityGeneratorWithContainersTest, cls).setUpClass() all_roots = [] lexicon_lines = u''' duvar tutku saç oğul [A:LastVowelDrop]
        verify(wrapped_calculator).calculate_oneway_likelihood(
            target, [mock_context_item_2], True, calculation_context)
        verify(wrapped_calculator).calculate_oneway_likelihood(
            target, [mock_context_item_1, mock_context_item_2], True, calculation_context)
        verify(wrapped_calculator).calculate_oneway_likelihood(
            target, [mock_context_item_0, mock_context_item_1, mock_context_item_2], True, calculation_context)

        verifyNoMoreInteractions(wrapped_calculator)


dom = parse(os.path.join(os.path.dirname(__file__), 'morphology_contextless_statistics_sample_parseset.xml'))
parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
parse_set_word_list = []
for sentence in parseset.sentences:
    parse_set_word_list.extend(sentence.words)


class InterpolatingLikelihoodCalculatorCalculationContextTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super(InterpolatingLikelihoodCalculatorCalculationContextTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),