def setUp(self):
    self.parseset_creator = ParseSetCreator()

    # Load the master dictionary and generate every root for each lexeme.
    all_roots = []
    lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map = RootMapGenerator().generate(all_roots)

    # Full suffix graph: copula, numeral and proper-noun support layered over the basic graph.
    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    # Root finders for plain words, digit/text numerals, and proper nouns
    # with and without an apostrophe.
    word_root_finder = WordRootFinder(root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    self.parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])
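# A minimal usage sketch, not part of the original suite: `parse` is the entry
# point exercised by the script fragment at the end of this section. The sample
# word and the expectation are illustrative assumptions; `greater_than`, like
# the other matchers used here, comes from hamcrest.
def test_parses_upper_case_word_sketch(self):
    results = self.parser.parse(u'Kitap')
    assert_that(results, has_length(greater_than(0)))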
@classmethod
def setUpClass(cls):
    super(InterpolatingLikelihoodCalculatorCalculationContextTest, cls).setUpClass()

    all_roots = []
    lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    # N-gram collections keyed by n. pymongo.Connection is the legacy client
    # (replaced by MongoClient in newer pymongo releases).
    mongodb_connection = pymongo.Connection(host='127.0.0.1')
    cls.collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams999'],
        2: mongodb_connection['trnltk']['wordBigrams999'],
        3: mongodb_connection['trnltk']['wordTrigrams999']
    }

    database_index_builder = DatabaseIndexBuilder(cls.collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(cls.collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    sequence_likelihood_calculator = UniformSequenceLikelihoodCalculator()

    wrapped_generator = ContextParsingLikelihoodCalculator(database_index_builder,
        target_form_given_context_counter, ngram_frequency_smoother, sequence_likelihood_calculator)

    cls.generator = InterpolatingLikelihoodCalculator(wrapped_generator)
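# Hypothetical usage sketch: the query API of InterpolatingLikelihoodCalculator
# is not shown in these fragments, so the method name `calculate_likelihood`,
# its signature, and the context shapes below are assumptions for illustration,
# not the recorded trnltk API.
def test_interpolated_likelihood_sketch(self):
    target = self.contextless_parser.parse(u'elma')[0]          # one candidate parse of the target word
    leading_context = [self.contextless_parser.parse(u'bir')]   # assumed: parses of the preceding word
    likelihood = self.generator.calculate_likelihood(target, leading_context, [])  # hypothetical call
    assert_that(likelihood, greater_than_or_equal_to(0.0))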
@classmethod
def setUpClass(cls):
    super(StatisticalParserTest, cls).setUpClass()

    all_roots = []
    lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    # Note: unlike the other fixtures, this graph has no proper-noun layer.
    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    # Build a concordance index over all words of parseset 001
    # (`parse` here is assumed to be xml.dom.minidom.parse).
    parseset_index = "001"
    dom = parse(os.path.join(os.path.dirname(__file__),
        '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])

    parse_set_word_list = []
    for sentence in parseset.sentences:
        parse_set_word_list.extend(sentence.words)

    complete_word_concordance_index = CompleteWordConcordanceIndex(parse_set_word_list)

    cls.parser = StatisticalParser(contextless_parser, complete_word_concordance_index)
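# Hypothetical usage sketch: StatisticalParser pairs the contextless parser
# with the word concordance index, presumably to rank candidate parses by
# observed frequency in the parseset. The `parse` method name mirrors the
# contextless parser's API and is an assumption here.
def test_statistical_parse_sketch(self):
    result = self.parser.parse(u'elmayı')   # hypothetical call on an illustrative word
    assert_that(result, is_not(none()))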
@classmethod
def setUpClass(cls):
    super(ParserTestWithProperNouns, cls).setUpClass()
    cls.root_map = dict()

    # Only the proper-noun layer over the basic graph; no predefined paths.
    suffix_graph = ProperNounSuffixGraph(BasicSuffixGraph())
    suffix_graph.initialize()

    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.parser = ContextlessMorphologicalParser(suffix_graph, None,
        [proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])
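# A minimal usage sketch, assuming the attested `parse` entry point: with only
# the proper-noun root finders wired in, a capitalized apostrophe form should
# yield at least one parse. The word and the expectation are illustrative
# assumptions, not recorded fixtures.
def test_parses_apostrophe_proper_noun_sketch(self):
    results = self.parser.parse(u"Ankara'ya")
    assert_that(results, has_length(greater_than(0)))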
@classmethod
def setUpClass(cls):
    super(_LikelihoodCalculatorTest, cls).setUpClass()

    all_roots = []
    lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
    cls.collection_map = {
        1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
        2: cls.mongodb_connection['trnltk']['wordBigrams999'],
        3: cls.mongodb_connection['trnltk']['wordTrigrams999']
    }

    # Base-class fixture: concrete subclasses are expected to assign a real
    # likelihood calculator here.
    cls.generator = None
@classmethod
def setUpClass(cls):
    super(TransitionGeneratorTest, cls).setUpClass()

    all_roots = []
    lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
    for di in lexemes:
        all_roots.extend(RootGenerator.generate(di))

    root_map_generator = RootMapGenerator()
    cls.root_map = root_map_generator.generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    word_root_finder = WordRootFinder(cls.root_map)
    digit_numeral_root_finder = DigitNumeralRootFinder()
    text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
    proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
    proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

    cls.parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
        [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
         proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    cls.transition_generator = TransitionGenerator(cls.parser)
class ProperNounFromApostropheRootFinderTest(unittest.TestCase):
    def setUp(self):
        self.root_finder = ProperNounFromApostropheRootFinder()

    def test_should_recognize_abbreviations(self):
        roots = self.root_finder.find_roots_for_partial_input(u"TR'")
        assert_that(roots[0].str, equal_to(u'TR'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"MB'")
        assert_that(roots[0].str, equal_to(u'MB'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"POL'")
        assert_that(roots[0].str, equal_to(u'POL'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"KAFA1500'")
        assert_that(roots[0].str, equal_to(u'KAFA1500'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"1500KAFA'")
        assert_that(roots[0].str, equal_to(u'1500KAFA'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"İŞÇĞÜÖ'")
        assert_that(roots[0].str, equal_to(u'İŞÇĞÜÖ'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        # Purely numeric input yields no abbreviation root.
        roots = self.root_finder.find_roots_for_partial_input(u"123'")
        assert_that(roots, has_length(0))

    def test_should_recognize_proper_nouns(self):
        roots = self.root_finder.find_roots_for_partial_input(u"Ahmet'")
        assert_that(roots[0].str, equal_to(u'Ahmet'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Mehmed'")
        assert_that(roots[0].str, equal_to(u'Mehmed'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"A123a'")
        assert_that(roots[0].str, equal_to(u'A123a'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"AvA'")
        assert_that(roots[0].str, equal_to(u'AvA'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"AAxxAA'")
        assert_that(roots[0].str, equal_to(u'AAxxAA'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"İstanbul'")
        assert_that(roots[0].str, equal_to(u'İstanbul'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Çanakkale'")
        assert_that(roots[0].str, equal_to(u'Çanakkale'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Ömer'")
        assert_that(roots[0].str, equal_to(u'Ömer'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Şaban'")
        assert_that(roots[0].str, equal_to(u'Şaban'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Ümmühan'")
        assert_that(roots[0].str, equal_to(u'Ümmühan'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        # Lowercase-initial or digit-initial inputs are not recognized as proper nouns.
        roots = self.root_finder.find_roots_for_partial_input(u"aaa'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"aAAAA'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"1aa'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"a111'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"şaa'")
        assert_that(roots, has_length(0))
# This script fragment starts mid-stream: `lexemes` is assumed to have been
# loaded earlier via LexiconLoader, and `all_roots` initialized to an empty
# list, following the same pattern as the fixtures above.
for di in lexemes:
    all_roots.extend(CircumflexConvertingRootGenerator.generate(di))

root_map_generator = RootMapGenerator()
root_map = root_map_generator.generate(all_roots)

suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
suffix_graph.initialize()

predefined_paths = PredefinedPaths(root_map, suffix_graph)
predefined_paths.create_predefined_paths()

word_root_finder = WordRootFinder(root_map)
text_numeral_root_finder = TextNumeralRootFinder(root_map)
digit_numeral_root_finder = DigitNumeralRootFinder()
proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
    [word_root_finder, text_numeral_root_finder, digit_numeral_root_finder,
     proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

# The sentence to parse is taken from the command line as UTF-8 (Python 2 style).
sentence = sys.argv[1].decode('utf-8')

for word in sentence.split():
    lst = parser.parse(word)
    # Collect the distinct lowercased roots among this word's parses.
    root_set = set()
    for element in lst:
        formatted = formatter.format_morpheme_container_for_parseset(element)
        root = formatted[:formatted.index('+')]
        root_set.add(root.lower())
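# Example invocation (the script name is an assumption; the script expects the
# sentence as a single UTF-8 command-line argument):
#
#   python parse_roots_sketch.py "Ali eve geldi"
#
# A natural follow-up, not in the original fragment, would be to report the
# distinct roots found for each word inside the loop, e.g. (Python 2):
#
#   print word, u', '.join(sorted(root_set))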