def test_should_generate_with_last_vowel_drop(self):
    """LastVowelDrop lexemes yield two roots: the full surface form expecting a
    consonant-initial suffix, and the vowel-dropped stem expecting a vowel-initial one."""
    entry = Lexeme(u"ağız", u"ağız", SyntacticCategory.NOUN, None,
                   {LexemeAttribute.LastVowelDrop})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u"ağız", entry, {PhoneticExpectation.ConsonantStart},
             {LLCont, LVB, LLC, LLNotVless, LVU})))
    assert_that(roots, has_item(
        Root(u"ağz", entry, {PhoneticExpectation.VowelStart},
             {LLCont, LVB, LLC, LLNotVless, LVU})))

    # Voicing combines with the vowel drop: "ahit" -> "ahd" before vowels.
    entry = Lexeme(u"ahit", u"ahit", SyntacticCategory.NOUN, None,
                   {LexemeAttribute.LastVowelDrop, LexemeAttribute.Voicing})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u"ahit", entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(roots, has_item(
        Root(u"ahd", entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLVless, LLC, LVU})))
def test_should_generate_with_voicing(self):
    """Voicing lexemes get an unvoiced root for consonant-initial suffixes and a
    voiced-final root for vowel-initial ones (t->d, k->ğ, k->g, p->b)."""
    entry = Lexeme(u"armut", u"armut", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u'armut', entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVR})))
    assert_that(roots, has_item(
        Root(u'armud', entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVB, LLC, LLVless, LVR})))

    entry = Lexeme(u"kapak", u"kapak", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u'kapak', entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(roots, has_item(
        Root(u'kapağ', entry, {PhoneticExpectation.VowelStart},
             {LLCont, LVB, LLC, LLVless, LVU})))

    entry = Lexeme(u"cenk", u"cenk", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u'cenk', entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(roots, has_item(
        Root(u'ceng', entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLC, LLVless, LVU})))

    entry = Lexeme(u"kap", u"kap", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u'kap', entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(roots, has_item(
        Root(u'kab', entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVB, LLC, LLVless, LVU})))
def test_should_generate_verbs_with_voicing_and_novoicing(self):
    """A verb with Voicing yields two roots (git/gid); a verb with no
    attributes yields a single unmodified root."""
    entry = Lexeme(u"gitmek", u"git", SyntacticCategory.VERB, None, {LexemeAttribute.Voicing})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u'git', entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(roots, has_item(
        Root(u'gid', entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLC, LLVless, LVU})))

    entry = Lexeme(u"sürtmek", u"sürt", SyntacticCategory.VERB, None, None)
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(1))
    assert_that(roots, has_item(
        Root(u'sürt', entry, None,
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVR})))
def test_should_generate_with_doubling(self):
    """Doubling lexemes duplicate the final consonant before vowel-initial
    suffixes ("hac" -> "hacc"); combined with Voicing, "ret" -> "redd"."""
    entry = Lexeme(u"hac", u"hac", SyntacticCategory.NOUN, None, {LexemeAttribute.Doubling})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u"hac", entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LLVStop, LVB, LLC, LLNotVless, LVU})))
    assert_that(roots, has_item(
        Root(u"hacc", entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LLVStop, LVB, LLC, LLNotVless, LVU})))

    entry = Lexeme(u"ret", u"ret", SyntacticCategory.NOUN, None,
                   {LexemeAttribute.Voicing, LexemeAttribute.Doubling})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u"ret", entry, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(roots, has_item(
        Root(u"redd", entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLC, LLVless, LVU})))
def test_should_generate_with_inverse_harmony(self):
    """InverseHarmony lexemes produce a single root whose attributes carry the
    front-vowel marker (LVF) even though the surface form is unchanged."""
    entry = Lexeme(u"kemal", u"kemal", SyntacticCategory.NOUN, None,
                   {LexemeAttribute.InverseHarmony})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(1))
    assert_that(roots, has_item(
        Root(u"kemal", entry, None, {LLCont, LVF, LLC, LLNotVless, LVU})))

    entry = Lexeme(u"kanaat", u"kanaat", SyntacticCategory.NOUN, None,
                   {LexemeAttribute.NoVoicing, LexemeAttribute.InverseHarmony})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(1))
    assert_that(roots, has_item(
        Root(u"kanaat", entry, None, {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
def test_should_generate_with_no_modifiers(self):
    """Lexemes without root-changing attributes produce exactly one root equal to the lemma."""
    entry = Lexeme(u"elma", u"elma", SyntacticCategory.NOUN, None, None)
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(1))
    assert_that(roots, has_item(
        Root(u'elma', entry, None, {LLNotCont, LLV, LVB, LLNotVless, LVU})))

    # NoVoicing suppresses the second (voiced) root for "kek".
    entry = Lexeme(u"kek", u"kek", SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(1))
    assert_that(roots, has_item(
        Root(u'kek', entry, None, {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
def test_should_generate_verbs_with_voicing_and_novoicing(self):
    """Voicing verbs split into consonant-start and vowel-start roots; plain verbs do not."""
    lex = Lexeme(u"gitmek", u"git", SyntacticCategory.VERB, None, {LexemeAttribute.Voicing})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u'git', lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(generated, has_item(
        Root(u'gid', lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLC, LLVless, LVU})))

    lex = Lexeme(u"sürtmek", u"sürt", SyntacticCategory.VERB, None, None)
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(1))
    assert_that(generated, has_item(
        Root(u'sürt', lex, None,
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVR})))
def test_should_generate_with_last_vowel_drop(self):
    """LastVowelDrop lexemes yield the full form (consonant-start expectation)
    and the vowel-dropped form (vowel-start expectation)."""
    lex = Lexeme(u"ağız", u"ağız", SyntacticCategory.NOUN, None,
                 {LexemeAttribute.LastVowelDrop})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u"ağız", lex, {PhoneticExpectation.ConsonantStart},
             {LLCont, LVB, LLC, LLNotVless, LVU})))
    assert_that(generated, has_item(
        Root(u"ağz", lex, {PhoneticExpectation.VowelStart},
             {LLCont, LVB, LLC, LLNotVless, LVU})))

    # LastVowelDrop + Voicing: "ahit" becomes "ahd" before vowel-initial suffixes.
    lex = Lexeme(u"ahit", u"ahit", SyntacticCategory.NOUN, None,
                 {LexemeAttribute.LastVowelDrop, LexemeAttribute.Voicing})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u"ahit", lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(generated, has_item(
        Root(u"ahd", lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLVless, LLC, LVU})))
def test_should_generate_with_doubling(self):
    """Doubling duplicates the final consonant before vowels; with Voicing the
    doubled consonant is also voiced ("ret" -> "redd")."""
    lex = Lexeme(u"hac", u"hac", SyntacticCategory.NOUN, None, {LexemeAttribute.Doubling})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u"hac", lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LLVStop, LVB, LLC, LLNotVless, LVU})))
    assert_that(generated, has_item(
        Root(u"hacc", lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LLVStop, LVB, LLC, LLNotVless, LVU})))

    lex = Lexeme(u"ret", u"ret", SyntacticCategory.NOUN, None,
                 {LexemeAttribute.Voicing, LexemeAttribute.Doubling})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u"ret", lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(generated, has_item(
        Root(u"redd", lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLC, LLVless, LVU})))
def setUp(self):
    """Build an upper-case-supporting contextless parser over the master dictionary
    plus a fresh ParseSetCreator for every test."""
    self.parseset_creator = ParseSetCreator()

    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    self.parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)
def setUpClass(cls):
    """One-time setup: build a contextless parser over the master dictionary and
    wrap it in a TransitionGenerator."""
    super(TransitionGeneratorTest, cls).setUpClass()

    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    cls.parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths, root_finders)
    cls.transition_generator = TransitionGenerator(cls.parser)
def setUpClass(cls):
    """One-time setup: contextless parser over a tiny inline lexicon (no
    predefined paths, word-root finder only)."""
    super(MorphemeContainerContextlessProbabilityGeneratorWithContainersTest,
          cls).setUpClass()

    # NOTE(review): the line breaks of this inline lexicon were reconstructed
    # from a whitespace-mangled source; each entry is assumed to be one line
    # and the loader is expected to strip per-line whitespace — confirm.
    lexicon_lines = u'''
        duvar
        tutku
        saç
        oğul [A:LastVowelDrop]
        demek [A:RootChange, Passive_In, Passive_InIl]
        bu [P:Det]
    '''.strip().splitlines()

    all_roots = []
    for lexeme in LexiconLoader.load_from_lines(lexicon_lines):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = BasicSuffixGraph()
    suffix_graph.initialize()

    word_root_finder = WordRootFinder(cls.root_map)
    cls.contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, None, [word_root_finder])
def setUpClass(cls):
    """One-time setup: contextless parser over the master dictionary plus
    MongoDB ngram collections (999 suffix); cls.generator is filled in by subclasses."""
    super(_LikelihoodCalculatorTest, cls).setUpClass()

    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../../../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    cls.contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
    cls.collection_map = {
        1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
        2: cls.mongodb_connection['trnltk']['wordBigrams999'],
        3: cls.mongodb_connection['trnltk']['wordTrigrams999']
    }

    cls.generator = None
def setUpClass(cls):
    """One-time setup: a minimal contextless parser backed by a handful of
    inline lexicon entries."""
    super(MorphemeContainerContextlessProbabilityGeneratorWithContainersTest, cls).setUpClass()

    # NOTE(review): line breaks of the inline lexicon reconstructed from a
    # whitespace-mangled source; one entry per line assumed — confirm the
    # loader strips per-line whitespace.
    lexicon_lines = u'''
        duvar
        tutku
        saç
        oğul [A:LastVowelDrop]
        demek [A:RootChange, Passive_In, Passive_InIl]
        bu [P:Det]
    '''.strip().splitlines()

    all_roots = []
    for lexeme in LexiconLoader.load_from_lines(lexicon_lines):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = BasicSuffixGraph()
    suffix_graph.initialize()

    cls.contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, None, [WordRootFinder(cls.root_map)])
def setUpClass(cls):
    """One-time setup: full parser over the master dictionary, MongoDB ngram
    collections, and an interpolating likelihood calculator as cls.generator."""
    super(InterpolatingLikelihoodCalculatorCalculationContextTest, cls).setUpClass()

    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../../../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    mongodb_connection = pymongo.Connection(host='127.0.0.1')
    cls.collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams999'],
        2: mongodb_connection['trnltk']['wordBigrams999'],
        3: mongodb_connection['trnltk']['wordTrigrams999']
    }

    database_index_builder = DatabaseIndexBuilder(cls.collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(cls.collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    sequence_likelihood_calculator = UniformSequenceLikelihoodCalculator()

    wrapped_generator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)
    cls.generator = InterpolatingLikelihoodCalculator(wrapped_generator)
def setUpClass(cls):
    """One-time setup: load the master dictionary and keep the generated root map."""
    super(ParserTestWithExtendedGraph, cls).setUpClass()

    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))

    cls._org_root_map = RootMapGenerator().generate(all_roots)
def setUpClass(cls):
    """One-time setup: build a root map from two hard-coded dictionary entries."""
    super(FormatterTest, cls).setUpClass()

    dictionary_content = ["kitap", "yapmak"]
    all_roots = []
    for lexeme in LexiconLoader.load_from_lines(dictionary_content):
        all_roots.extend(RootGenerator.generate(lexeme))

    cls.root_map = RootMapGenerator().generate(all_roots)
def test_should_generate_with_no_modifiers(self):
    """A plain lexeme (or one with NoVoicing) generates exactly one root."""
    lex = Lexeme(u"elma", u"elma", SyntacticCategory.NOUN, None, None)
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(1))
    assert_that(generated, has_item(
        Root(u'elma', lex, None, {LLNotCont, LLV, LVB, LLNotVless, LVU})))

    lex = Lexeme(u"kek", u"kek", SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(1))
    assert_that(generated, has_item(
        Root(u'kek', lex, None, {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
def create(cls, master_dictionary_path, ngram_collection_map):
    """
    Build a fully wired ContextfulMorphologicalParser from a dictionary file
    and a set of ngram collections.

    @type master_dictionary_path: str or unicode
    @param ngram_collection_map: list<Collection>
    @rtype ContextfulMorphologicalParser
    """
    # Roots and root map from the master dictionary.
    all_roots = []
    for lexeme in LexiconLoader.load_from_file(master_dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    root_map = RootMapGenerator().generate(all_roots)

    # Suffix graph with copula, numeral and proper-noun extensions.
    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    # Likelihood machinery on top of the ngram collections.
    database_index_builder = DatabaseIndexBuilder(ngram_collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(ngram_collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

    collocation_metric_calculator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)
    interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(
        collocation_metric_calculator)

    cached_contextless_distribution_smoother = CachedContextlessDistributionSmoother()
    contextless_distribution_metric_calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter,
        cached_contextless_distribution_smoother)

    contextful_likelihood_calculator = ContextfulLikelihoodCalculator(
        interpolating_collocation_metric_calculator,
        contextless_distribution_metric_calculator)
    # Close the construction cycle: the sequence calculator needs a back
    # reference to the contextful calculator it is part of.
    sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

    return ContextfulMorphologicalParser(contextless_parser, contextful_likelihood_calculator)
def setUpClass(cls):
    """One-time setup: contextless parser over the master dictionary plus a
    concordance index from parseset 001, combined into a StatisticalParser."""
    super(StatisticalParserTest, cls).setUpClass()

    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    # Note: no ProperNounSuffixGraph layer in this test's graph.
    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    parseset_index = "001"
    dom = parse(os.path.join(os.path.dirname(__file__),
                             '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])

    parse_set_word_list = []
    for sentence in parseset.sentences:
        parse_set_word_list.extend(sentence.words)
    complete_word_concordance_index = CompleteWordConcordanceIndex(parse_set_word_list)

    cls.parser = StatisticalParser(contextless_parser, complete_word_concordance_index)
def test_should_generate_with_inverse_harmony(self):
    """InverseHarmony lexemes yield a single root marked with the front-vowel attribute."""
    lex = Lexeme(u"kemal", u"kemal", SyntacticCategory.NOUN, None,
                 {LexemeAttribute.InverseHarmony})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(1))
    assert_that(generated, has_item(
        Root(u"kemal", lex, None, {LLCont, LVF, LLC, LLNotVless, LVU})))

    lex = Lexeme(u"kanaat", u"kanaat", SyntacticCategory.NOUN, None,
                 {LexemeAttribute.NoVoicing, LexemeAttribute.InverseHarmony})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(1))
    assert_that(generated, has_item(
        Root(u"kanaat", lex, None, {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
def create_calculator(cls, parseset_index):
    """Wire a ContextfulLikelihoodCalculator against the MongoDB ngram
    collections of the given parse set; also stores the contextless parser on cls."""
    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    mongodb_connection = pymongo.Connection(host='127.0.0.1')
    collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams{}'.format(parseset_index)],
        2: mongodb_connection['trnltk']['wordBigrams{}'.format(parseset_index)],
        3: mongodb_connection['trnltk']['wordTrigrams{}'.format(parseset_index)]
    }

    database_index_builder = DatabaseIndexBuilder(collection_map)
    target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(collection_map)
    ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

    collocation_metric_calculator = ContextParsingLikelihoodCalculator(
        database_index_builder, target_form_given_context_counter,
        ngram_frequency_smoother, sequence_likelihood_calculator)
    interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(
        collocation_metric_calculator)

    contextless_distribution_metric_calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter)

    contextful_likelihood_calculator = ContextfulLikelihoodCalculator(
        interpolating_collocation_metric_calculator,
        contextless_distribution_metric_calculator)
    # Back-reference closes the construction cycle for the sequence calculator.
    sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

    return contextful_likelihood_calculator
def test_should_generate_with_progressive_vowel_drop(self):
    """ProgressiveVowelDrop verbs yield the full root plus a vowel-dropped root
    that expects a vowel-initial suffix ("ata" / "at")."""
    entry = Lexeme(u"atamak", u"ata", SyntacticCategory.VERB, None,
                   {LexemeAttribute.ProgressiveVowelDrop})
    roots = RootGenerator.generate(entry)
    assert_that(roots, has_length(2))
    assert_that(roots, has_item(
        Root(u"ata", entry, None, {LLNotCont, LVB, LLV, LLNotVless, LVU})))
    assert_that(roots, has_item(
        Root(u"at", entry, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))
def setUpClass(cls):
    """One-time setup: root map from the master dictionary plus a basic suffix graph."""
    super(PredefinedPathsTest, cls).setUpClass()

    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    cls.morpheme_container_map = {}

    cls.suffix_graph = BasicSuffixGraph()
    cls.suffix_graph.initialize()
def test_should_generate_with_voicing(self):
    """Voicing produces two roots per lexeme: original form before consonants,
    voiced-final form before vowels (armud, kapağ, ceng, kab)."""
    lex = Lexeme(u"armut", u"armut", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u'armut', lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVR})))
    assert_that(generated, has_item(
        Root(u'armud', lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVB, LLC, LLVless, LVR})))

    lex = Lexeme(u"kapak", u"kapak", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u'kapak', lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(generated, has_item(
        Root(u'kapağ', lex, {PhoneticExpectation.VowelStart},
             {LLCont, LVB, LLC, LLVless, LVU})))

    lex = Lexeme(u"cenk", u"cenk", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u'cenk', lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(generated, has_item(
        Root(u'ceng', lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVF, LLC, LLVless, LVU})))

    lex = Lexeme(u"kap", u"kap", SyntacticCategory.NOUN, None, {LexemeAttribute.Voicing})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u'kap', lex, {PhoneticExpectation.ConsonantStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))
    assert_that(generated, has_item(
        Root(u'kab', lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVB, LLC, LLVless, LVU})))
def setUpClass(cls):
    """One-time setup: build the root map and an initialized BasicSuffixGraph."""
    super(PredefinedPathsTest, cls).setUpClass()

    all_roots = []
    for lexeme in LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../resources/master_dictionary.txt')):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    cls.morpheme_container_map = {}

    cls.suffix_graph = BasicSuffixGraph()
    cls.suffix_graph.initialize()
def setUpClass(cls):
    """One-time setup: contextless parser plus MongoDB ngram collections
    (suffix 999); subclasses are expected to set cls.generator."""
    super(_LikelihoodCalculatorTest, cls).setUpClass()

    all_roots = []
    for lexeme in LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../../../resources/master_dictionary.txt')):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    cls.contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths, [
            WordRootFinder(cls.root_map),
            DigitNumeralRootFinder(),
            TextNumeralRootFinder(cls.root_map),
            ProperNounFromApostropheRootFinder(),
            ProperNounWithoutApostropheRootFinder(),
        ])

    cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
    cls.collection_map = {
        1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
        2: cls.mongodb_connection['trnltk']['wordBigrams999'],
        3: cls.mongodb_connection['trnltk']['wordTrigrams999']
    }

    cls.generator = None
def setUpClass(cls):
    """One-time setup: contextless parser plus a ContextlessDistributionCalculator
    backed by the unigram collection for cls.parseset_index."""
    all_roots = []
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   '../../../../../resources/master_dictionary.txt')
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    cls.contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths, root_finders)

    # Only unigrams are needed by this calculator.
    mongodb_connection = pymongo.Connection(host='127.0.0.1')
    collection_map = {
        1: mongodb_connection['trnltk']['wordUnigrams{}'.format(cls.parseset_index)]
    }

    database_index_builder = DatabaseIndexBuilder(collection_map)
    target_form_given_context_counter = TargetFormGivenContextCounter(collection_map)
    smoother = CachedContextlessDistributionSmoother()
    smoother.initialize()

    cls.calculator = ContextlessDistributionCalculator(
        database_index_builder, target_form_given_context_counter, smoother)
    cls.calculator.build_indexes()
def setUpClass(cls):
    """One-time setup: contextless parser over the master dictionary wrapped in
    a TransitionGenerator."""
    super(TransitionGeneratorTest, cls).setUpClass()

    all_roots = []
    for lexeme in LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt')):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    cls.parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths, [
            WordRootFinder(cls.root_map),
            DigitNumeralRootFinder(),
            TextNumeralRootFinder(cls.root_map),
            ProperNounFromApostropheRootFinder(),
            ProperNounWithoutApostropheRootFinder(),
        ])
    cls.transition_generator = TransitionGenerator(cls.parser)
def setUpClass(cls):
    """One-time setup: contextless parser plus a word concordance index over
    parseset 001, combined into a StatisticalParser."""
    super(StatisticalParserTest, cls).setUpClass()

    all_roots = []
    for lexeme in LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt')):
        all_roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(all_roots)

    # This test's graph has no ProperNounSuffixGraph layer.
    suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
    suffix_graph.initialize()

    predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
    predefined_paths.create_predefined_paths()

    contextless_parser = ContextlessMorphologicalParser(
        suffix_graph, predefined_paths, [
            WordRootFinder(cls.root_map),
            DigitNumeralRootFinder(),
            TextNumeralRootFinder(cls.root_map),
            ProperNounFromApostropheRootFinder(),
            ProperNounWithoutApostropheRootFinder(),
        ])

    parseset_index = "001"
    dom = parse(os.path.join(os.path.dirname(__file__),
                             '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])

    parse_set_word_list = []
    for sentence in parseset.sentences:
        parse_set_word_list.extend(sentence.words)
    complete_word_concordance_index = CompleteWordConcordanceIndex(parse_set_word_list)

    cls.parser = StatisticalParser(contextless_parser, complete_word_concordance_index)
def test_should_generate_with_progressive_vowel_drop(self):
    """ProgressiveVowelDrop verbs generate both the full root and the
    vowel-dropped root expecting a vowel-initial suffix."""
    lex = Lexeme(u"atamak", u"ata", SyntacticCategory.VERB, None,
                 {LexemeAttribute.ProgressiveVowelDrop})
    generated = RootGenerator.generate(lex)
    assert_that(generated, has_length(2))
    assert_that(generated, has_item(
        Root(u"ata", lex, None, {LLNotCont, LVB, LLV, LLNotVless, LVU})))
    assert_that(generated, has_item(
        Root(u"at", lex, {PhoneticExpectation.VowelStart},
             {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))