def test_should_generate_with_last_vowel_drop(self):
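        # LastVowelDrop: u"ağız" loses its last vowel before a vowel-initial
        # suffix (ağız + ı -> ağzı), so two root variants are generated.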
        lexeme = Lexeme(u"ağız", u"ağız", SyntacticCategory.NOUN, None,
                        {LexemeAttribute.LastVowelDrop})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u"ağız", lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLCont, LVB, LLC, LLNotVless, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u"ağz", lexeme, {PhoneticExpectation.VowelStart},
                     {LLCont, LVB, LLC, LLNotVless, LVU})))

        lexeme = Lexeme(
            u"ahit", u"ahit", SyntacticCategory.NOUN, None,
            {LexemeAttribute.LastVowelDrop, LexemeAttribute.Voicing})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u"ahit", lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u"ahd", lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LVF, LLC, LLVless, LVU})))

    def test_should_generate_with_voicing(self):
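        # Voicing: the final voiceless stop voices before a vowel-initial
        # suffix (armut -> armudu, kapak -> kapağı, cenk -> cengi, kap -> kabı),
        # so each lexeme yields an unvoiced and a voiced root variant.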
        lexeme = Lexeme(u"armut", u"armut", SyntacticCategory.NOUN, None,
                        {LexemeAttribute.Voicing})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u'armut', lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVR})))
        assert_that(
            generated_roots,
            has_item(
                Root(u'armud', lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LVB, LLC, LLVless, LVR})))

        lexeme = Lexeme(u"kapak", u"kapak", SyntacticCategory.NOUN, None,
                        {LexemeAttribute.Voicing})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u'kapak', lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u'kapağ', lexeme, {PhoneticExpectation.VowelStart},
                     {LLCont, LVB, LLC, LLVless, LVU})))

        lexeme = Lexeme(u"cenk", u"cenk", SyntacticCategory.NOUN, None,
                        {LexemeAttribute.Voicing})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u'cenk', lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u'ceng', lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LVF, LLC, LLVless, LVU})))

        lexeme = Lexeme(u"kap", u"kap", SyntacticCategory.NOUN, None,
                        {LexemeAttribute.Voicing})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u'kap', lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u'kab', lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LVB, LLC, LLVless, LVU})))

    def test_should_generate_verbs_with_voicing_and_novoicing(self):
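        # Verbs carrying the Voicing attribute get a voiced variant
        # (git- -> gid-); unmarked verbs such as sürt- yield a single root.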
        lexeme = Lexeme(u"gitmek", u"git", SyntacticCategory.VERB, None,
                        {LexemeAttribute.Voicing})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u'git', lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u'gid', lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LVF, LLC, LLVless, LVU})))

        lexeme = Lexeme(u"sürtmek", u"sürt", SyntacticCategory.VERB, None,
                        None)
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(1))
        assert_that(
            generated_roots,
            has_item(
                Root(u'sürt', lexeme, None,
                     {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVR})))

    def test_should_generate_with_doubling(self):
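        # Doubling: the final consonant is doubled before a vowel-initial
        # suffix (hac -> haccı; ret -> reddi combines doubling with voicing).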
        lexeme = Lexeme(u"hac", u"hac", SyntacticCategory.NOUN, None,
                        {LexemeAttribute.Doubling})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u"hac", lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LLVStop, LVB, LLC, LLNotVless, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u"hacc", lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LLVStop, LVB, LLC, LLNotVless, LVU})))

        lexeme = Lexeme(u"ret", u"ret", SyntacticCategory.NOUN, None,
                        {LexemeAttribute.Voicing, LexemeAttribute.Doubling})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u"ret", lexeme, {PhoneticExpectation.ConsonantStart},
                     {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u"redd", lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LVF, LLC, LLVless, LVU})))

Example #5

    def test_should_generate_with_inverse_harmony(self):
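        # InverseHarmony: suffixes attach with front-vowel harmony despite the
        # back final vowel (kemal -> kemali), hence the single root carries LVF.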
        lexeme = Lexeme(u"kemal", u"kemal", SyntacticCategory.NOUN, None, {LexemeAttribute.InverseHarmony})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(1))
        assert_that(generated_roots, has_item(Root(u"kemal", lexeme, None, {LLCont, LVF, LLC, LLNotVless, LVU})))

        lexeme = Lexeme(u"kanaat", u"kanaat", SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing, LexemeAttribute.InverseHarmony})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(1))
        assert_that(generated_roots, has_item(Root(u"kanaat", lexeme, None, {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))

Example #6

    def test_should_generate_with_no_modifiers(self):
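        # No root-changing attribute: a single root equal to the lemma, with
        # phonetic attributes computed from its surface form.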
        lexeme = Lexeme(u"elma", u"elma", SyntacticCategory.NOUN, None, None)
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(1))
        assert_that(generated_roots, has_item(Root(u'elma', lexeme, None, {LLNotCont, LLV, LVB, LLNotVless, LVU})))

        lexeme = Lexeme(u"kek", u"kek", SyntacticCategory.NOUN, None, {LexemeAttribute.NoVoicing})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(1))
        assert_that(generated_roots, has_item(Root(u'kek', lexeme, None, {LLNotCont, LVF, LLC, LLVless, LLVlessStop, LVU})))

Example #10

    def setUp(self):
        self.parseset_creator = ParseSetCreator()

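        # Generate all root variants from the master dictionary and index them
        # into a root map for the root finders below.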
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map = (RootMapGenerator()).generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        self.parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

Example #11

    def setUpClass(cls):
        super(TransitionGeneratorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        cls.transition_generator = TransitionGenerator(cls.parser)

Example #12

    def setUpClass(cls):
        super(
            MorphemeContainerContextlessProbabilityGeneratorWithContainersTest,
            cls).setUpClass()
        all_roots = []

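        # Minimal in-memory lexicon; bracketed markers set lexeme attributes
        # ([A:...]) and the part of speech ([P:...]).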
        lexicon_lines = u'''
            duvar
            tutku
            saç
            oğul [A:LastVowelDrop]
            demek [A:RootChange, Passive_In, Passive_InIl]
            bu [P:Det]
        '''.strip().splitlines()

        lexemes = LexiconLoader.load_from_lines(lexicon_lines)
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = BasicSuffixGraph()
        suffix_graph.initialize()

        word_root_finder = WordRootFinder(cls.root_map)

        cls.contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, None, [word_root_finder])

Example #13

    def setUpClass(cls):
        super(_LikelihoodCalculatorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
        cls.collection_map = {
            1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
            2: cls.mongodb_connection['trnltk']['wordBigrams999'],
            3: cls.mongodb_connection['trnltk']['wordTrigrams999']
        }

        cls.generator = None

Example #15

    def setUpClass(cls):
        super(InterpolatingLikelihoodCalculatorCalculationContextTest,
              cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder(
        )
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder(
        )

        cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

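        # n-gram collections keyed by n-gram order: 1 = unigrams, 2 = bigrams,
        # 3 = trigrams.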
        mongodb_connection = pymongo.Connection(host='127.0.0.1')
        cls.collection_map = {
            1: mongodb_connection['trnltk']['wordUnigrams999'],
            2: mongodb_connection['trnltk']['wordBigrams999'],
            3: mongodb_connection['trnltk']['wordTrigrams999']
        }

        database_index_builder = DatabaseIndexBuilder(cls.collection_map)
        target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(
            cls.collection_map)
        ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother(
        )
        sequence_likelihood_calculator = UniformSequenceLikelihoodCalculator()

        wrapped_generator = ContextParsingLikelihoodCalculator(
            database_index_builder, target_form_given_context_counter,
            ngram_frequency_smoother, sequence_likelihood_calculator)

        cls.generator = InterpolatingLikelihoodCalculator(wrapped_generator)

    def setUpClass(cls):
        super(ParserTestWithExtendedGraph, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        cls._org_root_map = (RootMapGenerator()).generate(all_roots)

Example #17

    def setUpClass(cls):
        super(FormatterTest, cls).setUpClass()
        all_roots = []

        dictionary_content = ["kitap", "yapmak"]
        lexemes = LexiconLoader.load_from_lines(dictionary_content)
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        cls.root_map = RootMapGenerator().generate(all_roots)

    def create(cls, master_dictionary_path, ngram_collection_map):
        """
        @type master_dictionary_path: str or unicode
        @param ngram_collection_map: dict<int, Collection>
        @rtype ContextfulMorphologicalParser
        """
        all_roots = []

        lexemes = LexiconLoader.load_from_file(master_dictionary_path)
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        database_index_builder = DatabaseIndexBuilder(ngram_collection_map)
        target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(ngram_collection_map)
        ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
        sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

        collocation_metric_calculator = ContextParsingLikelihoodCalculator(database_index_builder,
            target_form_given_context_counter, ngram_frequency_smoother,
            sequence_likelihood_calculator)

        interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(collocation_metric_calculator)

        cached_contextless_distribution_smoother = CachedContextlessDistributionSmoother()
        contextless_distribution_metric_calculator = ContextlessDistributionCalculator(database_index_builder,
            target_form_given_context_counter, cached_contextless_distribution_smoother)

        contextful_likelihood_calculator = ContextfulLikelihoodCalculator(interpolating_collocation_metric_calculator,
            contextless_distribution_metric_calculator)

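        # SequenceLikelihoodCalculator was constructed with None above; inject the
        # contextful calculator now to close the circular dependency between them.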
        sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

        contextful_morphological_parser = ContextfulMorphologicalParser(contextless_parser,
            contextful_likelihood_calculator)

        return contextful_morphological_parser

Example #21

    def setUpClass(cls):
        super(StatisticalParserTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(
            BasicSuffixGraph()))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder(
        )
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder(
        )

        contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        parseset_index = "001"
        dom = parse(
            os.path.join(
                os.path.dirname(__file__),
                '../../testresources/parsesets/parseset{}.xml'.format(
                    parseset_index)))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])
        parse_set_word_list = []
        for sentence in parseset.sentences:
            parse_set_word_list.extend(sentence.words)

        complete_word_concordance_index = CompleteWordConcordanceIndex(
            parse_set_word_list)

        cls.parser = StatisticalParser(contextless_parser,
                                       complete_word_concordance_index)

    def create_calculator(cls, parseset_index):
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        mongodb_connection = pymongo.Connection(host='127.0.0.1')
        collection_map = {
            1: mongodb_connection['trnltk']['wordUnigrams{}'.format(parseset_index)],
            2: mongodb_connection['trnltk']['wordBigrams{}'.format(parseset_index)],
            3: mongodb_connection['trnltk']['wordTrigrams{}'.format(parseset_index)]
        }

        database_index_builder = DatabaseIndexBuilder(collection_map)
        target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(collection_map)
        ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
        sequence_likelihood_calculator = SequenceLikelihoodCalculator(None)

        collocation_metric_calculator = ContextParsingLikelihoodCalculator(database_index_builder, target_form_given_context_counter, ngram_frequency_smoother,
            sequence_likelihood_calculator)

        interpolating_collocation_metric_calculator = InterpolatingLikelihoodCalculator(collocation_metric_calculator)

        contextless_distribution_metric_calculator = ContextlessDistributionCalculator(database_index_builder, target_form_given_context_counter)

        contextful_likelihood_calculator = ContextfulLikelihoodCalculator(interpolating_collocation_metric_calculator, contextless_distribution_metric_calculator)

        sequence_likelihood_calculator._contextful_likelihood_calculator = contextful_likelihood_calculator

        return contextful_likelihood_calculator

    def test_should_generate_with_progressive_vowel_drop(self):
        # ProgressiveVowelDrop: a vowel-final verb stem drops its final vowel
        # before the vowel-initial progressive suffix (ata- -> atıyor), so a
        # second root u"at" expecting a vowel-initial suffix is generated.
        lexeme = Lexeme(u"atamak", u"ata", SyntacticCategory.VERB, None,
                        {LexemeAttribute.ProgressiveVowelDrop})
        generated_roots = RootGenerator.generate(lexeme)
        assert_that(generated_roots, has_length(2))
        assert_that(
            generated_roots,
            has_item(
                Root(u"ata", lexeme, None,
                     {LLNotCont, LVB, LLV, LLNotVless, LVU})))
        assert_that(
            generated_roots,
            has_item(
                Root(u"at", lexeme, {PhoneticExpectation.VowelStart},
                     {LLNotCont, LVB, LLC, LLVless, LLVlessStop, LVU})))

Example #25

    def setUpClass(cls):
        super(PredefinedPathsTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        cls.morpheme_container_map = {}

        cls.suffix_graph = BasicSuffixGraph()
        cls.suffix_graph.initialize()

    def setUpClass(cls):
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder,
             proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        mongodb_connection = pymongo.Connection(host='127.0.0.1')
        collection_map = {
            1: mongodb_connection['trnltk']['wordUnigrams{}'.format(cls.parseset_index)]
        }

        database_index_builder = DatabaseIndexBuilder(collection_map)
        target_form_given_context_counter = TargetFormGivenContextCounter(collection_map)
        smoother = CachedContextlessDistributionSmoother()
        smoother.initialize()

        cls.calculator = ContextlessDistributionCalculator(database_index_builder, target_form_given_context_counter, smoother)
        cls.calculator.build_indexes()