Code example #1
    def setUp(self):
        self.parseset_creator = ParseSetCreator()

        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map = (RootMapGenerator()).generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        self.parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])
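
A minimal sketch (not part of the original class) of how a test alongside this setUp might exercise the parser; the sample word and assertion are assumptions, while parse() is the same entry point used in code example #9:

    def test_should_parse_sample_word(self):
        # Hypothetical input word; parse() returns a list of morpheme
        # containers, one per candidate analysis.
        results = self.parser.parse(u'kitaplar')
        assert len(results) > 0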
Code example #2
    def setUpClass(cls):
        super(InterpolatingLikelihoodCalculatorCalculationContextTest,
              cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        mongodb_connection = pymongo.Connection(host='127.0.0.1')
        cls.collection_map = {
            1: mongodb_connection['trnltk']['wordUnigrams999'],
            2: mongodb_connection['trnltk']['wordBigrams999'],
            3: mongodb_connection['trnltk']['wordTrigrams999']
        }

        database_index_builder = DatabaseIndexBuilder(cls.collection_map)
        target_form_given_context_counter = InMemoryCachingTargetFormGivenContextCounter(
            cls.collection_map)
        ngram_frequency_smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
        sequence_likelihood_calculator = UniformSequenceLikelihoodCalculator()

        wrapped_generator = ContextParsingLikelihoodCalculator(
            database_index_builder, target_form_given_context_counter,
            ngram_frequency_smoother, sequence_likelihood_calculator)

        cls.generator = InterpolatingLikelihoodCalculator(wrapped_generator)
Code example #3
    def setUpClass(cls):
        super(StatisticalParserTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(
            BasicSuffixGraph()))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        parseset_index = "001"
        dom = parse(
            os.path.join(
                os.path.dirname(__file__),
                '../../testresources/parsesets/parseset{}.xml'.format(
                    parseset_index)))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])
        parse_set_word_list = []
        for sentence in parseset.sentences:
            parse_set_word_list.extend(sentence.words)

        complete_word_concordance_index = CompleteWordConcordanceIndex(
            parse_set_word_list)

        cls.parser = StatisticalParser(contextless_parser,
                                       complete_word_concordance_index)
Code example #4
    def setUpClass(cls):
        super(ParserTestWithProperNouns, cls).setUpClass()
        cls.root_map = dict()

        suffix_graph = ProperNounSuffixGraph(BasicSuffixGraph())
        suffix_graph.initialize()

        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.parser = ContextlessMorphologicalParser(suffix_graph, None,
            [proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])
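
As a rough illustration (an assumption, not from the original test class), the proper-noun parser configured here could be driven the same way on an apostrophe-marked input, such as the ones covered in code example #8:

    def test_should_parse_apostrophe_proper_noun(self):
        # Hypothetical word; the two root finders above target exactly this
        # apostrophe-marked proper-noun form (compare code example #8).
        results = self.parser.parse(u"Ahmet'e")
        assert len(results) > 0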
Code example #5
    def setUpClass(cls):
        super(_LikelihoodCalculatorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../../../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        cls.mongodb_connection = pymongo.Connection(host='127.0.0.1')
        cls.collection_map = {
            1: cls.mongodb_connection['trnltk']['wordUnigrams999'],
            2: cls.mongodb_connection['trnltk']['wordBigrams999'],
            3: cls.mongodb_connection['trnltk']['wordTrigrams999']
        }

        cls.generator = None
Code example #6
    def setUpClass(cls):
        super(TransitionGeneratorTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        cls.parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        cls.transition_generator = TransitionGenerator(cls.parser)
Code example #7
    def setUp(self):
        self.root_finder = ProperNounFromApostropheRootFinder()
Code example #8
class ProperNounFromApostropheRootFinderTest(unittest.TestCase):

    def setUp(self):
        self.root_finder = ProperNounFromApostropheRootFinder()

    def test_should_recognize_abbreviations(self):
        roots = self.root_finder.find_roots_for_partial_input(u"TR'")
        assert_that(roots[0].str, equal_to(u'TR'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"MB'")
        assert_that(roots[0].str, equal_to(u'MB'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"POL'")
        assert_that(roots[0].str, equal_to(u'POL'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"KAFA1500'")
        assert_that(roots[0].str, equal_to(u'KAFA1500'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"1500KAFA'")
        assert_that(roots[0].str, equal_to(u'1500KAFA'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"İŞÇĞÜÖ'")
        assert_that(roots[0].str, equal_to(u'İŞÇĞÜÖ'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.ABBREVIATION))

        roots = self.root_finder.find_roots_for_partial_input(u"123'")
        assert_that(roots, has_length(0))


    def test_should_recognize_proper_nouns(self):
        roots = self.root_finder.find_roots_for_partial_input(u"Ahmet'")
        assert_that(roots[0].str, equal_to(u'Ahmet'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Mehmed'")
        assert_that(roots[0].str, equal_to(u'Mehmed'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"A123a'")
        assert_that(roots[0].str, equal_to(u'A123a'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"AvA'")
        assert_that(roots[0].str, equal_to(u'AvA'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"AAxxAA'")
        assert_that(roots[0].str, equal_to(u'AAxxAA'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"İstanbul'")
        assert_that(roots[0].str, equal_to(u'İstanbul'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Çanakkale'")
        assert_that(roots[0].str, equal_to(u'Çanakkale'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Ömer'")
        assert_that(roots[0].str, equal_to(u'Ömer'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Şaban'")
        assert_that(roots[0].str, equal_to(u'Şaban'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"Ümmühan'")
        assert_that(roots[0].str, equal_to(u'Ümmühan'))
        assert_that(roots[0].lexeme.secondary_syntactic_category, equal_to(SecondarySyntacticCategory.PROPER_NOUN))

        roots = self.root_finder.find_roots_for_partial_input(u"aaa'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"aAAAA'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"1aa'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"a111'")
        assert_that(roots, has_length(0))

        roots = self.root_finder.find_roots_for_partial_input(u"şaa'")
        assert_that(roots, has_length(0))
Code example #9
# This fragment of a standalone script starts mid-file: it assumes 'lexemes'
# and 'all_roots' were prepared beforehand, as in the setUp examples above
# (LexiconLoader.load_from_file plus an empty list).
for di in lexemes:
	all_roots.extend(CircumflexConvertingRootGenerator.generate(di))

root_map_generator = RootMapGenerator()
root_map = root_map_generator.generate(all_roots)

suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
suffix_graph.initialize()

predefined_paths = PredefinedPaths(root_map, suffix_graph)
predefined_paths.create_predefined_paths()

word_root_finder = WordRootFinder(root_map)
text_numeral_root_finder = TextNumeralRootFinder(root_map)
digit_numeral_root_finder = DigitNumeralRootFinder()
proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
	[word_root_finder, text_numeral_root_finder, digit_numeral_root_finder,
	 proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

# The sentence to analyse is read from the command line (Python 2 style
# decoding of the byte-string argument).
sentence = sys.argv[1].decode('utf-8')

for word in sentence.split():
	lst = parser.parse(word)
	root_set = set()
	for element in lst:
		# 'formatter' is assumed to be imported elsewhere in the script; the
		# root is the part of the formatted parse before the first '+'.
		formatted = formatter.format_morpheme_container_for_parseset(element)
		root = formatted[:formatted.index('+')]
		root_set.add(root.lower())
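	# Hypothetical continuation (the original fragment ends above): report
	# each word together with the distinct lower-cased roots collected for it.
	print u'{}: {}'.format(word, u', '.join(sorted(root_set)))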