Esempio n. 1
0
    def setUp(self):
        self.parseset_creator = ParseSetCreator()

        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map = (RootMapGenerator()).generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        self.parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])
class ParseSetCreatorWithSimpleParsesetsTest(unittest.TestCase):
    def setUp(self):
        self.parseset_creator = ParseSetCreator()

        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map = (RootMapGenerator()).generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder(
        )
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder(
        )

        self.parser = UpperCaseSupportingContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

    def test_should_create_parseset_001(self):
        self._create_parseset_n("001")

    def test_should_create_parseset_002(self):
        self._create_parseset_n("002")

    def test_should_create_parseset_003(self):
        self._create_parseset_n("003")

    def test_should_create_parseset_004(self):
        self._create_parseset_n("004")

    def test_should_create_parseset_005(self):
        self._create_parseset_n("005")

    def test_should_create_parseset_999(self):
        self._create_parseset_n("999")

    def _create_parseset_n(self, set_number):
        source_file_path = os.path.join(
            os.path.dirname(__file__),
            '../../testresources/simpleparsesets/simpleparseset{}.txt'.format(
                set_number))
        destination_file_path = os.path.join(
            os.path.dirname(__file__),
            '../../testresources/parsesets/parseset{}.xml'.format(set_number))

        line_index = 0
        sentences = []
        with codecs.open(source_file_path, mode='r', encoding='utf-8') as src:
            entries_for_sentence = []
            for line in src:
                print u'Processing line {}'.format(line_index)
                line_index += 1
                if not line:
                    continue
                elif line.startswith(END_OF_SENTENCE_MARKER):
                    sentence_binding = self.parseset_creator.create_sentence_binding_from_morpheme_containers(
                        entries_for_sentence)
                    sentences.append(sentence_binding)
                    entries_for_sentence = []
                elif line.startswith("#"):
                    continue
                else:
                    word_part = line[:line.find('=')].strip()
                    parse_result_part = line[line.find('=') + 1:].strip()

                    parse_result_matching_simple_parseset = self._find_parse_result_matching_simple_parseset(
                        word_part, parse_result_part)

                    entries_for_sentence.append(
                        (word_part, parse_result_matching_simple_parseset))

        parseset_binding = ParseSetBinding()
        parseset_binding.sentences = sentences
        parseset_dom = parseset_binding.to_dom()
        parseset_dom.setAttribute("xmlns", xmlbindings.NAMESPACE)
        with codecs.open(destination_file_path, mode='w',
                         encoding='utf-8') as output:
            output.write(PARSESET_HEADER)
            output.write('\n')
            output.write(parseset_dom.toprettyxml())

    def _find_parse_result_matching_simple_parseset(self, word_part,
                                                    parse_result_part):
        parse_results = self.parser.parse(word_part)

        for parse_result in parse_results:
            parse_result_str = formatter.format_morpheme_container_for_simple_parseset(
                parse_result)
            if parse_result_part == parse_result_str:
                return parse_result
            elif parse_result_str == modify_treebank_parse_result_strs_to_look_like_trnltk(
                    parse_result_part):
                return parse_result

        return None
Esempio n. 3
0
class ParseSetCreatorTest(unittest.TestCase):

    def setUp(self):
        self.parseset_creator = ParseSetCreator()

        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map = (RootMapGenerator()).generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        self.parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    def test_should_create_sentence_binding_from_morpheme_containers(self):
        morpheme_containers = []
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'blablabla'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'Kitaba', u'kitap+Noun+A3sg+Pnon+Dat'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'abcabcabc'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'buyurmam'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'yetiştirdik'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'Kıvrandığın', u'kıvran+Verb+Pos+Adj+PastPart+P2sg'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'sabahçı'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u'sabah'))
        morpheme_containers.append(self._get_word_morpheme_container_tuple(u"Ali'nin"))

        sentence = self.parseset_creator.create_sentence_binding_from_morpheme_containers(morpheme_containers)

        expected = u'''
<sentence>
	<unparsable_word str="blablabla"/>
	<word parse_result="kitap+Noun+A3sg+Pnon+Dat" str="Kitaba" syntactic_category="Noun">
		<root lemma="kitap" lemma_root="kitap" str="kitab" syntactic_category="Noun"/>
		<suffixes>
			<inflectionalSuffix actual="" application="" form="" id="A3Sg_Noun" matched_word="kitab" name="A3sg" to_syntactic_category="Noun" word="kitap"/>
			<inflectionalSuffix actual="" application="" form="" id="Pnon_Noun" matched_word="kitab" name="Pnon" to_syntactic_category="Noun" word="kitap"/>
			<inflectionalSuffix actual="a" application="a" form="+yA" id="Dat_Noun" matched_word="kitaba" name="Dat" to_syntactic_category="Noun" word="kitaba"/>
		</suffixes>
	</word>
	<unparsable_word str="abcabcabc"/>
	<word parse_result="buyur+Verb+Neg+Aor+A1sg" str="buyurmam" syntactic_category="Verb">
		<root lemma="buyurmak" lemma_root="buyur" str="buyur" syntactic_category="Verb"/>
		<suffixes>
			<inflectionalSuffix actual="ma" application="ma" form="mA" id="Neg" matched_word="buyurma" name="Neg" to_syntactic_category="Verb" word="buyurma"/>
			<inflectionalSuffix actual="" application="" form="" id="Aor" matched_word="buyurma" name="Aor" to_syntactic_category="Verb" word="buyurma"/>
			<inflectionalSuffix actual="m" application="m" form="+Im" id="A1Sg_Verb" matched_word="buyurmam" name="A1sg" to_syntactic_category="Verb" word="buyurmam"/>
		</suffixes>
	</word>
	<word parse_result="yetiş+Verb+Verb+Caus+Pos+Past+A1pl" str="yetiştirdik" syntactic_category="Verb">
		<root lemma="yetişmek" lemma_root="yetiş" str="yetiş" syntactic_category="Verb"/>
		<suffixes>
			<derivationalSuffix actual="tir" application="tir" form="dIr" id="Caus" matched_word="yetiştir" name="Caus" to_syntactic_category="Verb" word="yetiştir"/>
			<inflectionalSuffix actual="" application="" form="" id="Pos" matched_word="yetiştir" name="Pos" to_syntactic_category="Verb" word="yetiştir"/>
			<inflectionalSuffix actual="di" application="di" form="dI" id="Past" matched_word="yetiştirdi" name="Past" to_syntactic_category="Verb" word="yetiştirdi"/>
			<inflectionalSuffix actual="k" application="k" form="k" id="A1Pl_Verb" matched_word="yetiştirdik" name="A1pl" to_syntactic_category="Verb" word="yetiştirdik"/>
		</suffixes>
	</word>
	<word parse_result="kıvran+Verb+Pos+Adj+PastPart+P2sg" str="Kıvrandığın" syntactic_category="Adj">
		<root lemma="kıvranmak" lemma_root="kıvran" str="kıvran" syntactic_category="Verb"/>
		<suffixes>
			<inflectionalSuffix actual="" application="" form="" id="Pos" matched_word="kıvran" name="Pos" to_syntactic_category="Verb" word="kıvran"/>
			<derivationalSuffix actual="dığ" application="dık" form="dIk" id="PastPart_Adj" matched_word="kıvrandığ" name="PastPart" to_syntactic_category="Adj" word="kıvrandık"/>
			<inflectionalSuffix actual="ın" application="ın" form="+In" id="P2Sg_Adj" matched_word="kıvrandığın" name="P2sg" to_syntactic_category="Adj" word="kıvrandığın"/>
		</suffixes>
	</word>
	<word parse_result="sabah+Noun+Time+A3sg+Pnon+Nom+Adj+Agt" str="sabahçı" syntactic_category="Adj">
		<root lemma="sabah" lemma_root="sabah" secondary_syntactic_category="Time" str="sabah" syntactic_category="Noun"/>
		<suffixes>
			<inflectionalSuffix actual="" application="" form="" id="A3Sg_Noun" matched_word="sabah" name="A3sg" to_syntactic_category="Noun" word="sabah"/>
			<inflectionalSuffix actual="" application="" form="" id="Pnon_Noun" matched_word="sabah" name="Pnon" to_syntactic_category="Noun" word="sabah"/>
			<inflectionalSuffix actual="" application="" form="" id="Nom_Deriv_Noun" matched_word="sabah" name="Nom" to_syntactic_category="Noun" word="sabah"/>
			<derivationalSuffix actual="çı" application="çı" form="cI" id="Agt_Noun_to_Adj" matched_word="sabahçı" name="Agt" to_syntactic_category="Adj" word="sabahçı"/>
		</suffixes>
	</word>
	<word parse_result="sabah+Adv+Time" secondary_syntactic_category="Time" str="sabah" syntactic_category="Adv">
		<root lemma="sabah" lemma_root="sabah" secondary_syntactic_category="Time" str="sabah" syntactic_category="Adv"/>
	</word>
	<word parse_result="Ali+Noun+Prop+Apos+A3sg+Pnon+Gen" secondary_syntactic_category="Prop" str="Ali'nin" syntactic_category="Noun">
		<root lemma="Ali" lemma_root="Ali" secondary_syntactic_category="Prop" str="Ali" syntactic_category="Noun"/>
		<suffixes>
			<inflectionalSuffix actual="'" application="'" form="'" id="Apos_Proper_Noun" matched_word="Ali'" name="Apos" to_syntactic_category="Noun" word="Ali'"/>
			<inflectionalSuffix actual="" application="" form="" id="A3Sg_Noun" matched_word="Ali'" name="A3sg" to_syntactic_category="Noun" word="Ali'"/>
			<inflectionalSuffix actual="" application="" form="" id="Pnon_Noun" matched_word="Ali'" name="Pnon" to_syntactic_category="Noun" word="Ali'"/>
			<inflectionalSuffix actual="nin" application="nin" form="+nIn" id="Gen_Noun" matched_word="Ali'nin" name="Gen" to_syntactic_category="Noun" word="Ali'nin"/>
		</suffixes>
	</word>
</sentence>
'''
        expected = expected.strip()
        actual = sentence.to_dom().toprettyxml().strip()

        if expected!=actual:
            for line in context_diff(expected.split('\n'), actual.split('\n'), "expected", "actual"):
                print line

        assert_that(expected, equal_to(sentence.to_dom().toprettyxml().strip()))

    def _get_word_morpheme_container_tuple(self, seq, expected_result=None):
        res = self.parser.parse(seq)

        if res:
            if expected_result:
                matching_containers = filter(lambda parse_result : formatter.format_morpheme_container_for_parseset(parse_result)==expected_result, res)
                if matching_containers:
                    return seq, matching_containers[0]
                else:
                    return seq, None
            else:
                return seq, res[0]
        else:
            return seq, None
class ParseSetCreatorWithSimpleParsesetsTest(unittest.TestCase):

    def setUp(self):
        self.parseset_creator = ParseSetCreator()

        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map = (RootMapGenerator()).generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        self.parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

    def test_should_create_parseset_001(self):
        self._create_parseset_n("001")

    def test_should_create_parseset_002(self):
        self._create_parseset_n("002")

    def test_should_create_parseset_003(self):
        self._create_parseset_n("003")

    def test_should_create_parseset_004(self):
        self._create_parseset_n("004")

    def test_should_create_parseset_005(self):
        self._create_parseset_n("005")

    def test_should_create_parseset_999(self):
        self._create_parseset_n("999")

    def _create_parseset_n(self, set_number):
        source_file_path = os.path.join(os.path.dirname(__file__), '../../testresources/simpleparsesets/simpleparseset{}.txt'.format(set_number))
        destination_file_path = os.path.join(os.path.dirname(__file__), '../../testresources/parsesets/parseset{}.xml'.format(set_number))

        line_index = 0
        sentences = []
        with codecs.open(source_file_path, mode='r', encoding='utf-8') as src:
            entries_for_sentence = []
            for line in src:
                print u'Processing line {}'.format(line_index)
                line_index +=1
                if not line:
                    continue
                elif line.startswith(END_OF_SENTENCE_MARKER):
                    sentence_binding = self.parseset_creator.create_sentence_binding_from_morpheme_containers(entries_for_sentence)
                    sentences.append(sentence_binding)
                    entries_for_sentence = []
                elif line.startswith("#"):
                    continue
                else:
                    word_part = line[:line.find('=')].strip()
                    parse_result_part = line[line.find('=')+1:].strip()

                    parse_result_matching_simple_parseset = self._find_parse_result_matching_simple_parseset(word_part, parse_result_part)

                    entries_for_sentence.append((word_part, parse_result_matching_simple_parseset))

        parseset_binding = ParseSetBinding()
        parseset_binding.sentences = sentences
        parseset_dom = parseset_binding.to_dom()
        parseset_dom.setAttribute("xmlns", xmlbindings.NAMESPACE)
        with codecs.open(destination_file_path, mode='w', encoding='utf-8') as output:
            output.write(PARSESET_HEADER)
            output.write('\n')
            output.write(parseset_dom.toprettyxml())


    def _find_parse_result_matching_simple_parseset(self, word_part, parse_result_part):
        parse_results = self.parser.parse(word_part)

        for parse_result in parse_results:
            parse_result_str = formatter.format_morpheme_container_for_simple_parseset(parse_result)
            if parse_result_part==parse_result_str:
                return parse_result
            elif parse_result_str==modify_treebank_parse_result_strs_to_look_like_trnltk(parse_result_part):
                return parse_result

        return None