Beispiel #1
0
    def setUpClass(cls):
        """Build the class-wide parser fixture.

        Reads the lexemes from ``master_dictionary.txt``, generates their roots
        and stores the resulting root map on ``cls.root_map``. A contextless
        parser is then assembled from the suffix graph, the predefined paths
        and the root finders, and combined with a concordance index over the
        words of parse set "001" into ``cls.parser``.
        """
        super(StatisticalParserTest, cls).setUpClass()

        here = os.path.dirname(__file__)

        # Collect the roots generated for every lexeme of the dictionary.
        dictionary_path = os.path.join(
            here, '../../resources/master_dictionary.txt')
        all_roots = [
            root
            for lexeme in LexiconLoader.load_from_file(dictionary_path)
            for root in RootGenerator.generate(lexeme)
        ]
        cls.root_map = RootMapGenerator().generate(all_roots)

        suffix_graph = CopulaSuffixGraph(
            NumeralSuffixGraph(BasicSuffixGraph()))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        # The finders are handed to the parser in this order.
        root_finders = [
            WordRootFinder(cls.root_map),
            DigitNumeralRootFinder(),
            TextNumeralRootFinder(cls.root_map),
            ProperNounFromApostropheRootFinder(),
            ProperNounWithoutApostropheRootFinder(),
        ]
        contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, root_finders)

        # Flatten the words of every sentence in the parse set document.
        parseset_index = "001"
        parseset_path = os.path.join(
            here,
            '../../testresources/parsesets/parseset{}.xml'.format(
                parseset_index))
        parseset_node = parse(parseset_path).getElementsByTagName(
            "parseset")[0]
        parseset = ParseSetBinding.build(parseset_node)
        corpus_words = [
            word for sentence in parseset.sentences for word in sentence.words
        ]

        cls.parser = StatisticalParser(
            contextless_parser, CompleteWordConcordanceIndex(corpus_words))
Beispiel #2
0
    def _validate_complete_word_concordance_indexes(self, word_list):
        """Check that every indexed offset points at a matching word.

        Builds a ``CompleteWordConcordanceIndex`` over ``word_list`` and walks
        the keys of its internal nested index at three levels: word string,
        syntactic category, and secondary syntactic category. At each level it
        asserts that all words found at the offsets returned by
        ``idx.offsets(...)`` carry the values used for the lookup.

        :param word_list: indexable collection of words exposing ``str``,
            ``syntactic_category`` and ``secondary_syntactic_category``.
        """
        idx = CompleteWordConcordanceIndex(word_list)

        # Iterate the mappings directly instead of calling ``iterkeys()``,
        # which exists only on Python 2 dicts; direct iteration yields the
        # same keys on both Python 2 and Python 3.
        indices = idx._offsets._indices

        # Level 1: lookup by word string only.
        for complete_word in indices:
            offsets = idx.offsets(complete_word)
            words = [word_list[offset] for offset in offsets]
            assert_that(all(word.str == complete_word for word in words))

        # Level 2: lookup by word string and syntactic category.
        for complete_word in indices:
            for syntactic_category in indices[complete_word]:
                offsets = idx.offsets(complete_word, syntactic_category)
                words = [word_list[offset] for offset in offsets]
                assert_that(
                    all(word.str == complete_word
                        and word.syntactic_category == syntactic_category
                        for word in words))

        # Level 3: lookup by word string, syntactic category and secondary
        # syntactic category.
        for complete_word in indices:
            by_category = indices[complete_word]
            for syntactic_category in by_category:
                for secondary_syntactic_category in by_category[
                        syntactic_category]:
                    offsets = idx.offsets(complete_word, syntactic_category,
                                          secondary_syntactic_category)
                    words = [word_list[offset] for offset in offsets]
                    assert_that(
                        all(word.str == complete_word
                            and word.syntactic_category == syntactic_category
                            and word.secondary_syntactic_category
                            == secondary_syntactic_category
                            for word in words))
Beispiel #3
0
    def _validate_complete_word_concordance_indexes(self, word_list):
        """Check that every indexed offset points at a matching word.

        Builds a ``CompleteWordConcordanceIndex`` over ``word_list`` and walks
        the keys of its internal nested index at three levels: word string,
        syntactic category, and secondary syntactic category. At each level it
        asserts that all words found at the offsets returned by
        ``idx.offsets(...)`` carry the values used for the lookup.

        :param word_list: indexable collection of words exposing ``str``,
            ``syntactic_category`` and ``secondary_syntactic_category``.
        """
        idx = CompleteWordConcordanceIndex(word_list)

        # Iterate the mappings directly instead of calling ``iterkeys()``,
        # which exists only on Python 2 dicts; direct iteration yields the
        # same keys on both Python 2 and Python 3.
        indices = idx._offsets._indices

        # Level 1: lookup by word string only.
        for complete_word in indices:
            offsets = idx.offsets(complete_word)
            words = [word_list[offset] for offset in offsets]
            assert_that(all(word.str == complete_word for word in words))

        # Level 2: lookup by word string and syntactic category.
        for complete_word in indices:
            for syntactic_category in indices[complete_word]:
                offsets = idx.offsets(complete_word, syntactic_category)
                words = [word_list[offset] for offset in offsets]
                assert_that(all(word.str == complete_word
                                and word.syntactic_category == syntactic_category
                                for word in words))

        # Level 3: lookup by word string, syntactic category and secondary
        # syntactic category.
        for complete_word in indices:
            by_category = indices[complete_word]
            for syntactic_category in by_category:
                for secondary_syntactic_category in by_category[syntactic_category]:
                    offsets = idx.offsets(complete_word, syntactic_category,
                                          secondary_syntactic_category)
                    words = [word_list[offset] for offset in offsets]
                    assert_that(all(word.str == complete_word
                                    and word.syntactic_category == syntactic_category
                                    and word.secondary_syntactic_category == secondary_syntactic_category
                                    for word in words))
Beispiel #4
0
    def test_should_find_complete_word_concordance(self):
        """Verify the offsets returned for word / category lookups.

        Each entry pairs the positional arguments passed to ``idx.offsets``
        with the list of offsets expected back for ``self.word_list``.
        """
        idx = CompleteWordConcordanceIndex(self.word_list)

        expectations = (
            # A word that is not in the list yields no offsets.
            ((u'something',), []),

            ((u"o",), [0, 1, 2]),
            ((u"o", SyntacticCategory.PRONOUN), [0, 1]),
            ((u"o", SyntacticCategory.DETERMINER), [2]),
            ((u"o", SyntacticCategory.PRONOUN,
              SecondarySyntacticCategory.PERSONAL), [0]),
            ((u"o", SyntacticCategory.PRONOUN,
              SecondarySyntacticCategory.DEMONSTRATIVE), [1]),

            ((u"onu", SyntacticCategory.PRONOUN,
              SecondarySyntacticCategory.PERSONAL), [3]),
            ((u"onu", SyntacticCategory.PRONOUN,
              SecondarySyntacticCategory.DEMONSTRATIVE), [4]),

            ((u"gittim",), [6]),
            ((u"gittim", SyntacticCategory.VERB), [6]),

            ((u"giderim",), [7]),
            ((u"giderim", SyntacticCategory.VERB), [7]),

            ((u"gidecekler",), [8, 10]),
            ((u"gidecekler", SyntacticCategory.VERB), [8]),
            ((u"gidecekler", SyntacticCategory.NOUN), [10]),

            ((u"gideceğim",), [9, 11]),
            ((u"gideceğim", SyntacticCategory.VERB), [9]),
            ((u"gideceğim", SyntacticCategory.NOUN), [11]),
        )

        for query, expected_offsets in expectations:
            assert_that(idx.offsets(*query), equal_to(expected_offsets))
    def test_should_find_complete_word_concordance(self):
        """Verify the offsets returned for word / category lookups.

        Queries the index built from ``self.word_list`` by word string alone,
        by word string plus syntactic category, and by word string plus both
        categories, comparing each result with the expected offset list.
        """
        idx = CompleteWordConcordanceIndex(self.word_list)

        def expect_offsets(expected, *query):
            # ``query`` holds the positional arguments for ``idx.offsets``.
            assert_that(idx.offsets(*query), equal_to(expected))

        # A word that is not in the list yields no offsets.
        expect_offsets([], u'something')

        expect_offsets([0, 1, 2], u"o")
        expect_offsets([0, 1], u"o", SyntacticCategory.PRONOUN)
        expect_offsets([2], u"o", SyntacticCategory.DETERMINER)
        expect_offsets([0], u"o", SyntacticCategory.PRONOUN, SecondarySyntacticCategory.PERSONAL)
        expect_offsets([1], u"o", SyntacticCategory.PRONOUN, SecondarySyntacticCategory.DEMONSTRATIVE)

        expect_offsets([3], u"onu", SyntacticCategory.PRONOUN, SecondarySyntacticCategory.PERSONAL)
        expect_offsets([4], u"onu", SyntacticCategory.PRONOUN, SecondarySyntacticCategory.DEMONSTRATIVE)

        expect_offsets([6], u"gittim")
        expect_offsets([6], u"gittim", SyntacticCategory.VERB)

        expect_offsets([7], u"giderim")
        expect_offsets([7], u"giderim", SyntacticCategory.VERB)

        expect_offsets([8, 10], u"gidecekler")
        expect_offsets([8], u"gidecekler", SyntacticCategory.VERB)
        expect_offsets([10], u"gidecekler", SyntacticCategory.NOUN)

        expect_offsets([9, 11], u"gideceğim")
        expect_offsets([9], u"gideceğim", SyntacticCategory.VERB)
        expect_offsets([11], u"gideceğim", SyntacticCategory.NOUN)