# Example #1
    def __test_parsing(self, ra_version):
        """Iterate RuAttitudes news of the given version, parse each document,
        and check that every rendered term of the first sentence is a string.
        """
        # Text-processing pipeline: entities first, then tokenization.
        pipeline = [RuSentRelTextEntitiesParser(),
                    DefaultTextTokenizer(keep_tokens=True)]
        parser = BaseTextParser(pipeline=pipeline)

        # Number of documents processed so far.
        processed = 0

        # NOTE: the lambda closes over `processed`, so each news item is
        # indexed with the running count at the moment it is read.
        news_iter = RuAttitudesCollection.iter_news(
            version=ra_version,
            get_news_index_func=lambda _: processed,
            return_inds_only=False)

        for news in tqdm(news_iter):

            parsed = NewsParser.parse(news=news, text_parser=parser)
            sentence_terms = parsed.iter_sentence_terms(sentence_index=0,
                                                        return_id=False)

            # Render every term: entities collapse to "E", tokens to their
            # token value, anything else is kept as-is.
            rendered = ["E" if isinstance(term, Entity)
                        else term.get_token_value() if isinstance(term, Token)
                        else term
                        for term in sentence_terms]

            for rendered_term in rendered:
                self.assertIsInstance(rendered_term, str)

            processed += 1
    def test_pipeline(self):
        """Run the NER pipeline over a single-sentence news and print the
        terms of its first sentence.
        """
        # Split into terms, then run BERT-based Ontonotes NER.
        pipeline = [TermsSplitterParser(),
                    BertOntonotesNERPipelineItem()]
        parser = BaseTextParser(pipeline)

        doc = News(doc_id=0, sentences=[BaseNewsSentence(self.text)])
        parsed = NewsParser.parse(news=doc, text_parser=parser)
        sentence_terms = parsed.iter_sentence_terms(sentence_index=0,
                                                    return_id=False)

        for term in sentence_terms:
            print(term)
# Example #3
def init_rusentrel_doc(doc_id, text_parser, synonyms):
    """Read a RuSentRel document, parse its text, and wrap its opinions.

    Args:
        doc_id (int): RuSentRel document identifier.
        text_parser (BaseTextParser): pipeline used to parse the news text.
        synonyms (SynonymsCollection): synonyms used both for reading the
            document and for grouping opinions.

    Returns:
        tuple: (news, parsed_news, opinion_collection).
    """
    assert isinstance(doc_id, int)
    assert isinstance(text_parser, BaseTextParser)
    assert isinstance(synonyms, SynonymsCollection)

    doc = RuSentRelNews.read_document(doc_id=doc_id, synonyms=synonyms)

    parsed_doc = NewsParser.parse(news=doc, text_parser=text_parser)

    doc_opinions = RuSentRelOpinionCollection.iter_opinions_from_doc(
        doc_id=doc_id)

    # Strict collection: duplicates and missed synonym ends raise errors.
    opinion_collection = OpinionCollection(
        opinions=doc_opinions,
        synonyms=synonyms,
        error_on_duplicates=True,
        error_on_synonym_end_missed=True)

    return doc, parsed_doc, opinion_collection
# Example #4
    def test_linked_text_opinion_extraction(self):
        """For every document, take its first document-level opinion,
        extract the linked text-level opinions, and print them.
        """
        # Logger setup.
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        logging.basicConfig(level=logging.DEBUG)

        # Entity-only parsing pipeline.
        parser = BaseTextParser(pipeline=[RuSentRelTextEntitiesParser()])

        synonyms = TestRuSentRel.__read_rusentrel_synonyms_collection()
        for news, opinions in self.__iter_by_docs(synonyms):

            logger.info("NewsID: {}".format(news.ID))

            # Take the very first document-level opinion as an example.
            first_opinion = opinions[0]
            assert isinstance(first_opinion, Opinion)

            print("'{src}'->'{tgt}'".format(src=first_opinion.SourceValue,
                                            tgt=first_opinion.TargetValue))

            parsed = NewsParser.parse(news=news, text_parser=parser)

            # Provider maps document-level opinions onto text-level ones.
            provider = TextOpinionPairsProvider(
                synonyms.get_synonym_group_index)
            provider.init_parsed_news(parsed)

            opinion_pairs = provider.iter_from_opinion(opinion=first_opinion)

            # Group the extracted text opinions into a single linkage.
            linkage = TextOpinionsLinkage(opinion_pairs)

            print("Linked opinions count: {}".format(len(linkage)))
            for text_opinion in linkage:
                assert isinstance(text_opinion, TextOpinion)
                label = text_opinion.Sentiment
                assert isinstance(label, Label)
                print("<{},{},{}>".format(text_opinion.SourceId,
                                          text_opinion.TargetId, str(label)))
# Example #5
    def test_ruattitudes_news_text_parsing(self):
        """Parse the Debug-version RuAttitudes news and print parsed texts."""
        # NOTE(review): `label_convereter` spelling follows the collection's
        # keyword API — do not "fix" it here.
        news_iter = RuAttitudesCollection.iter_news(
            version=RuAttitudesVersions.Debug,
            get_news_index_func=lambda _: 0,
            label_convereter=ExperimentRuAttitudesLabelConverter(),
            return_inds_only=False)

        parser = BaseTextParser(
            pipeline=[RuAttitudesTextEntitiesParser()])

        for news in news_iter:

            parsed = NewsParser.parse(news=news, text_parser=parser)
            assert isinstance(parsed, ParsedNews)

            # Iterating a ParsedNews yields its parsed sentence texts.
            for parsed_text in parsed:
                self.__print_parsed_text(parsed_text)
# Example #6
    def test_parsing(self):
        """Parse every RuSentRel V1.1 document with a frames-aware pipeline
        and dump the resulting terms for debugging.
        """
        # Logger setup.
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.DEBUG)
        logging.basicConfig(level=logging.DEBUG)

        # Stemmer used both for frame variants and synonyms loading.
        stemmer = MystemWrapper()

        # Frames collection and the variants derived from it.
        frames = RuSentiFramesCollection.read_collection(
            version=RuSentiFramesVersions.V20)
        frame_variants = FrameVariantsCollection()
        frame_variants.fill_from_iterable(
            variants_with_id=frames.iter_frame_id_and_variants(),
            overwrite_existed_variant=True,
            raise_error_on_existed_variant=False)

        # Pipeline: entities -> tokens -> frame variants -> negation.
        pipeline = [RuSentRelTextEntitiesParser(),
                    DefaultTextTokenizer(keep_tokens=True),
                    LemmasBasedFrameVariantsParser(
                        frame_variants=frame_variants,
                        stemmer=stemmer,
                        save_lemmas=False),
                    FrameVariantsSentimentNegation()]
        parser = BaseTextParser(pipeline=pipeline)

        # Reading synonyms collection.
        synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=stemmer)

        version = RuSentRelVersions.V11
        for doc_id in RuSentRelIOUtils.iter_collection_indices(version):

            news = RuSentRelNews.read_document(doc_id=doc_id,
                                               synonyms=synonyms,
                                               version=version)

            parsed = NewsParser.parse(news=news, text_parser=parser)
            debug_show_news_terms(parsed_news=parsed)
# Example #7
    def test_rusentrel_news_text_parsing(self):
        """Parse a single RuSentRel V1.1 document and print parsed texts."""
        version = RuSentRelVersions.V11

        # Entities followed by tokenization.
        pipeline = [RuSentRelTextEntitiesParser(),
                    DefaultTextTokenizer(keep_tokens=True)]
        parser = BaseTextParser(pipeline=pipeline)

        stemmer = MystemWrapper()
        synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=stemmer, version=version)
        news = RuSentRelNews.read_document(doc_id=1,
                                           synonyms=synonyms,
                                           version=version)

        parsed = NewsParser.parse(news=news, text_parser=parser)

        # Display result.
        for parsed_text in parsed:
            self.__print_parsed_text(parsed_text)

        assert isinstance(parsed, ParsedNews)
# Example #8
 def __parse_doc(self, doc_id):
     """Fetch the document with the given id and run the text parser on it."""
     doc = self.get_doc(doc_id=doc_id)
     return NewsParser.parse(news=doc, text_parser=self.__text_parser)
# Example #9
 def test_parse_sinle_string(self):
     """Tokenize a single pre-split sentence and dump its parsed terms.

     NOTE(review): "sinle" in the method name looks like a typo for
     "single"; kept as-is to avoid breaking external references.
     """
     sentence = "А контроль над этими провинциями — это господство над без малого половиной сирийской территории."
     tokenizer = BaseTextParser(pipeline=[DefaultTextTokenizer(keep_tokens=True)])
     doc = News(doc_id=0, sentences=[BaseNewsSentence(sentence.split())])
     parsed = NewsParser.parse(news=doc, text_parser=tokenizer)
     debug_show_news_terms(parsed_news=parsed)