def __test_parsing(self, ra_version):
    """Walk every document of the given RuAttitudes version, parse it,
    and verify that each rendered first-sentence term is a string."""
    # Processing pipeline: entity extraction first, then tokenization.
    pipeline = [RuSentRelTextEntitiesParser(),
                DefaultTextTokenizer(keep_tokens=True)]
    text_parser = BaseTextParser(pipeline=pipeline)

    def __render(term):
        # Entities collapse to a marker, tokens to their textual value,
        # anything else is passed through unchanged.
        if isinstance(term, Entity):
            return "E"
        if isinstance(term, Token):
            return term.get_token_value()
        return term

    # Iterating through the collection.
    news_read = 0
    news_it = RuAttitudesCollection.iter_news(
        version=ra_version,
        get_news_index_func=lambda _: news_read,
        return_inds_only=False)

    for news in tqdm(news_it):
        # Parse the document and render its first sentence.
        parsed_news = NewsParser.parse(news=news, text_parser=text_parser)
        terms_it = parsed_news.iter_sentence_terms(sentence_index=0,
                                                   return_id=False)

        for rendered in (__render(t) for t in terms_it):
            self.assertIsInstance(rendered, str)

        news_read += 1
def test_pipeline(self):
    """Run the NER pipeline over a one-sentence document and print terms."""
    ner_pipeline = [TermsSplitterParser(), BertOntonotesNERPipelineItem()]
    text_parser = BaseTextParser(ner_pipeline)

    news = News(doc_id=0, sentences=[BaseNewsSentence(self.text)])
    parsed_news = NewsParser.parse(news=news, text_parser=text_parser)

    # Dump every term of the first (and only) sentence.
    for term in parsed_news.iter_sentence_terms(sentence_index=0,
                                                return_id=False):
        print(term)
def init_rusentrel_doc(doc_id, text_parser, synonyms):
    """Read a RuSentRel document, parse its text, and wrap the document's
    opinions into a strict (duplicate- and synonym-checked) collection.

    Returns a (news, parsed_news, opinion_collection) triple.
    """
    assert (isinstance(doc_id, int))
    assert (isinstance(text_parser, BaseTextParser))
    assert (isinstance(synonyms, SynonymsCollection))

    news = RuSentRelNews.read_document(doc_id=doc_id, synonyms=synonyms)
    parsed_news = NewsParser.parse(news=news, text_parser=text_parser)

    # Strict collection: raise on duplicates and on missed synonym ends.
    collection = OpinionCollection(
        opinions=RuSentRelOpinionCollection.iter_opinions_from_doc(doc_id=doc_id),
        synonyms=synonyms,
        error_on_duplicates=True,
        error_on_synonym_end_missed=True)

    return news, parsed_news, collection
def test_linked_text_opinion_extraction(self):
    """For every document, take its first text-level opinion and extract
    the linked text-opinions it produces over the parsed text."""
    # Logger setup.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.DEBUG)

    # Text parser with entity extraction only.
    text_parser = BaseTextParser(pipeline=[RuSentRelTextEntitiesParser()])
    synonyms = TestRuSentRel.__read_rusentrel_synonyms_collection()

    for news, opinions in self.__iter_by_docs(synonyms):
        logger.info("NewsID: {}".format(news.ID))

        # Example: access a news text-level opinion.
        first_opinion = opinions[0]
        assert (isinstance(first_opinion, Opinion))
        print("'{src}'->'{tgt}'".format(src=first_opinion.SourceValue,
                                        tgt=first_opinion.TargetValue))

        # Parse the document text.
        parsed_news = NewsParser.parse(news=news, text_parser=text_parser)

        # Provider that maps a text-level opinion onto text opinions.
        provider = TextOpinionPairsProvider(synonyms.get_synonym_group_index)
        provider.init_parsed_news(parsed_news)

        # Collect the linkage for the first opinion.
        linkage = TextOpinionsLinkage(
            provider.iter_from_opinion(opinion=first_opinion))
        print("Linked opinions count: {}".format(len(linkage)))

        for text_opinion in linkage:
            assert (isinstance(text_opinion, TextOpinion))
            label = text_opinion.Sentiment
            assert (isinstance(label, Label))
            print("<{},{},{}>".format(text_opinion.SourceId,
                                      text_opinion.TargetId,
                                      str(label)))
def test_ruattitudes_news_text_parsing(self):
    """Parse the debug RuAttitudes collection and print each parsed text."""
    # Entity-extraction-only pipeline.
    text_parser = BaseTextParser(pipeline=[RuAttitudesTextEntitiesParser()])

    # NOTE: "label_convereter" is the external API's spelling of the
    # keyword argument and must stay as-is.
    news_it = RuAttitudesCollection.iter_news(
        version=RuAttitudesVersions.Debug,
        get_news_index_func=lambda _: 0,
        label_convereter=ExperimentRuAttitudesLabelConverter(),
        return_inds_only=False)

    for news in news_it:
        # Parse news via the external parser.
        parsed_news = NewsParser.parse(news=news, text_parser=text_parser)
        assert (isinstance(parsed_news, ParsedNews))

        # Display the result, sentence by sentence.
        for parsed_text in parsed_news:
            self.__print_parsed_text(parsed_text)
def test_parsing(self):
    """Parse every RuSentRel V11 document with the full pipeline
    (entities, tokens, frame variants, negation) and dump the terms."""
    # Logger setup.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(level=logging.DEBUG)

    # Stemmer used by both the synonyms provider and the frames parser.
    stemmer = MystemWrapper()

    # Frames and their variants.
    frames = RuSentiFramesCollection.read_collection(
        version=RuSentiFramesVersions.V20)
    frame_variants = FrameVariantsCollection()
    frame_variants.fill_from_iterable(
        variants_with_id=frames.iter_frame_id_and_variants(),
        overwrite_existed_variant=True,
        raise_error_on_existed_variant=False)

    # Full text-processing pipeline.
    pipeline = [
        RuSentRelTextEntitiesParser(),
        DefaultTextTokenizer(keep_tokens=True),
        LemmasBasedFrameVariantsParser(frame_variants=frame_variants,
                                       stemmer=stemmer,
                                       save_lemmas=False),
        FrameVariantsSentimentNegation(),
    ]
    text_parser = BaseTextParser(pipeline=pipeline)

    # Synonyms collection.
    synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
        stemmer=stemmer)

    version = RuSentRelVersions.V11
    for doc_id in RuSentRelIOUtils.iter_collection_indices(version):
        # Read and parse the document.
        news = RuSentRelNews.read_document(doc_id=doc_id,
                                           synonyms=synonyms,
                                           version=version)
        parsed_news = NewsParser.parse(news=news, text_parser=text_parser)
        debug_show_news_terms(parsed_news=parsed_news)
def test_rusentrel_news_text_parsing(self):
    """Parse RuSentRel document #1 and print every parsed sentence."""
    version = RuSentRelVersions.V11

    # Entity extraction followed by tokenization.
    pipeline = [RuSentRelTextEntitiesParser(),
                DefaultTextTokenizer(keep_tokens=True)]
    text_parser = BaseTextParser(pipeline=pipeline)

    stemmer = MystemWrapper()
    synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
        stemmer=stemmer, version=version)

    news = RuSentRelNews.read_document(doc_id=1,
                                       synonyms=synonyms,
                                       version=version)

    # Parse news via the external parser.
    parsed_news = NewsParser.parse(news=news, text_parser=text_parser)

    # Display the result.
    for parsed_text in parsed_news:
        self.__print_parsed_text(parsed_text)

    assert (isinstance(parsed_news, ParsedNews))
def __parse_doc(self, doc_id):
    """Fetch the document by id and return its parsed representation."""
    return NewsParser.parse(news=self.get_doc(doc_id=doc_id),
                            text_parser=self.__text_parser)
def test_parse_sinle_string(self):
    """Tokenize a single hard-coded Russian sentence and dump the terms."""
    text = "А контроль над этими провинциями — это господство над без малого половиной сирийской территории."

    tokenizer_pipeline = [DefaultTextTokenizer(keep_tokens=True)]
    parser = BaseTextParser(pipeline=tokenizer_pipeline)

    # NOTE(review): BaseNewsSentence receives text.split() (a list) here,
    # while elsewhere in this file it receives a plain string — confirm
    # the sentence type accepts both.
    sentence = BaseNewsSentence(text.split())
    news = News(doc_id=0, sentences=[sentence])

    parsed_news = NewsParser.parse(news=news, text_parser=parser)
    debug_show_news_terms(parsed_news=parsed_news)