Esempio n. 1
0
def about_version(version=RuSentiFramesVersions.V20):
    """Print the lexicon version and return the ``__about`` report for it.

    :param version: RuSentiFrames version forwarded to
        ``RuSentiFramesCollection.read_collection`` (``V20`` by default).
    :return: the result of ``__about`` called with the loaded frames
        collection and a Mystem-based POS tagger.
    """
    # The POS tagger is built on the stemmer wrapper's own MystemInstance.
    mystem = MystemWrapper()
    tagger = POSMystemWrapper(mystem.MystemInstance)

    collection = RuSentiFramesCollection.read_collection(version=version)

    print("Lexicon version:", version)
    return __about(frames_collection=collection, pos_tagger=tagger)
Esempio n. 2
0
    def __read_synonyms_collection(self):
        """Load the synonyms collection for this object's RuSentRel version.

        A fresh ``MystemWrapper`` is created on every call and handed to the
        provider as the stemmer.

        :return: the value returned by
            ``RuSentRelSynonymsCollectionProvider.load_collection``.
        """
        mystem = MystemWrapper()
        rusentrel_version = self.__rusentrel_version
        return RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=mystem,
            version=rusentrel_version)
    def __init__(self, init_word_embedding=True):
        """Set up the stemmer, synonyms, annotator, embedding and CV folding.

        :param init_word_embedding: when truthy, the word embedding is created
            via ``__create_word_embedding``; otherwise it is left as ``None``.
        """
        self.__stemmer = MystemWrapper()
        self.__synonym_collection = RuSentRelSynonymsCollection.load_collection(
            stemmer=self.__stemmer,
            is_read_only=True)
        self.__opinion_formatter = RuSentRelOpinionCollectionFormatter

        # Two-scale annotation is active. To switch to three-scale mode,
        # replace the initializer below with
        # self.__init_three_scale_neutral_annotator().
        self.__neutral_annotator = self.__init_two_scale_neutral_annotator()

        if init_word_embedding:
            self.__word_embedding = self.__create_word_embedding()
        else:
            self.__word_embedding = None

        self.__cv_folding_algorithm = \
            self.__init_sentence_based_cv_folding_algorithm()
Esempio n. 4
0
    def setUpClass(cls):
        """Create the shared fixtures: stemmer, formatter, synonyms and frames.

        Everything is stored on ``cls`` so that all tests reuse the same
        objects.
        """
        cls.stemmer = MystemWrapper()

        # The formatter gets its own Mystem (entire_input=False) for POS tagging.
        formatter_pos_tagger = POSMystemWrapper(Mystem(entire_input=False))
        cls.entities_formatter = RussianEntitiesCasedFormatter(
            pos_tagger=formatter_pos_tagger)

        cls.synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=cls.stemmer)

        cls.frames_collection = RuSentiFramesCollection.read_collection(
            version=RuSentiFramesVersions.V10)

        # Fill the variants collection from the frames; an already existing
        # variant is overwritten and does not raise an error.
        variants = FrameVariantsCollection()
        cls.unique_frame_variants = variants
        variants.fill_from_iterable(
            variants_with_id=cls.frames_collection.iter_frame_id_and_variants(),
            overwrite_existed_variant=True,
            raise_error_on_existed_variant=False)
Esempio n. 5
0
    def test(self):
        """Run the lemma-based frame variants parser on a sample and print it.

        Terms recognised as ``TextFrameVariant`` are printed as ``[value]``;
        every other term is printed unchanged.
        """
        variants = self.__create_frames_variants_collection()
        mystem = MystemWrapper()
        parser = LemmasBasedFrameVariantsParser(save_lemmas=False,
                                                stemmer=mystem,
                                                frame_variants=variants)

        # The pipeline item reads and rewrites the "src" entry of the context.
        context = PipelineContext(
            d={"src": "мы пытались его осудить но не получилось".split()})
        parser.apply(context)

        for term in context.provide("src"):
            if isinstance(term, TextFrameVariant):
                shown = "[{}]".format(term.Variant.get_value())
            else:
                shown = term
            print(shown)
Esempio n. 6
0
    def test_output_formatter(self):
        """Convert stored network output into opinion collections and print them.

        Builds the output-to-opinion-collections pipeline over the TSV storages
        referenced by this test case, runs it for every document id found in
        the opinions storage and prints a summary line per resulting
        collection.
        """
        mystem = MystemWrapper()
        synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=mystem)

        scaler = ThreeLabelScaler()

        # sample_storage = BaseRowsStorage.from_tsv(filepath=self.__input_samples_filepath)
        output_rows = BaseRowsStorage.from_tsv(filepath=self.__output_filepath)
        linkages_view = MultilableOpinionLinkagesView(labels_scaler=scaler,
                                                      storage=output_rows)

        opinion_rows = BaseRowsStorage.from_tsv(
            filepath=self.__input_opinions_filepath)
        opinion_view = BaseOpinionStorageView(opinion_rows)

        def create_collection():
            # Empty collection that fails on duplicates and on missing synonyms.
            return OpinionCollection(opinions=[],
                                     synonyms=synonyms,
                                     error_on_duplicates=True,
                                     error_on_synonym_end_missed=True)

        def iter_linkages(doc_id):
            return linkages_view.iter_opinion_linkages(
                doc_id=doc_id, opinions_view=opinion_view)

        pipeline = output_to_opinion_collections_pipeline(
            create_opinion_collection_func=create_collection,
            doc_ids_set={1},
            labels_scaler=scaler,
            iter_opinion_linkages_func=iter_linkages,
            label_calc_mode=LabelCalculationMode.AVERAGE)

        found_doc_ids = set(
            opinion_rows.iter_column_values(column_name=const.DOC_ID,
                                            dtype=int))
        print(found_doc_ids)

        context = PipelineContext({"src": found_doc_ids})

        # Execute the pipeline; results are read back from the "src" entry.
        pipeline.run(context)

        for doc_id, collection in context.provide("src"):
            summary = "d:{}, ct:{}, count:{}".format(doc_id,
                                                     type(collection),
                                                     len(collection))
            print(summary)
    def __init__(self):
        """Parse every RuSentRel document and keep the results in a collection.

        A single stemmer and a read-only synonyms collection are shared by all
        documents; each parsed document is added to the internal
        ``ParsedNewsCollection``.
        """
        self.__stemmer = MystemWrapper()
        self.__synonyms = RuSentRelSynonymsCollection.read_collection(
            stemmer=self.__stemmer,
            is_read_only=True)
        self.__pnc = ParsedNewsCollection()

        for news_id in RuSentRelIOUtils.iter_collection_indices():
            # Entities are read first since the document reader requires them.
            doc_entities = RuSentRelDocumentEntityCollection.read_collection(
                doc_id=news_id,
                stemmer=self.__stemmer,
                synonyms=self.__synonyms)

            document = RuSentRelNews.read_document(doc_id=news_id,
                                                   entities=doc_entities)

            parsed = RuSentRelParsedNewsHelper.create_parsed_news(
                rusentrel_news_id=news_id,
                rusentrel_news=document,
                keep_tokens=False,
                stemmer=self.__stemmer)
            self.__pnc.add(parsed)
    def test(self):
        """Evaluate zipped test results against the etalon and print the scores.

        For every RuSentRel V11 test document the etalon opinions (extended
        with neutral ones) are compared with the predicted opinions by a
        ``ThreeClassEvaluator``. Per-document results are printed first,
        followed by the total result.
        """
        stemmer = MystemWrapper()
        actual_synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=stemmer, version=RuSentRelVersions.V11)

        result = calculate_results(
            doc_ids=RuSentRelIOUtils.iter_test_indices(RuSentRelVersions.V11),
            evaluator=ThreeClassEvaluator(DataType.Test),
            # Etalon side: strict collection (errors on duplicates and on
            # missing synonyms) filled with etalon and neutral opinions.
            iter_etalon_opins_by_doc_id_func=lambda doc_id:
            create_etalon_with_neutral(
                collection=OpinionCollection(None,
                                             synonyms=actual_synonyms,
                                             error_on_duplicates=True,
                                             error_on_synonym_end_missed=True),
                etalon_opins=RuSentRelOpinionCollection.iter_opinions_from_doc(
                    doc_id=doc_id),
                neut_opins=CustomZippedResultsIOUtils.iter_doc_opinions(
                    doc_id=doc_id,
                    result_version=Results.Etalon,
                    labels_fmt=CustomRuSentRelLabelsFormatter(),
                    opin_path_fmt=u"art{doc_id}.neut.Test.txt")),
            # Result side: tolerant collection, since predicted opinions may
            # contain duplicates or values missing from the synonyms.
            iter_result_opins_by_doc_id_func=lambda doc_id: OpinionCollection(
                opinions=CustomZippedResultsIOUtils.iter_doc_opinions(
                    doc_id=doc_id,
                    labels_fmt=RuSentRelLabelsFormatter(),
                    opin_path_fmt=u"{doc_id}.opin.txt",
                    result_version=Results.Test),
                synonyms=actual_synonyms,
                error_on_duplicates=False,
                error_on_synonym_end_missed=False))

        # Log all the result information.
        # print is called with a single parenthesised argument so that the
        # code is valid (and prints the same text) on Python 2 and Python 3.
        for doc_id, doc_info in result.iter_document_results():
            print(u"{}:\t{}".format(doc_id, doc_info))

        print("------------------------")
        print(str(result.TotalResult))
        print("------------------------")
Esempio n. 9
0
    def test_parsing(self):
        """Parse every RuSentRel V11 document and show its terms for debugging.

        The text parser pipeline extracts entities, tokenizes the text, finds
        lemma-based frame variants and finally applies sentiment negation to
        them.
        """
        # Enable debug output for this module's logger and the root logger.
        logging.getLogger(__name__).setLevel(logging.DEBUG)
        logging.basicConfig(level=logging.DEBUG)

        mystem = MystemWrapper()

        # Frames and their variants; an existing variant is overwritten
        # without raising an error.
        frames = RuSentiFramesCollection.read_collection(
            version=RuSentiFramesVersions.V20)
        variants = FrameVariantsCollection()
        variants.fill_from_iterable(
            variants_with_id=frames.iter_frame_id_and_variants(),
            overwrite_existed_variant=True,
            raise_error_on_existed_variant=False)

        pipeline_items = [
            RuSentRelTextEntitiesParser(),
            DefaultTextTokenizer(keep_tokens=True),
            LemmasBasedFrameVariantsParser(frame_variants=variants,
                                           stemmer=mystem,
                                           save_lemmas=False),
            FrameVariantsSentimentNegation(),
        ]
        text_parser = BaseTextParser(pipeline=pipeline_items)

        synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=mystem)

        rusentrel_version = RuSentRelVersions.V11
        for doc_id in RuSentRelIOUtils.iter_collection_indices(rusentrel_version):
            document = RuSentRelNews.read_document(doc_id=doc_id,
                                                   synonyms=synonyms,
                                                   version=rusentrel_version)

            parsed = NewsParser.parse(news=document, text_parser=text_parser)
            debug_show_news_terms(parsed_news=parsed)
Esempio n. 10
0
    def test_rusentrel_news_text_parsing(self):
        """Parse RuSentRel V11 document 1 and print each parsed text.

        The parser only extracts entities and tokenizes the text; the result
        is expected to be a ``ParsedNews`` instance.
        """
        rusentrel_version = RuSentRelVersions.V11

        pipeline_items = [RuSentRelTextEntitiesParser(),
                          DefaultTextTokenizer(keep_tokens=True)]
        text_parser = BaseTextParser(pipeline=pipeline_items)

        mystem = MystemWrapper()
        synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=mystem,
            version=rusentrel_version)
        document = RuSentRelNews.read_document(doc_id=1,
                                               synonyms=synonyms,
                                               version=rusentrel_version)

        # Parsing is delegated to the external text parser built above.
        parsed_news = NewsParser.parse(news=document, text_parser=text_parser)

        for text in parsed_news:
            self.__print_parsed_text(text)

        assert isinstance(parsed_news, ParsedNews)
class StemmerArg(BaseArg):
    """Command-line ``--stemmer`` option that maps a stemmer name to an instance."""

    # Name of the stemmer used when the option is absent or has no value.
    default = u"mystem"

    # Supported stemmer names mapped to ready-to-use instances.
    # NOTE: the instance is created once, when the class body is executed.
    supported = {
        u"mystem": MystemWrapper()
    }

    def __init__(self):
        pass

    @staticmethod
    def read_argument(args):
        """Return the stemmer instance registered under ``args.stemmer``.

        :param args: parsed arguments namespace providing a ``stemmer`` value.
        :raises KeyError: if the name is not a key of ``supported``.
        """
        return StemmerArg.supported[args.stemmer]

    @staticmethod
    def add_argument(parser):
        """Register the ``--stemmer`` option on the given argument parser.

        :param parser: parser object exposing ``add_argument``.
        """
        parser.add_argument('--stemmer',
                            dest='stemmer',
                            type=unicode,
                            choices=list(StemmerArg.supported.keys()),
                            default=StemmerArg.default,
                            # With nargs='?' a bare ``--stemmer`` stores
                            # ``const``; without it the value would be None
                            # and read_argument would fail with KeyError.
                            const=StemmerArg.default,
                            nargs='?',
                            help='Stemmer (Default: {})'.format(StemmerArg.default))
Esempio n. 12
0
    DistanceInTermsBetweenAttitudeEndsArg.add_argument(parser)

    # Parsing arguments.
    args = parser.parse_args()

    # Reading arguments.
    embedding_filepath = RusVectoresEmbeddingFilepathArg.read_argument(args)
    exp_type = ExperimentTypeArg.read_argument(args)
    labels_count = LabelsCountArg.read_argument(args)
    terms_per_context = TermsPerContextArg.read_argument(args)
    entity_fmt = EntityFormatterTypesArg.read_argument(args)
    stemmer = StemmerArg.read_argument(args)
    use_balancing = UseBalancingArg.read_argument(args)
    dist_in_terms_between_attitude_ends = DistanceInTermsBetweenAttitudeEndsArg.read_argument(
        args)
    pos_tagger = POSMystemWrapper(MystemWrapper().MystemInstance)

    synonyms_collection = RuSentRelSynonymsCollectionProvider.load_collection(
        stemmer=stemmer)

    annot_algo = PairBasedAnnotationAlgorithm(
        dist_in_terms_bound=None,
        label_provider=PairSingleLabelProvider(
            label_instance=ExperimentNeutralLabel()))

    exp_name = Common.create_exp_name(rusentrel_version=rusentrel_version,
                                      ra_version=ra_version,
                                      folding_type=folding_type)

    extra_name_suffix = Common.create_exp_name_suffix(
        use_balancing=use_balancing,
Esempio n. 13
0
def run_data_serialization_pipeline(sentences, terms_per_context,
                                    entities_parser, embedding_path,
                                    entity_fmt_type, stemmer):
    """Prepare network input for raw sentences and return the experiment IO.

    The sentences are wrapped into a single document with id 0, parsed by a
    text-processing pipeline and passed to ``NetworkInputHelper.prepare``
    without balancing.

    :param sentences: list of sentence texts.
    :param terms_per_context: int, forwarded to the serialization context and
        to ``NetworkInputHelper.prepare``.
    :param entities_parser: ``BasePipelineItem`` used to extract entities, or
        ``None`` to use ``TextEntitiesParser``.
    :param embedding_path: str, path of a binary word2vec-format embedding.
    :param entity_fmt_type: ``EntityFormatterTypes`` value selecting the
        entity formatter.
    :param stemmer: stemmer shared by the synonyms collection, the frame
        variants parser, the embedding and the serialization context.
    :return: ``ExperimentIO`` of the created experiment.
    """
    assert isinstance(sentences, list)
    assert entities_parser is None or isinstance(entities_parser,
                                                 BasePipelineItem)
    assert isinstance(terms_per_context, int)
    assert isinstance(embedding_path, str)
    assert isinstance(entity_fmt_type, EntityFormatterTypes)

    # The only label is NoLabel, mapped to 0 in both dictionaries.
    scaler = BaseLabelScaler(uint_dict=OrderedDict([(NoLabel(), 0)]),
                             int_dict=OrderedDict([(NoLabel(), 0)]))
    provider = MultipleLabelProvider(label_scaler=scaler)

    news_sentences = [BaseNewsSentence(text) for text in sentences]

    pair_annotation = PairBasedAnnotationAlgorithm(
        dist_in_terms_bound=None,
        label_provider=PairSingleLabelProvider(label_instance=NoLabel()))

    frames = create_frames_collection()
    variants = create_and_fill_variant_collection(frames)

    # Synonyms are needed both for entity grouping and by the experiment.
    synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
        stemmer=stemmer,
        version=RuSentRelVersions.V11)

    # The whole input becomes one document with id 0.
    document = News(doc_id=0, sentences=news_sentences)

    pipeline_items = [
        TermsSplitterParser(),
        TextEntitiesParser() if entities_parser is None else entities_parser,
        EntitiesGroupingPipelineItem(synonyms.get_synonym_group_index),
        DefaultTextTokenizer(keep_tokens=True),
        FrameVariantsParser(frame_variants=variants),
        LemmasBasedFrameVariantsParser(save_lemmas=False,
                                       stemmer=stemmer,
                                       frame_variants=variants),
        FrameVariantsSentimentNegation(),
    ]
    parser = BaseTextParser(pipeline=pipeline_items)

    word_embedding = RusvectoresEmbedding.from_word2vec_format(
        filepath=embedding_path,
        binary=True)
    word_embedding.set_stemmer(stemmer)

    # Only document 0 is folded, and only for the Test data type.
    context = RuSentRelExperimentSerializationContext(
        labels_scaler=provider.LabelScaler,
        stemmer=stemmer,
        embedding=word_embedding,
        annotator=DefaultAnnotator(annot_algo=pair_annotation),
        terms_per_context=terms_per_context,
        str_entity_formatter=create_entity_formatter(entity_fmt_type),
        pos_tagger=POSMystemWrapper(MystemWrapper().MystemInstance),
        name_provider=create_infer_experiment_name_provider(),
        data_folding=NoFolding(doc_ids_to_fold=[0],
                               supported_data_types=[DataType.Test]))

    formatter = StringLabelsFormatter(stol={"neu": NoLabel})

    io_utils = InferIOUtils(context)

    # The same formatter serves regular and neutral labels.
    document_ops = SingleDocOperations(exp_ctx=context,
                                       news=document,
                                       text_parser=parser)
    experiment = CustomExperiment(exp_io=io_utils,
                                  exp_ctx=context,
                                  doc_ops=document_ops,
                                  labels_formatter=formatter,
                                  synonyms=synonyms,
                                  neutral_labels_fmt=formatter)

    NetworkInputHelper.prepare(
        exp_ctx=experiment.ExperimentContext,
        exp_io=experiment.ExperimentIO,
        doc_ops=experiment.DocumentOperations,
        opin_ops=experiment.OpinionOperations,
        terms_per_context=terms_per_context,
        balance=False,
        value_to_group_id_func=synonyms.get_synonym_group_index)

    return experiment.ExperimentIO
Esempio n. 14
0
def create_stemmer():
    """Return a new ``MystemWrapper``, the only stemmer the experiment supports."""
    stemmer = MystemWrapper()
    return stemmer
Esempio n. 15
0
 def __create_stemmer():
     """Return a newly created ``MystemWrapper`` instance."""
     stemmer = MystemWrapper()
     return stemmer