Code example #1
def read_ruattitudes_in_memory(version, keep_doc_ids_only, doc_id_func):
    """ Performs reading of ruattitude formatted documents and
        selection according to 'doc_ids_set' parameter.
    """
    assert (isinstance(version, RuAttitudesVersions))
    assert (isinstance(keep_doc_ids_only, bool))
    assert (callable(doc_id_func))

    it = RuAttitudesCollection.iter_news(
        version=version,
        get_news_index_func=doc_id_func,
        label_convereter=ExperimentRuAttitudesLabelConverter(),
        return_inds_only=keep_doc_ids_only)

    it_formatted_and_logged = progress_bar_iter(
        iterable=__iter_id_with_news(news_it=it,
                                     keep_doc_ids_only=keep_doc_ids_only),
        desc="Loading RuAttitudes Collection [{}]".format(
            "doc ids only" if keep_doc_ids_only else "fully"),
        unit='docs')

    d = {}
    for doc_id, news in it_formatted_and_logged:
        d[doc_id] = news

    return d
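All of the examples on this page consume the `progress_bar_iter` helper, whose own source is not shown here. The following is only a minimal sketch of how such a wrapper could be written on top of `tqdm`, inferred from the keyword arguments used in the calls above (`iterable`, `desc`, `unit`); the actual AREkit implementation may differ.

# A minimal sketch (assumption), not the actual AREkit helper: wrap any
# iterable with a tqdm progress bar when the total length is unknown.
from tqdm import tqdm


def progress_bar_iter(iterable, desc=None, unit="it"):
    return tqdm(iterable, desc=desc, unit=unit)


if __name__ == "__main__":
    # Usage mirrors the examples above: wrap an iterator and consume it.
    total = sum(progress_bar_iter(range(1000), desc="Summing", unit="nums"))
    print(total)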
Code example #2
File: base.py Project: nicolay-r/AREkit
    def from_formatted_samples(cls,
                               formatted_samples_iter,
                               bag_size,
                               create_sample_func,
                               create_empty_sample_func,
                               shuffle,
                               desc=None):
        assert(isinstance(formatted_samples_iter, collections.Iterable))
        assert(isinstance(bag_size, int) and bag_size > 0)
        assert(callable(create_sample_func))
        assert(callable(create_empty_sample_func))
        assert(isinstance(shuffle, bool))

        bags = []

        linked_rows_iter = progress_bar_iter(iterable=formatted_samples_iter, desc=desc)

        for linked_rows in linked_rows_iter:
            assert(len(linked_rows) > 0)

            cls._fill_bags_list_with_linked_text_opinions(
                bags=bags,
                parsed_rows=[ParsedSampleRow.parse(row=row) for row in linked_rows],
                bag_size=bag_size,
                create_sample_func=create_sample_func,
                create_empty_sample_func=create_empty_sample_func)

        if shuffle:
            np.random.shuffle(bags)

        return cls(bags)
Code example #3
File: tsv_writer.py Project: nicolay-r/AREkit
    def write(self, title, contents_it):
        self.__write(title)

        wrapped_it = progress_bar_iter(iterable=contents_it,
                                       desc='Writing output',
                                       unit='rows')

        for contents in wrapped_it:
            self.__write(contents)
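For a self-contained illustration of the same writing pattern, the sketch below defines a hypothetical `TsvWriterSketch` with its own `__write`; the class name, stream handling, and tab-joined row format are assumptions for demonstration and do not come from AREkit's `tsv_writer.py`.

import sys

from tqdm import tqdm


class TsvWriterSketch(object):
    """ Hypothetical writer: a title line followed by tab-separated rows. """

    def __init__(self, target=sys.stdout):
        self.__target = target

    def __write(self, params):
        # Accept either a ready-made string or an iterable of cell values.
        line = params if isinstance(params, str) else "\t".join(map(str, params))
        self.__target.write(line + "\n")

    def write(self, title, contents_it):
        self.__write(title)
        # Same pattern as above: the rows iterator is wrapped with a progress bar.
        for contents in tqdm(contents_it, desc="Writing output", unit="rows"):
            self.__write(contents)


if __name__ == "__main__":
    TsvWriterSketch().write(title="id\tvalue",
                            contents_it=((i, i * i) for i in range(5)))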
Code example #4
File: test_ruattitudes.py Project: nicolay-r/AREkit
    def __test_iter_news_inds(self, ra_version):
        # iterating through collection
        doc_ids_it = RuAttitudesCollection.iter_news(version=ra_version,
                                                     get_news_index_func=lambda ind: ind + 1,
                                                     return_inds_only=True)

        it = progress_bar_iter(iterable=doc_ids_it,
                               desc="Extracting document ids",
                               unit="docs")

        print("Total documents count: {}".format(max(it)))
Code example #5
    def __iter_annotated_collections(self, data_type, doc_ops, opin_ops):
        assert(isinstance(doc_ops, DocumentOperations))
        assert(isinstance(opin_ops, OpinionOperations))

        logged_parsed_news_iter = progress_bar_iter(
            iterable=doc_ops.iter_parsed_docs(doc_ops.iter_tagget_doc_ids(BaseDocumentTag.Annotate)),
            desc="Annotating parsed news [{}]".format(data_type))

        for parsed_news in logged_parsed_news_iter:
            assert(isinstance(parsed_news, ParsedNews))
            yield parsed_news.RelatedDocID, \
                  self._annot_collection_core(parsed_news=parsed_news, data_type=data_type, opin_ops=opin_ops)
Code example #6
def calculate_results(doc_ids, evaluator,
                      iter_etalon_opins_by_doc_id_func,
                      iter_result_opins_by_doc_id_func):
    """ Provides f1 (neg, pos, neu) calculation by given enumerations of
        etalon and results opinions for a particular document (doc_id).
    """
    assert(isinstance(evaluator, ThreeClassEvaluator))
    assert(callable(iter_etalon_opins_by_doc_id_func))
    assert(callable(iter_result_opins_by_doc_id_func))

    cmp_pairs_iter = OpinionCollectionsToCompareUtils.iter_comparable_collections(
        doc_ids=doc_ids,
        read_etalon_collection_func=lambda doc_id: iter_etalon_opins_by_doc_id_func(doc_id),
        read_result_collection_func=lambda doc_id: iter_result_opins_by_doc_id_func(doc_id))

    # evaluate every document.
    logged_cmp_pairs_it = progress_bar_iter(cmp_pairs_iter, desc=u"Evaluate", unit=u'pairs')
    return evaluator.evaluate(cmp_pairs=logged_cmp_pairs_it)
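The comparable-pairs pattern above (pair the etalon and result opinions for each document id, then evaluate the pairs behind a progress bar) can be shown with a simplified, self-contained sketch; the opinion sets and the naive match ratio below are hypothetical stand-ins for `OpinionCollectionsToCompareUtils` and the evaluator, not their real behavior.

from tqdm import tqdm


def iter_comparable_collections(doc_ids, read_etalon_func, read_result_func):
    # Pair the etalon and result collections per document id.
    for doc_id in doc_ids:
        yield read_etalon_func(doc_id), read_result_func(doc_id)


# Hypothetical per-document opinion sets keyed by document id.
etalon_opinions = {1: {("A", "B", "pos")}, 2: {("C", "D", "neg")}}
result_opinions = {1: {("A", "B", "pos")}, 2: {("C", "D", "pos")}}

cmp_pairs_iter = iter_comparable_collections(
    doc_ids=sorted(etalon_opinions),
    read_etalon_func=etalon_opinions.__getitem__,
    read_result_func=result_opinions.__getitem__)

# Evaluate every document; here evaluation is just the share of exact matches.
matched, total = 0, 0
for gold, predicted in tqdm(cmp_pairs_iter, desc="Evaluate", unit="pairs"):
    matched += len(gold & predicted)
    total += len(gold)
print("matched share: {:.2f}".format(matched / total))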
Code example #7
File: base.py Project: nicolay-r/AREkit
    def fill(self, iter_rows_func, columns_provider, desc=""):
        assert (callable(iter_rows_func))
        assert (isinstance(columns_provider, BaseColumnsProvider))

        logged_rows_it = progress_bar_iter(iterable=iter_rows_func(True),
                                           desc="Calculating rows count",
                                           unit="rows")
        rows_count = sum(1 for _ in logged_rows_it)

        logger.info("Filling with blank rows: {}".format(rows_count))
        self.__fill_with_blank_rows(row_id_column_name=columns_provider.ROW_ID,
                                    rows_count=rows_count)
        logger.info("Completed!")

        it = progress_bar_defined(iterable=iter_rows_func(False),
                                  desc="{fmt}".format(fmt=desc),
                                  total=rows_count)

        for row_index, row in enumerate(it):
            for column, value in row.items():
                self.__set_value(row_ind=row_index, column=column, value=value)

        self.__log_info()
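Example #7 also uses `progress_bar_defined`, which is called with a known `total`. Its implementation is likewise not shown on this page; the sketch below assumes a thin `tqdm` wrapper and reproduces the same two-pass pattern (count the rows first, then iterate again with a defined total).

from tqdm import tqdm


def progress_bar_defined(iterable, desc=None, total=None, unit="it"):
    # Assumed counterpart of progress_bar_iter for a known total, so tqdm
    # can render a percentage and an ETA; not the actual AREkit helper.
    return tqdm(iterable, desc=desc, total=total, unit=unit)


def iter_rows(count_only):
    # Stand-in for the iter_rows_func(True/False) convention in example #7:
    # the first pass is used only to count rows, the second yields real data.
    for i in range(250):
        yield None if count_only else i


# Two-pass pattern: count the rows first, then iterate with a defined total.
rows_count = sum(1 for _ in iter_rows(count_only=True))
for row in progress_bar_defined(iter_rows(count_only=False),
                                desc="Filling", total=rows_count, unit="rows"):
    pass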
Code example #8
    def __test_core(self,
                    res_version,
                    synonyms=None,
                    eval_mode=EvaluationModes.Extraction,
                    check_results=True):
        assert (isinstance(res_version, ResultVersions))
        assert (isinstance(synonyms, SynonymsCollection) or synonyms is None)
        assert (isinstance(eval_mode, EvaluationModes))
        assert (isinstance(check_results, bool))

        # Initializing synonyms collection.
        if synonyms is None:
            # This is the default collection that was used
            # to produce the results in `f1_rusentrel_v11_results`.
            stemmer = self.__create_stemmer()
            actual_synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
                stemmer=stemmer, version=self.__rusentrel_version)
        else:
            actual_synonyms = synonyms

        # Setup an experiment labels formatter.
        labels_formatter = RuSentRelExperimentLabelsFormatter()

        # Iter cmp opinions.
        cmp_pairs_iter = OpinionCollectionsToCompareUtils.iter_comparable_collections(
            doc_ids=ZippedResultsIOUtils.iter_doc_ids(res_version),
            read_etalon_collection_func=lambda doc_id: OpinionCollection(
                opinions=RuSentRelOpinionCollection.iter_opinions_from_doc(
                    doc_id=doc_id, labels_fmt=labels_formatter),
                synonyms=actual_synonyms,
                error_on_duplicates=False,
                error_on_synonym_end_missed=True),
            read_result_collection_func=lambda doc_id: OpinionCollection(
                opinions=ZippedResultsIOUtils.iter_doc_opinions(
                    doc_id=doc_id,
                    result_version=res_version,
                    labels_formatter=labels_formatter),
                synonyms=actual_synonyms,
                error_on_duplicates=False,
                error_on_synonym_end_missed=False))

        # getting evaluator.
        evaluator = TwoClassEvaluator(eval_mode=eval_mode)

        # evaluate every document.
        logged_cmp_pairs_it = progress_bar_iter(cmp_pairs_iter,
                                                desc="Evaluate",
                                                unit='pairs')
        result = evaluator.evaluate(cmp_pairs=logged_cmp_pairs_it)
        assert (isinstance(result, TwoClassEvalResult))

        # calculate results.
        result.calculate()

        # logging all the result information.
        for doc_id, doc_info in result.iter_document_results():
            print("{}:\t{}".format(doc_id, doc_info))
        print("------------------------")
        print(str(result.TotalResult))
        print("------------------------")

        # Display cmp tables (optionally).
        if self.__display_cmp_table:
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                for doc_id, df_cmp_table in result.iter_dataframe_cmp_tables():
                    assert (isinstance(df_cmp_table, DocumentCompareTable))
                    print("{}:\t{}\n".format(doc_id,
                                             df_cmp_table.DataframeTable))
            print("------------------------")

        if check_results:
            self.__is_equal_results(
                v1=result.get_result_by_metric(TwoClassEvalResult.C_F1),
                v2=f1_rusentrel_v11_results[res_version])