def read_ruattitudes_in_memory(version, keep_doc_ids_only, doc_id_func):
    """ Performs reading of RuAttitudes-formatted documents; when
        'keep_doc_ids_only' is set, only document identifiers are kept.
    """
    assert(isinstance(version, RuAttitudesVersions))
    assert(isinstance(keep_doc_ids_only, bool))
    assert(callable(doc_id_func))

    it = RuAttitudesCollection.iter_news(
        version=version,
        get_news_index_func=doc_id_func,
        label_convereter=ExperimentRuAttitudesLabelConverter(),
        return_inds_only=keep_doc_ids_only)

    it_formatted_and_logged = progress_bar_iter(
        iterable=__iter_id_with_news(news_it=it,
                                     keep_doc_ids_only=keep_doc_ids_only),
        desc="Loading RuAttitudes Collection [{}]".format(
            "doc ids only" if keep_doc_ids_only else "fully"),
        unit='docs')

    d = {}
    for doc_id, news in it_formatted_and_logged:
        d[doc_id] = news

    return d

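# `progress_bar_iter` is used throughout these snippets. Below is a minimal sketch
# of such a helper, assuming it is a thin wrapper around tqdm; the actual
# implementation in this codebase may differ.
from tqdm import tqdm

def progress_bar_iter(iterable, desc=None, unit='it'):
    # Yield items unchanged while advancing a tqdm progress bar.
    return tqdm(iterable, desc=desc, unit=unit)

# A hypothetical call of the reader above; `RuAttitudesVersions.V20Base` is an
# assumed enum member:
# docs = read_ruattitudes_in_memory(version=RuAttitudesVersions.V20Base,
#                                   keep_doc_ids_only=False,
#                                   doc_id_func=lambda ind: ind)
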
@classmethod
def from_formatted_samples(cls, formatted_samples_iter, bag_size,
                           create_sample_func, create_empty_sample_func,
                           shuffle, desc=None):
    assert(isinstance(formatted_samples_iter, collections.Iterable))
    assert(isinstance(bag_size, int) and bag_size > 0)
    assert(callable(create_sample_func))
    assert(callable(create_empty_sample_func))
    assert(isinstance(shuffle, bool))

    bags = []

    linked_rows_iter = progress_bar_iter(iterable=formatted_samples_iter,
                                         desc=desc)

    for linked_rows in linked_rows_iter:
        assert(len(linked_rows) > 0)
        cls._fill_bags_list_with_linked_text_opinions(
            bags=bags,
            parsed_rows=[ParsedSampleRow.parse(row=row) for row in linked_rows],
            bag_size=bag_size,
            create_sample_func=create_sample_func,
            create_empty_sample_func=create_empty_sample_func)

    if shuffle:
        np.random.shuffle(bags)

    return cls(bags)

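# The private `_fill_bags_list_with_linked_text_opinions` helper is not shown here.
# A self-contained sketch of what such bag filling typically does (fixed-size bags,
# padded with empty samples); the names and the padding policy are assumptions:
def fill_bags_sketch(linked_rows_groups, bag_size, create_sample, create_empty_sample):
    bags = []
    for rows in linked_rows_groups:
        samples = [create_sample(row) for row in rows]
        for i in range(0, len(samples), bag_size):
            # Cut a fixed-size bag; pad the tail with "empty" samples.
            bag = samples[i:i + bag_size]
            bag += [create_empty_sample() for _ in range(bag_size - len(bag))]
            bags.append(bag)
    return bags

print(fill_bags_sketch([[1, 2, 3], [4]], bag_size=2,
                       create_sample=str, create_empty_sample=lambda: "<PAD>"))
# [['1', '2'], ['3', '<PAD>'], ['4', '<PAD>']]
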
def write(self, title, contents_it):
    self.__write(title)

    wrapped_it = progress_bar_iter(iterable=contents_it,
                                   desc='Writing output',
                                   unit='rows')

    for contents in wrapped_it:
        self.__write(contents)

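# A self-contained sketch of the same write-with-progress pattern applied to a
# plain text file; `write_with_progress` is hypothetical and reuses the
# `progress_bar_iter` sketch above:
def write_with_progress(path, title, contents_it):
    with open(path, 'w') as f:
        f.write(title + '\n')
        for contents in progress_bar_iter(contents_it, desc='Writing output', unit='rows'):
            f.write(contents + '\n')
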
def __test_iter_news_inds(self, ra_version):
    # Iterating through the collection, requesting indices only.
    doc_ids_it = RuAttitudesCollection.iter_news(version=ra_version,
                                                 get_news_index_func=lambda ind: ind + 1,
                                                 return_inds_only=True)

    it = progress_bar_iter(iterable=doc_ids_it,
                           desc="Extracting document ids",
                           unit="docs")

    # Ids are shifted to start from 1, so the maximal id
    # equals the documents count for sequential indexing.
    print("Total documents count: {}".format(max(it)))

def __iter_annotated_collections(self, data_type, doc_ops, opin_ops):
    assert(isinstance(doc_ops, DocumentOperations))
    assert(isinstance(opin_ops, OpinionOperations))

    logged_parsed_news_iter = progress_bar_iter(
        iterable=doc_ops.iter_parsed_docs(doc_ops.iter_tagget_doc_ids(BaseDocumentTag.Annotate)),
        desc="Annotating parsed news [{}]".format(data_type))

    for parsed_news in logged_parsed_news_iter:
        assert(isinstance(parsed_news, ParsedNews))
        yield parsed_news.RelatedDocID, \
              self._annot_collection_core(parsed_news=parsed_news,
                                          data_type=data_type,
                                          opin_ops=opin_ops)

def calculate_results(doc_ids, evaluator, iter_etalon_opins_by_doc_id_func, iter_result_opins_by_doc_id_func):
    """ Provides F1 (neg, pos, neu) calculation over the given enumerations of
        etalon and result opinions for each particular document (doc_id).
    """
    assert(isinstance(evaluator, ThreeClassEvaluator))
    assert(callable(iter_etalon_opins_by_doc_id_func))
    assert(callable(iter_result_opins_by_doc_id_func))

    cmp_pairs_iter = OpinionCollectionsToCompareUtils.iter_comparable_collections(
        doc_ids=doc_ids,
        read_etalon_collection_func=lambda doc_id: iter_etalon_opins_by_doc_id_func(doc_id),
        read_result_collection_func=lambda doc_id: iter_result_opins_by_doc_id_func(doc_id))

    # Evaluate every document.
    logged_cmp_pairs_it = progress_bar_iter(cmp_pairs_iter, desc=u"Evaluate", unit=u'pairs')

    return evaluator.evaluate(cmp_pairs=logged_cmp_pairs_it)

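# A hedged usage sketch; the `ThreeClassEvaluator` constructor arguments and
# both reader functions below are assumptions:
# evaluator = ThreeClassEvaluator(...)
# result = calculate_results(
#     doc_ids=[1, 2, 3],
#     evaluator=evaluator,
#     iter_etalon_opins_by_doc_id_func=lambda doc_id: read_etalon(doc_id),
#     iter_result_opins_by_doc_id_func=lambda doc_id: read_predicted(doc_id))
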
def fill(self, iter_rows_func, columns_provider, desc=""):
    assert(callable(iter_rows_func))
    assert(isinstance(columns_provider, BaseColumnsProvider))

    # First pass: only count the rows, so the storage
    # can be pre-allocated with blank rows.
    logged_rows_it = progress_bar_iter(iterable=iter_rows_func(True),
                                       desc="Calculating rows count",
                                       unit="rows")
    rows_count = sum(1 for _ in logged_rows_it)

    logger.info("Filling with blank rows: {}".format(rows_count))
    self.__fill_with_blank_rows(row_id_column_name=columns_provider.ROW_ID,
                                rows_count=rows_count)
    logger.info("Completed!")

    # Second pass: fill the pre-allocated rows with the actual values.
    it = progress_bar_defined(iterable=iter_rows_func(False),
                              desc="{fmt}".format(fmt=desc),
                              total=rows_count)

    for row_index, row in enumerate(it):
        for column, value in row.items():
            self.__set_value(row_ind=row_index, column=column, value=value)

    self.__log_info()

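# `iter_rows_func` is called twice: once with True for a cheap counting pass, and
# once with False for the value-producing pass. Reading the flag as "count-only"
# is an inference from this snippet; a hypothetical provider could look like:
# def iter_rows(idle):
#     for doc in docs:                 # `docs` is a stand-in
#         yield {} if idle else build_row(doc)
# storage.fill(iter_rows_func=iter_rows, columns_provider=provider, desc="Filling rows")
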
def __test_core(self, res_version, synonyms=None, eval_mode=EvaluationModes.Extraction, check_results=True):
    assert(isinstance(res_version, ResultVersions))
    assert(isinstance(synonyms, SynonymsCollection) or synonyms is None)
    assert(isinstance(eval_mode, EvaluationModes))
    assert(isinstance(check_results, bool))

    # Initializing the synonyms collection.
    if synonyms is None:
        # This is the default collection which we used
        # to provide the results in `f1_rusentrel_v11_results`.
        stemmer = self.__create_stemmer()
        actual_synonyms = RuSentRelSynonymsCollectionProvider.load_collection(
            stemmer=stemmer,
            version=self.__rusentrel_version)
    else:
        actual_synonyms = synonyms

    # Setting up the experiment labels formatter.
    labels_formatter = RuSentRelExperimentLabelsFormatter()

    # Iterating over comparable opinion collections.
    cmp_pairs_iter = OpinionCollectionsToCompareUtils.iter_comparable_collections(
        doc_ids=ZippedResultsIOUtils.iter_doc_ids(res_version),
        read_etalon_collection_func=lambda doc_id: OpinionCollection(
            opinions=RuSentRelOpinionCollection.iter_opinions_from_doc(
                doc_id=doc_id,
                labels_fmt=labels_formatter),
            synonyms=actual_synonyms,
            error_on_duplicates=False,
            error_on_synonym_end_missed=True),
        read_result_collection_func=lambda doc_id: OpinionCollection(
            opinions=ZippedResultsIOUtils.iter_doc_opinions(
                doc_id=doc_id,
                result_version=res_version,
                labels_formatter=labels_formatter),
            synonyms=actual_synonyms,
            error_on_duplicates=False,
            error_on_synonym_end_missed=False))

    # Getting the evaluator.
    evaluator = TwoClassEvaluator(eval_mode=eval_mode)

    # Evaluating every document.
    logged_cmp_pairs_it = progress_bar_iter(cmp_pairs_iter, desc="Evaluate", unit='pairs')
    result = evaluator.evaluate(cmp_pairs=logged_cmp_pairs_it)
    assert(isinstance(result, TwoClassEvalResult))

    # Calculating results.
    result.calculate()

    # Logging all the result information.
    for doc_id, doc_info in result.iter_document_results():
        print("{}:\t{}".format(doc_id, doc_info))
    print("------------------------")
    print(str(result.TotalResult))
    print("------------------------")

    # Displaying cmp tables (optionally).
    if self.__display_cmp_table:
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            for doc_id, df_cmp_table in result.iter_dataframe_cmp_tables():
                assert(isinstance(df_cmp_table, DocumentCompareTable))
                print("{}:\t{}\n".format(doc_id, df_cmp_table.DataframeTable))
                print("------------------------")

    if check_results:
        self.__is_equal_results(v1=result.get_result_by_metric(TwoClassEvalResult.C_F1),
                                v2=f1_rusentrel_v11_results[res_version])