def __init__(self, relations, bag_size=40):
    assert(isinstance(bag_size, int) and bag_size > 0)
    assert(isinstance(relations, list))     # relations from relationsCollection

    self.bags = []
    self.bag_size = bag_size

    pos_bag_ind = self._add_bag(PositiveLabel())
    neg_bag_ind = self._add_bag(NegativeLabel())
    neu_bag_ind = self._add_bag(NeutralLabel())

    for r in relations:
        assert(isinstance(r, ExtractedRelation))

        self._optional_add_in_bag(pos_bag_ind, r.opinion_vector, r.text_position, r.label)
        self._optional_add_in_bag(neg_bag_ind, r.opinion_vector, r.text_position, r.label)
        self._optional_add_in_bag(neu_bag_ind, r.opinion_vector, r.text_position, r.label)

        if len(self.bags[pos_bag_ind]) == bag_size:
            pos_bag_ind = self._add_bag(PositiveLabel())
        if len(self.bags[neg_bag_ind]) == bag_size:
            neg_bag_ind = self._add_bag(NegativeLabel())
        if len(self.bags[neu_bag_ind]) == bag_size:
            neu_bag_ind = self._add_bag(NeutralLabel())
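A minimal usage sketch for the constructor above. The enclosing class name is not shown in the snippet, so BagsCollection is a hypothetical placeholder; `relations` is assumed to be a list of ExtractedRelation objects taken from a relations collection.

# Hypothetical usage; BagsCollection is a placeholder name, `relations` is assumed
# to be a list of ExtractedRelation instances.
bags_collection = BagsCollection(relations, bag_size=40)

# Bags are filled per label; a new bag for a label is opened once the current one
# reaches bag_size items.
print len(bags_collection.bags)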
@classmethod
def from_file(cls, filepath, synonyms):
    assert(isinstance(filepath, str))
    assert(isinstance(synonyms, SynonymsCollection))

    instance = cls(synonyms=synonyms)

    with open(filepath, 'r') as f:
        it = cls.iter_line_params(f)
        for args in tqdm(it, desc="Init BasePrinter from file"):
            pos_count, neg_count, source_id, target_id, _ = args

            value_left = synonyms.get_group_by_index(int(source_id))[0]
            value_right = synonyms.get_group_by_index(int(target_id))[0]

            pos_opinion = Opinion(value_left=value_left,
                                  value_right=value_right,
                                  sentiment=PositiveLabel())

            neg_opinion = Opinion(value_left=value_left,
                                  value_right=value_right,
                                  sentiment=NegativeLabel())

            if pos_count > 0:
                instance.register_extracted_opinion(pos_opinion, count=pos_count)
            if neg_count > 0:
                instance.register_extracted_opinion(neg_opinion, count=neg_count)

    return instance
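A hedged usage sketch for the classmethod above. The class name BasePrinter is only inferred from the tqdm description, the file path is a placeholder, and `synonyms` is assumed to be an already constructed SynonymsCollection.

# Hypothetical call; class name inferred from the tqdm description, path is a placeholder.
# `synonyms` is assumed to be a SynonymsCollection built elsewhere.
printer = BasePrinter.from_file("stat/opinions.txt", synonyms)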
def debug_statistic(self):
    labels_count = [0, 0, 0]

    for r in self.relations:
        labels_count[r.label.to_uint()] += 1

    return {'pos': labels_count[PositiveLabel().to_uint()],
            'neg': labels_count[NegativeLabel().to_uint()],
            'neu': labels_count[NeutralLabel().to_uint()]}
def optional_invert_label(label, is_inverted):
    assert(isinstance(label, Label))
    assert(isinstance(is_inverted, bool))

    if not is_inverted:
        return label

    if isinstance(label, PositiveLabel):
        return NegativeLabel()
    elif isinstance(label, NegativeLabel):
        return PositiveLabel()

    raise Exception("Not supported label")
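A small usage sketch for optional_invert_label, assuming it can be called directly as shown (the snippet does not reveal its enclosing scope).

# Inversion swaps positive and negative labels; any other label raises an exception.
inverted = optional_invert_label(PositiveLabel(), is_inverted=True)
assert(isinstance(inverted, NegativeLabel))

unchanged = optional_invert_label(PositiveLabel(), is_inverted=False)
assert(isinstance(unchanged, PositiveLabel))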
def get_method_statistic(files_to_compare_list, synonyms_filepath, stemmer):
    """ Calculate statistic based on result files
        files_to_compare_list: list
            list of FilesToCompare objects
        synonyms_filepath: str
        stemmer: Stemmer
    """
    assert(isinstance(stemmer, Stemmer))

    columns = ["t_all", "t_pos", "t_neg", "e_all", "e_pos", "e_neg"]
    df = pd.DataFrame(columns=columns)

    for files_to_compare in files_to_compare_list:
        assert(isinstance(files_to_compare, FilesToCompare))

        test_opins = OpinionCollection.from_file(
            files_to_compare.test_filepath, synonyms_filepath, stemmer=stemmer)
        etalon_opins = OpinionCollection.from_file(
            files_to_compare.etalon_filepath, synonyms_filepath, stemmer=stemmer)

        df.loc[files_to_compare.index] = [
            MethodStatistic.founded_opins(test_opins, etalon_opins),
            MethodStatistic.founded_opins(test_opins, etalon_opins, PositiveLabel()),
            MethodStatistic.founded_opins(test_opins, etalon_opins, NegativeLabel()),
            len(etalon_opins),
            len(list(etalon_opins.iter_sentiment(PositiveLabel()))),
            len(list(etalon_opins.iter_sentiment(NegativeLabel())))]

    df.loc['sum'] = [float(df[c].sum()) for c in columns]

    # Use single .loc indexers to avoid chained-assignment issues.
    df.loc['found'] = None
    df.loc['found', 't_all'] = float(df.loc['sum', 't_all']) / df.loc['sum', 'e_all']
    df.loc['found', 't_pos'] = float(df.loc['sum', 't_pos']) / df.loc['sum', 'e_pos']
    df.loc['found', 't_neg'] = float(df.loc['sum', 't_neg']) / df.loc['sum', 'e_neg']

    return df
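A usage sketch for get_method_statistic; only the call signature comes from the function above, while the file path and the prepared inputs are assumptions.

# Assumed inputs: a prepared list of FilesToCompare objects and a Stemmer instance.
df = get_method_statistic(files_to_compare_list,
                          synonyms_filepath="synonyms.txt",
                          stemmer=stemmer)

# The 'sum' row aggregates per-file counts; the 'found' row holds the ratios t_* / e_*.
print df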
def __neg_labeling_callback(self, rc, dest_data_type):
    assert(isinstance(rc, ExtractedRelationsCollection))
    assert(isinstance(dest_data_type, unicode))

    rch = self.get_relations_collection_helper(dest_data_type)
    assert(isinstance(rch, ExtractedRelationsCollectionHelper))

    # stat[2] is treated as a percentage, hence the division by 100.
    stat, _ = rch.get_statistic()
    neg_prec_bound = stat[2] / 100.0

    # Label each relation negative with probability neg_prec_bound, positive otherwise.
    for relation in rc:
        assert(isinstance(relation, ExtractedRelation))
        rand = random.random()
        label = NegativeLabel() if rand <= neg_prec_bound else PositiveLabel()
        rc.apply_label(label=label, relation_id=relation.RelationID)

    return PredictVariables()
class Evaluator:

    # Columns
    C_POS_PREC = 'pos_prec'
    C_NEG_PREC = 'neg_prec'
    C_POS_RECALL = 'pos_recall'
    C_NEG_RECALL = 'neg_recall'
    C_F1_POS = 'f1_pos'
    C_F1_NEG = 'f1_neg'
    C_F1 = 'f1'

    # Columns for differences
    C_WHO = 'who'
    C_TO = 'to'
    C_ORIG = 'how_orig'
    C_RES = 'how_results'
    C_CMP = 'comparison'

    def __init__(self, synonyms_filepath, user_answers_filepath, stemmer):
        assert(isinstance(stemmer, Stemmer))    # for opinion collections
        self.synonyms_filepath = synonyms_filepath
        self.user_answers = user_answers_filepath
        self.stemmer = stemmer
        self.pos = PositiveLabel()
        self.neg = NegativeLabel()
        self.neu = NeutralLabel()

    @staticmethod
    def get_result_columns():
        return [Evaluator.C_POS_PREC,
                Evaluator.C_NEG_PREC,
                Evaluator.C_POS_RECALL,
                Evaluator.C_NEG_RECALL,
                Evaluator.C_F1_POS,
                Evaluator.C_F1_NEG,
                Evaluator.C_F1]

    @staticmethod
    def _calcRecall(results, answers, label):
        assert(isinstance(label, PositiveLabel) or isinstance(label, NegativeLabel))
        if len(results[results[Evaluator.C_ORIG] == label.to_str()]) != 0:
            return 1.0 * len(answers[(answers[Evaluator.C_CMP] == True)]) / \
                len(results[results[Evaluator.C_ORIG] == label.to_str()])
        else:
            return 0.0

    @staticmethod
    def _calcPrecision(answers):
        if len(answers) != 0:
            return 1.0 * len(answers[(answers[Evaluator.C_CMP] == True)]) / len(answers)
        else:
            return 0.0

    def _calcPrecisionAndRecall(self, results):
        """ Precision and recall calculation.
        """
        pos_answers = results[(results[Evaluator.C_RES] == self.pos.to_str())]
        neg_answers = results[(results[Evaluator.C_RES] == self.neg.to_str())]

        pos_prec = self._calcPrecision(pos_answers)
        neg_prec = self._calcPrecision(neg_answers)
        pos_recall = self._calcRecall(results, pos_answers, self.pos)
        neg_recall = self._calcRecall(results, neg_answers, self.neg)

        assert(isinstance(pos_prec, float))
        assert(isinstance(neg_prec, float))
        assert(isinstance(pos_recall, float))
        assert(isinstance(neg_recall, float))

        return pos_prec, neg_prec, pos_recall, neg_recall

    def _check(self, etalon_opins, test_opins):
        assert(isinstance(etalon_opins, OpinionCollection))
        assert(isinstance(test_opins, OpinionCollection))

        df = pd.DataFrame(
            columns=[self.C_WHO, self.C_TO, self.C_ORIG, self.C_RES, self.C_CMP])

        r_ind = 0

        # Append everything that exists in the etalon collection.
        for o_etalon in etalon_opins:

            comparison = False
            has_opinion = test_opins.has_opinion_by_synonyms(o_etalon)

            if has_opinion:
                o_test = test_opins.get_opinion_by_synonyms(o_etalon)
                comparison = o_test.sentiment == o_etalon.sentiment

            df.loc[r_ind] = [o_etalon.value_left.encode('utf-8'),
                             o_etalon.value_right.encode('utf-8'),
                             o_etalon.sentiment.to_str(),
                             None if not has_opinion else o_test.sentiment.to_str(),
                             comparison]
            r_ind += 1

        # Append everything that exists in the test collection.
        for o_test in test_opins:

            has_opinion = etalon_opins.has_opinion_by_synonyms(o_test)

            if has_opinion:
                continue

            df.loc[r_ind] = [o_test.value_left.encode('utf-8'),
                             o_test.value_right.encode('utf-8'),
                             None,
                             o_test.sentiment.to_str(),
                             False]
            r_ind += 1

        return df

    # TODO. change it with the list of FilesToCompare objects.
    def _calc_a_file(self, files_to_compare, debug):
        assert(isinstance(files_to_compare, FilesToCompare))

        # Reading test answers.
        test_opins = OpinionCollection.from_file(
            files_to_compare.test_filepath, self.synonyms_filepath, stemmer=self.stemmer)

        # Reading etalon answers.
        etalon_opins = OpinionCollection.from_file(
            files_to_compare.etalon_filepath, self.synonyms_filepath, stemmer=self.stemmer)

        if debug:
            print "{} <-> {}, {}".format(
                files_to_compare.test_filepath,
                files_to_compare.etalon_filepath,
                files_to_compare.index)

        # Comparing test and etalon results.
        results = self._check(etalon_opins, test_opins)

        # Save result comparison into file.
        # TODO. remove path declaration from here.
        comparison_file = "{}/art{}.comp.txt".format(
            self.user_answers, str(files_to_compare.index))

        if debug:
            print "Save comparison file: {}".format(comparison_file)

        results.to_csv(comparison_file)
        return self._calcPrecisionAndRecall(results)

    def evaluate(self, files_to_compare_list, debug=False):
        """ Main evaluation subprogram
        """
        assert(isinstance(files_to_compare_list, list))

        pos_prec, neg_prec, pos_recall, neg_recall = (0, 0, 0, 0)

        for files_to_compare in files_to_compare_list:
            [pos_prec1, neg_prec1, pos_recall1, neg_recall1] = \
                self._calc_a_file(files_to_compare, debug=debug)
            pos_prec += pos_prec1
            neg_prec += neg_prec1
            pos_recall += pos_recall1
            neg_recall += neg_recall1

        pos_prec /= len(files_to_compare_list)
        neg_prec /= len(files_to_compare_list)
        pos_recall /= len(files_to_compare_list)
        neg_recall /= len(files_to_compare_list)

        if pos_prec * pos_recall != 0:
            f1_pos = 2 * pos_prec * pos_recall / (pos_prec + pos_recall)
        else:
            f1_pos = 0

        if neg_prec * neg_recall != 0:
            f1_neg = 2 * neg_prec * neg_recall / (neg_prec + neg_recall)
        else:
            f1_neg = 0

        return {self.C_POS_PREC: pos_prec,
                self.C_NEG_PREC: neg_prec,
                self.C_POS_RECALL: pos_recall,
                self.C_NEG_RECALL: neg_recall,
                self.C_F1_POS: f1_pos,
                self.C_F1_NEG: f1_neg,
                self.C_F1: (f1_pos + f1_neg) / 2}
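A usage sketch for the Evaluator class; the file paths, the Stemmer instance and the prepared list of FilesToCompare objects are assumptions, while the constructor and evaluate signatures come from the class above.

# Assumed inputs; only the signatures are taken from the class above.
evaluator = Evaluator(synonyms_filepath="synonyms.txt",
                      user_answers_filepath="answers",
                      stemmer=stemmer)

result = evaluator.evaluate(files_to_compare_list, debug=True)

# Precision, recall and F1 are averaged over all compared file pairs.
print result[Evaluator.C_F1]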