Example #1
    def __init__(self, relations, bag_size=40):
        assert (isinstance(bag_size, int)
                and bag_size > 0)
        assert (isinstance(relations,
                           list))  # relations from relationsCollection

        self.bags = []
        self.bag_size = bag_size
        pos_bag_ind = self._add_bag(PositiveLabel())
        neg_bag_ind = self._add_bag(NegativeLabel())
        neu_bag_ind = self._add_bag(NeutralLabel())

        for r in relations:
            assert (isinstance(r, ExtractedRelation))

            self._optional_add_in_bag(pos_bag_ind, r.opinion_vector,
                                      r.text_position, r.label)
            self._optional_add_in_bag(neg_bag_ind, r.opinion_vector,
                                      r.text_position, r.label)
            self._optional_add_in_bag(neu_bag_ind, r.opinion_vector,
                                      r.text_position, r.label)

            if len(self.bags[pos_bag_ind]) == bag_size:
                pos_bag_ind = self._add_bag(PositiveLabel())
            if len(self.bags[neg_bag_ind]) == bag_size:
                neg_bag_ind = self._add_bag(NegativeLabel())
            if len(self.bags[neu_bag_ind]) == bag_size:
                neu_bag_ind = self._add_bag(NeutralLabel())
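A brief usage sketch for the constructor in Example #1; the collection class name and the variable holding the relations are placeholders, while the ExtractedRelation fields are the ones the constructor actually reads:

# Hypothetical names: BagsCollection and extracted_relations are not shown on this page.
bags_collection = BagsCollection(relations=extracted_relations, bag_size=40)
# One positive, one negative and one neutral bag are always open;
# whenever a bag reaches bag_size items, a fresh bag with the same
# label is appended, so no bag grows beyond bag_size relations.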
Example #2
    def __init__(self, synonyms_filepath, user_answers_filepath, stemmer):
        assert(isinstance(stemmer, Stemmer)) # for opinion collections

        self.synonyms_filepath = synonyms_filepath
        self.user_answers = user_answers_filepath
        self.stemmer = stemmer

        self.pos = PositiveLabel()
        self.neg = NegativeLabel()
        self.neu = NeutralLabel()
Example #3
    def from_file(cls, filepath, synonyms):
        assert (isinstance(filepath, str))
        assert (isinstance(synonyms, SynonymsCollection))

        instance = cls(synonyms=synonyms)

        with open(filepath, 'r') as f:
            it = cls.iter_line_params(f)
            for args in tqdm(it, desc="Init BasePrinter from file"):

                pos_count, neg_count, source_id, target_id, _ = args

                value_left = synonyms.get_group_by_index(int(source_id))[0]
                value_right = synonyms.get_group_by_index(int(target_id))[0]

                pos_opinion = Opinion(value_left=value_left,
                                      value_right=value_right,
                                      sentiment=PositiveLabel())

                neg_opinion = Opinion(value_left=value_left,
                                      value_right=value_right,
                                      sentiment=NegativeLabel())

                if pos_count > 0:
                    instance.register_extracted_opinion(pos_opinion,
                                                        count=pos_count)
                if neg_count > 0:
                    instance.register_extracted_opinion(neg_opinion,
                                                        count=neg_count)

        return instance
Example #4
    def debug_statistic(self):
        labels_count = [0, 0, 0]
        for r in self.relations:
            labels_count[r.label.to_uint()] += 1
        return {
            'pos': labels_count[PositiveLabel().to_uint()],
            'neg': labels_count[NegativeLabel().to_uint()],
            'neu': labels_count[NeutralLabel().to_uint()]
        }
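The label classes themselves are not shown in any of these examples. Below is a minimal sketch of the interface they appear to expose, based on the calls made on this page (to_uint() for indexing in debug_statistic above, to_str() for DataFrame values, equality between label instances in Example #9); the concrete integer codes and string values are assumptions:

# Sketch only: codes and strings below are assumptions; the method names
# are taken from the calls made in the examples on this page.
class Label(object):

    def to_uint(self):
        raise NotImplementedError()

    def to_str(self):
        raise NotImplementedError()

    def __eq__(self, other):
        # Two instances of the same label class compare equal
        # (Example #9 compares opinion sentiments with ==).
        return type(self) == type(other)

    def __ne__(self, other):
        return not self == other


class PositiveLabel(Label):
    def to_uint(self): return 0
    def to_str(self): return 'pos'


class NegativeLabel(Label):
    def to_uint(self): return 1
    def to_str(self): return 'neg'


class NeutralLabel(Label):
    def to_uint(self): return 2
    def to_str(self): return 'neu'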
Example #5
    def __neg_labeling_callback(self, rc, dest_data_type):
        assert (isinstance(rc, ExtractedRelationsCollection))
        assert (isinstance(dest_data_type, unicode))

        for relation in rc:
            assert (isinstance(relation, ExtractedRelation))
            rc.apply_label(label=NegativeLabel(),
                           relation_id=relation.RelationID)

        return PredictVariables()
Example #6
def optional_invert_label(label, is_inverted):
    assert (isinstance(label, Label))
    assert (isinstance(is_inverted, bool))

    if not is_inverted:
        return label
    if isinstance(label, PositiveLabel):
        return NegativeLabel()
    elif isinstance(label, NegativeLabel):
        return PositiveLabel()
    raise Exception("Not supported label")
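A short usage sketch for optional_invert_label, following the branches of the function as written:

# Usage sketch; assumes the project's label classes (or the sketch above).
label = optional_invert_label(PositiveLabel(), is_inverted=True)
assert isinstance(label, NegativeLabel)

label = optional_invert_label(NegativeLabel(), is_inverted=False)
assert isinstance(label, NegativeLabel)  # returned unchanged

# Any other label combined with is_inverted=True reaches the final raise,
# e.g. optional_invert_label(NeutralLabel(), True).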
Example #7
    def get_method_statistic(files_to_compare_list, synonyms_filepath, stemmer):
        """
            Calculate statistics based on the result files
            files_to_compare_list: list
                list of FilesToCompare objects
            synonyms_filepath: str
            stemmer: Stemmer
        """
        assert(isinstance(stemmer, Stemmer))

        columns = ["t_all", "t_pos", "t_neg", "e_all", "e_pos", "e_neg"]

        df = pd.DataFrame(columns=columns)
        for files_to_compare in files_to_compare_list:

            assert(isinstance(files_to_compare, FilesToCompare))
            test_opins = OpinionCollection.from_file(
                    files_to_compare.test_filepath, synonyms_filepath, stemmer=stemmer)
            etalon_opins = OpinionCollection.from_file(
                    files_to_compare.etalon_filepath, synonyms_filepath, stemmer=stemmer)

            df.loc[files_to_compare.index] = [
                    MethodStatistic.founded_opins(test_opins, etalon_opins),
                    MethodStatistic.founded_opins(test_opins, etalon_opins, PositiveLabel()),
                    MethodStatistic.founded_opins(test_opins, etalon_opins, NegativeLabel()),
                    len(etalon_opins),
                    len(list(etalon_opins.iter_sentiment(PositiveLabel()))),
                    len(list(etalon_opins.iter_sentiment(NegativeLabel())))]

        df.loc['sum'] = [float(df[c].sum()) for c in columns]

        df.loc['found'] = None
        df.loc['found', 't_all'] = float(df.loc['sum', 't_all']) / df.loc['sum', 'e_all']
        df.loc['found', 't_pos'] = float(df.loc['sum', 't_pos']) / df.loc['sum', 'e_pos']
        df.loc['found', 't_neg'] = float(df.loc['sum', 't_neg']) / df.loc['sum', 'e_neg']

        return df
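A hedged usage sketch for get_method_statistic; the internal calls suggest it is a static method of a MethodStatistic class, the stemmer is assumed to be a prepared Stemmer instance, and the FilesToCompare constructor shown here is an assumption (only its test_filepath, etalon_filepath and index attributes appear in the code above):

# Hypothetical call; file paths and FilesToCompare construction are placeholders.
pairs = [FilesToCompare(test_filepath="art1.test.opin.txt",
                        etalon_filepath="art1.etalon.opin.txt",
                        index=1)]
df = MethodStatistic.get_method_statistic(pairs, "synonyms.txt", stemmer=stemmer)
# Columns: t_all, t_pos, t_neg, e_all, e_pos, e_neg; a 'sum' row totals each
# column and a 'found' row holds the t_* / e_* ratios.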
Example #8
    def __neg_labeling_callback(self, rc, dest_data_type):
        assert(isinstance(rc, ExtractedRelationsCollection))
        assert(isinstance(dest_data_type, unicode))

        rch = self.get_relations_collection_helper(dest_data_type)
        assert(isinstance(rch, ExtractedRelationsCollectionHelper))

        stat, _ = rch.get_statistic()
        neg_prec_bound = stat[2] / 100.0

        for relation in rc:
            assert(isinstance(relation, ExtractedRelation))
            rand_value = random.random()
            label = NegativeLabel() if rand_value <= neg_prec_bound else PositiveLabel()
            rc.apply_label(label=label, relation_id=relation.RelationID)

        return PredictVariables()
Example #9
class Evaluator:

    # Columns
    C_POS_PREC = 'pos_prec'
    C_NEG_PREC = 'neg_prec'
    C_POS_RECALL = 'pos_recall'
    C_NEG_RECALL = 'neg_recall'
    C_F1_POS = 'f1_pos'
    C_F1_NEG = 'f1_neg'
    C_F1 = 'f1'

    # Columns for differences
    C_WHO = 'who'
    C_TO = 'to'
    C_ORIG = 'how_orig'
    C_RES = 'how_results'
    C_CMP = 'comparison'

    def __init__(self, synonyms_filepath, user_answers_filepath, stemmer):
        assert(isinstance(stemmer, Stemmer)) # for opinion collections

        self.synonyms_filepath = synonyms_filepath
        self.user_answers = user_answers_filepath
        self.stemmer = stemmer

        self.pos = PositiveLabel()
        self.neg = NegativeLabel()
        self.neu = NeutralLabel()

    @staticmethod
    def get_result_columns():
        return [Evaluator.C_POS_PREC,
                Evaluator.C_NEG_PREC,
                Evaluator.C_POS_RECALL,
                Evaluator.C_NEG_RECALL,
                Evaluator.C_F1_POS,
                Evaluator.C_F1_NEG,
                Evaluator.C_F1]

    @staticmethod
    def _calcRecall(results, answers, label):
        assert(isinstance(label, (PositiveLabel, NegativeLabel)))
        matched = results[results[Evaluator.C_ORIG] == label.to_str()]
        if len(matched) != 0:
            return 1.0 * len(answers[answers[Evaluator.C_CMP] == True]) / len(matched)
        else:
            return 0.0

    @staticmethod
    def _calcPrecision(answers):
        if len(answers) != 0:
            return 1.0 * len(answers[(answers[Evaluator.C_CMP] == True)]) / len(answers)
        else:
            return 0.0

    def _calcPrecisionAndRecall(self, results):
        """ Расчет полноты и точности.
        """
        pos_answers = results[(results[Evaluator.C_RES] == self.pos.to_str())]
        neg_answers = results[(results[Evaluator.C_RES] == self.neg.to_str())]

        pos_prec = self._calcPrecision(pos_answers)
        neg_prec = self._calcPrecision(neg_answers)

        pos_recall = self._calcRecall(results, pos_answers, self.pos)
        neg_recall = self._calcRecall(results, neg_answers, self.neg)

        assert(isinstance(pos_prec, float))
        assert(isinstance(neg_prec, float))
        assert(isinstance(pos_recall, float))
        assert(isinstance(neg_recall, float))

        return pos_prec, neg_prec, pos_recall, neg_recall

    def _check(self, etalon_opins, test_opins):
        assert(isinstance(etalon_opins, OpinionCollection))
        assert(isinstance(test_opins, OpinionCollection))

        df = pd.DataFrame(
                columns=[self.C_WHO, self.C_TO, self.C_ORIG, self.C_RES, self.C_CMP])

        r_ind = 0
        # Append everything that exists in the etalon collection.
        for o_etalon in etalon_opins:
            comparison = False
            has_opinion = test_opins.has_opinion_by_synonyms(o_etalon)

            if has_opinion:
                o_test = test_opins.get_opinion_by_synonyms(o_etalon)
                comparison = o_test.sentiment == o_etalon.sentiment

            df.loc[r_ind] = [o_etalon.value_left.encode('utf-8'),
                             o_etalon.value_right.encode('utf-8'),
                             o_etalon.sentiment.to_str(),
                             None if not has_opinion else o_test.sentiment.to_str(),
                             comparison]
            r_ind += 1

        # Append everything that exists in the test collection.
        for o_test in test_opins:
            has_opinion = etalon_opins.has_opinion_by_synonyms(o_test)
            if has_opinion:
                continue
            df.loc[r_ind] = [o_test.value_left.encode('utf-8'),
                             o_test.value_right.encode('utf-8'),
                             None,
                             o_test.sentiment.to_str(),
                             False]
            r_ind += 1

        return df

    # TODO. change it with the list of FilesToCompare objects.
    def _calc_a_file(self, files_to_compare, debug):
        assert(isinstance(files_to_compare, FilesToCompare))

        # Reading test answers.
        test_opins = OpinionCollection.from_file(
            files_to_compare.test_filepath,
            self.synonyms_filepath,
            stemmer=self.stemmer)

        # Reading etalon answers.
        etalon_opins = OpinionCollection.from_file(
            files_to_compare.etalon_filepath,
            self.synonyms_filepath,
            stemmer=self.stemmer)

        if debug:
            print "{} <-> {}, {}".format(
                    files_to_compare.test_filepath,
                    files_to_compare.etalon_filepath,
                    files_to_compare.index)

        # Comparing test and etalon results.
        results = self._check(etalon_opins, test_opins)

        # Save result comparison into file.
        # TODO. remove path declaration from here.
        comparison_file = "{}/art{}.comp.txt".format(
                self.user_answers, str(files_to_compare.index))

        if debug:
            print "Save comparison file: {}".format(comparison_file)

        results.to_csv(comparison_file)

        return self._calcPrecisionAndRecall(results)

    def evaluate(self, files_to_compare_list, debug=False):
        """ Main evaluation subprogram
        """
        assert(isinstance(files_to_compare_list, list))

        pos_prec, neg_prec, pos_recall, neg_recall = (0, 0, 0, 0)

        for files_to_compare in files_to_compare_list:
            pos_prec1, neg_prec1, pos_recall1, neg_recall1 = self._calc_a_file(
                files_to_compare, debug=debug)

            pos_prec += pos_prec1
            neg_prec += neg_prec1
            pos_recall += pos_recall1
            neg_recall += neg_recall1

        # print len(files_to_compare_list)

        pos_prec /= len(files_to_compare_list)
        neg_prec /= len(files_to_compare_list)
        pos_recall /= len(files_to_compare_list)
        neg_recall /= len(files_to_compare_list)

        if pos_prec * pos_recall != 0:
            f1_pos = 2 * pos_prec * pos_recall / (pos_prec + pos_recall)
        else:
            f1_pos = 0

        if neg_prec * neg_recall != 0:
            f1_neg = 2 * neg_prec * neg_recall / (neg_prec + neg_recall)
        else:
            f1_neg = 0

        return {self.C_POS_PREC: pos_prec,
                self.C_NEG_PREC: neg_prec,
                self.C_POS_RECALL: pos_recall,
                self.C_NEG_RECALL: neg_recall,
                self.C_F1_POS: f1_pos,
                self.C_F1_NEG: f1_neg,
                self.C_F1: (f1_pos + f1_neg) / 2}
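A hedged end-to-end usage sketch for Evaluator; the stemmer and the FilesToCompare list are assumed to be prepared elsewhere, and only the constructor arguments and the result keys come from the class above:

# Hypothetical usage; stemmer and files_to_compare_list are not built on this page.
evaluator = Evaluator(synonyms_filepath="synonyms.txt",
                      user_answers_filepath="answers",
                      stemmer=stemmer)

result = evaluator.evaluate(files_to_compare_list, debug=True)

print(result[Evaluator.C_F1])        # (f1_pos + f1_neg) / 2
print(result[Evaluator.C_POS_PREC])  # positive precision averaged over files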