Exemple #1
0
    def extract_facts(self, classifiers):
        """
        Stage 5 of pipeline.

        Scores all the evidence held in self.evidence, one relation at a
        time, with the classifier registered for that relation.

        classifiers is a dict {relation: classifier, ...}; a relation missing
        from it is scored with None as its classifier.

        Returns a new Knowledge mapping each evidence to its score. The same
        scores are also written back into self.evidence.
        """
        # TODO: this probably is smarter as an outer iteration through segments
        # and then an inner iteration over relations
        logger.debug(u'running extract_facts')
        scored = Knowledge()

        grouped = self.evidence.per_relation()
        for relation, relation_evidence in grouped.items():
            # Freeze the evidence order so it lines up with the scores.
            candidates = list(relation_evidence)
            classifier = classifiers.get(relation)
            scores = self._score_evidence(relation, classifier, candidates)
            scored.update(zip(candidates, scores))

        # save scores for later use (e.g. in generate_questions, stage 2.1)
        self.evidence.update(scored)
        return scored
Exemple #2
0
    def filter_evidence(self, _):
        """
        Stage 2.2 of pipeline.

        Assembles the training evidence for the classifiers. It starts from
        the user answers (self.answers) and adds every unanswered evidence in
        self.evidence whose last classification score has a certainty above
        self.evidence_threshold, labelled 1 when the score is over 0.5 and 0
        otherwise.

        The argument is ignored. Returns the assembled Knowledge.
        """
        logger.debug(u'running filter_evidence')
        training = Knowledge(self.answers)
        human_count = len(training)

        # Collect the confident, still-unanswered evidence with a binary label.
        auto_labelled = []
        for candidate, score in self.evidence.items():
            if not certainty(score) > self.evidence_threshold:
                continue
            if candidate in self.answers:
                continue
            label = 1 if score > 0.5 else 0
            auto_labelled.append((candidate, label))
        training.update(auto_labelled)

        auto_count = len(training) - human_count
        message = (u'Filtering returns {} human-built evidences and {} '
                   u'over-threshold evidences')
        logger.info(message.format(human_count, auto_count))
        return training
Exemple #3
0
class BootstrappedIEPipeline(object):
    """
    Iepy's main class. Implements a bootstrapped information extraction
    pipeline.

    From the user's point of view this class is meant to be used like this::

        p = BootstrappedIEPipeline(db_connector, seed_facts)
        p.start()  # blocking
        while UserIsNotTired:
            for question in p.questions_available():
                # Ask user
                # ...
                p.add_answer(question, answer)
            p.force_process()
        facts = p.known_facts()  # profit
    """
    def __init__(self,
                 db_connector,
                 seed_facts,
                 gold_standard=None,
                 extractor_config=None,
                 prediction_config=None,
                 evidence_threshold=defaults.evidence_threshold,
                 fact_threshold=defaults.fact_threshold,
                 sort_questions_by=defaults.questions_sorting,
                 drop_guesses_each_round=defaults.drop_guesses_each_round):
        """
        Not blocking.

        db_connector: object whose `segments.segments_with_both_kinds` is
            used to enumerate the candidate segments.
        seed_facts: iterable of facts; each one is stored as an evidence with
            no segment and score 1.
        gold_standard: if given, `evaluate` compares the knowledge against it.
        extractor_config / prediction_config: configuration dicts. They are
            deep-copied; when falsy the values from `defaults` are used.
        evidence_threshold: minimum certainty for an unanswered evidence to
            be used as training data (see `filter_evidence`).
        fact_threshold: minimum score for an evidence to be accepted as
            knowledge (see `filter_facts`).
        sort_questions_by: 'score' or 'certainty' (see `questions_available`).
        drop_guesses_each_round: if true, `filter_facts` rebuilds the
            knowledge from the positive user answers on every round.

        Raises ValueError if two seed facts of the same relation have
        different pairs of entity kinds.
        """
        self.db_con = db_connector
        self.knowledge = Knowledge(
            {Evidence(f, None, None, None): 1
             for f in seed_facts})
        self.evidence_threshold = evidence_threshold
        self.fact_threshold = fact_threshold
        self.questions = Knowledge()
        self.answers = {}
        self.gold_standard = gold_standard
        # Deep copies, so later changes to the caller's dicts (or to the
        # defaults) do not leak into this pipeline.
        self.extractor_config = deepcopy(extractor_config
                                         or defaults.extractor_config)
        self.prediction_config = deepcopy(prediction_config
                                          or defaults.prediction_config)
        self.sort_questions_by = sort_questions_by
        self.drop_guesses_each_round = drop_guesses_each_round

        # Each step receives the value returned by the previous one. The None
        # entry makes `do_iteration` return so the user can answer questions.
        self.steps = [
            self.generalize_knowledge,  # Step 1
            self.generate_questions,  # Step 2, first half
            None,  # Pause to wait question answers
            self.filter_evidence,  # Step 2, second half
            self.learn_fact_extractors,  # Step 3
            self.extract_facts,  # Step 5
            self.filter_facts,  # Step 6
            self.evaluate  # Optional evaluation step
        ]
        # Endless iterator: it remembers where the last run stopped, so the
        # next `do_iteration` call resumes right after the pause.
        self.step_iterator = itertools.cycle(self.steps)

        # Build relation description: a map from relation labels to pairs of
        # entity kinds
        self.relations = {}
        for e in self.knowledge:
            t1 = e.fact.e1.kind
            t2 = e.fact.e2.kind
            if e.fact.relation in self.relations and (
                    t1, t2) != self.relations[e.fact.relation]:
                raise ValueError("Ambiguous kinds for relation %r" %
                                 e.fact.relation)
            self.relations[e.fact.relation] = (t1, t2)
        # Precompute all the evidence that must be classified
        self.evidence = evidence = Knowledge()
        for r, (lkind, rkind) in self.relations.items():
            for segment in self.db_con.segments.segments_with_both_kinds(
                    lkind, rkind):
                for o1, o2 in segment.kind_occurrence_pairs(lkind, rkind):
                    e1 = db.get_entity(segment.entities[o1].kind,
                                       segment.entities[o1].key)
                    e2 = db.get_entity(segment.entities[o2].kind,
                                       segment.entities[o2].key)
                    f = Fact(e1, r, e2)
                    e = Evidence(f, segment, o1, o2)
                    evidence[e] = 0.5  # not classified yet

    def do_iteration(self, data):
        """
        Runs the pipeline steps, feeding each one with the value returned by
        the previous one, until the pause marker (None) is reached.

        data is the input given to the first step that gets executed.
        """
        for step in self.step_iterator:
            if step is None:
                return
            data = step(data)

    ###
    ### IEPY User API
    ###

    def start(self):
        """
        Blocking.
        Runs the pipeline from the seed knowledge up to the first pause.
        """
        logger.info(u'Starting pipeline with {} seed '
                    u'facts'.format(len(self.knowledge)))
        self.do_iteration(self.knowledge)

    def questions_available(self):
        """
        Not blocking.
        Returned value won't change until a call to `add_answer` or
        `force_process`.
        If `id` of the returned value hasn't changed the returned value is the
        same.
        The available questions are a list of evidence.
        """
        if self.sort_questions_by == 'score':
            return self.questions.by_score(reverse=True)
        else:
            assert self.sort_questions_by == 'certainty'
            # TODO: Check: latest changes on generate_questions probably demand
            # some extra work on the following line to have back the usual
            # sort by certainty
            return self.questions.by_certainty()

    def add_answer(self, evidence, answer):
        """
        Blocking (potentially).
        After calling this method the values returned by `questions_available`
        and `known_facts` might change.

        The answer is stored as an int, keyed by the evidence.
        """
        self.answers[evidence] = int(answer)

    def force_process(self):
        """
        Blocking.
        After calling this method the values returned by `questions_available`
        and `known_facts` might change.
        """
        self.do_iteration(None)

    def known_facts(self):
        """
        Not blocking.
        Returned value won't change until a call to `add_answer` or
        `force_process`.
        If `len` of the returned value hasn't changed the returned value is the
        same.
        """
        return self.knowledge

    ###
    ### Pipeline steps
    ###

    def generalize_knowledge(self, knowledge):
        """
        Stage 1 of pipeline.

        Based on the known facts (knowledge), generates all possible
        evidences of them. The generated evidence is scored using the scores
        given to the facts.
        """
        logger.debug(u'running generalize_knowledge')
        # XXX: there may be several scores for the same fact in knowledge.
        # (the last one seen is the one that is kept)
        fact_knowledge = {e.fact: s for e, s in knowledge.items()}
        knowledge_evidence = Knowledge((e, fact_knowledge[e.fact])
                                       for e in self.evidence
                                       if e.fact in fact_knowledge)
        logger.info(u'Found {} potential evidences where the known facts could'
                    u' manifest'.format(len(knowledge_evidence)))
        return knowledge_evidence

    def generate_questions(self, knowledge_evidence):
        """
        Stage 2.1 of pipeline.

        Stores unanswered questions in self.questions and stops. Questions come
        from generalized evidence for known facts (knowledge_evidence), with
        high scores, and from undecided evidence scored by the last classifier
        in step 5 (self.evidence).

        Returns nothing: the next pipeline step is the pause.
        """
        logger.debug(u'running generate_questions')
        # first add all evidence, then override scores for fact_evidence.
        self.questions = Knowledge(
            (e, s) for e, s in self.evidence.items() if e not in self.answers)
        self.questions.update((e, s) for e, s in knowledge_evidence.items()
                              if e not in self.answers)

    def filter_evidence(self, _):
        """
        Stage 2.2 of pipeline.

        Build evidence for training the classifiers, from user answers
        (self.answers) and unanswered evidence (self.evidence) with last
        classification score certainty over self.evidence_threshold.

        The argument is ignored (the previous step returns nothing).
        """
        logger.debug(u'running filter_evidence')
        evidence = Knowledge(self.answers)
        n = len(evidence)
        # Confident scores are turned into binary labels.
        evidence.update((e, 1 if score > 0.5 else 0)
                        for e, score in self.evidence.items()
                        if certainty(score) > self.evidence_threshold
                        and e not in self.answers)
        logger.info(u'Filtering returns {} human-built evidences and {} '
                    u'over-threshold evidences'.format(n,
                                                       len(evidence) - n))
        return evidence

    def learn_fact_extractors(self, evidence):
        """
        Stage 3 of pipeline.
        evidence is a Knowledge instance of {evidence: is_good_evidence}

        Returns a dict {relation: classifier, ...}. Relations lacking either
        positive or negative examples get no classifier.
        """
        logger.debug(u'running learn_fact_extractors')
        classifiers = {}
        for rel, k in evidence.per_relation().items():
            yesno = set(k.values())
            # Labels are 0/1, which compare equal to False/True.
            if True not in yesno or False not in yesno:
                logger.warning(u'Not enough evidence to train a fact extractor'
                               u' for the "{}" relation'.format(rel))
                continue  # Not enough data to train a classifier
            assert len(yesno) == 2, "Evidence is not binary!"
            logger.info(u'Training "{}" relation with {} '
                        u'evidences'.format(rel, len(k)))
            classifiers[rel] = self._build_extractor(rel, Knowledge(k))
        return classifiers

    def _build_extractor(self, relation, data):
        """
        Actual invocation of classifier.

        data is the labelled Knowledge to train with. It is modified in place
        when the 'labelspreading' classifier is configured.
        """
        if self.extractor_config['classifier'] == 'labelspreading':
            # semi-supervised learning: add unlabeled data
            data.update((e, -1) for e in self.evidence if e not in data)
        return FactExtractorFactory(self.extractor_config, data)

    def _score_evidence(self, relation, classifier, evidence_list):
        """Given a classifier and a list of evidences, predict if they
        are positive evidences or not.
        Depending on the settings, prediction can be:
            - probabilistic or binary
            - scaled to a range, or not

        Without a classifier every evidence gets 0.5. When scaling is
        enabled the result is always a list with one score per evidence.
        """
        # TODO: Is probably cleaner if this logic is inside FactExtractorFactory
        if classifier:
            method = self.prediction_config['method']
            ps = getattr(classifier, method)(evidence_list)
            # len() instead of truthiness: ps may not support bool().
            if self.prediction_config['scale_to_range'] and len(ps) > 0:
                # scale scores to a given range
                range_min, range_max = sorted(
                    self.prediction_config['scale_to_range'])
                range_delta = range_max - range_min
                max_score = max(ps)
                min_score = min(ps)
                score_range = max_score - min_score
                if score_range:
                    # Build a real list: a lazy map object has no len() and
                    # can only be consumed once.
                    ps = [(x - min_score) * range_delta / score_range
                          + range_min for x in ps]
                else:
                    # All the scores are equal, so a linear rescale would
                    # divide by zero. Constant scores give no ranking, hence
                    # they are placed in the middle of the target range.
                    ps = [range_min + range_delta / 2.0] * len(ps)
        else:
            # There was no evidence to train this classifier
            ps = [0.5] * len(evidence_list)  # Maximum uncertainty
        logger.info(u'Estimated fact manifestation probabilities for {} '
                    u'potential evidences for "{}" '
                    u'relation'.format(len(ps), relation))
        return ps

    def extract_facts(self, classifiers):
        """
        Stage 5 of pipeline.
        classifiers is a dict {relation: classifier, ...}

        Returns a Knowledge with the new score of every evidence in
        self.evidence, which is updated with those scores too.
        """
        # TODO: this probably is smarter as an outer iteration through segments
        # and then an inner iteration over relations
        logger.debug(u'running extract_facts')
        result = Knowledge()

        for r, evidence in self.evidence.per_relation().items():
            # Fix the order so evidences and scores can be paired.
            evidence = list(evidence)
            ps = self._score_evidence(r, classifiers.get(r), evidence)
            result.update(zip(evidence, ps))
        # save scores for later use (e.g. in generate_questions, stage 2.1)
        self.evidence.update(result)
        return result

    def filter_facts(self, facts):
        """
        Stage 6 of pipeline.
        facts is a mapping {evidence: confidence, ...}

        Adds to self.knowledge the evidences scored over self.fact_threshold,
        removes the ones the user answered negatively, and returns
        self.knowledge.
        """
        logger.debug(u'running filter_facts')
        if self.drop_guesses_each_round:
            logger.info(u'Discarding previously auto-accepted evidence.')
            # NOTE(review): this keeps only positive user answers, so seed
            # evidence that was never answered is dropped too -- confirm
            # that is intended.
            self.knowledge = Knowledge(
                (e, answer) for (e, answer) in self.answers.items() if answer)
        n = len(self.knowledge)
        self.knowledge.update(
            (e, s) for e, s in facts.items() if s > self.fact_threshold)
        logger.debug(u'  classifiers accepted {} new evidences'.format(
            len(self.knowledge) - n))
        # unlearn user negative answers:
        m = len(self.knowledge)
        for e, s in self.answers.items():
            if s == 0 and e in self.knowledge:
                del self.knowledge[e]
        logger.debug(
            u'  user answers removed {} evidences'.format(m -
                                                          len(self.knowledge)))

        logger.info(
            u'Learnt {} new evidences this iteration (adding to a total '
            u'of {} evidences)'.format(
                len(self.knowledge) - n, len(self.knowledge)))

        return self.knowledge

    def evaluate(self, knowledge):
        """
        If a gold standard was given, compute precision and recall for current
        knowledge.

        Returns knowledge unchanged, so it can feed the next round.
        """
        if self.gold_standard:
            logger.debug(u'running evaluate')
            result = evaluate(knowledge, self.gold_standard)
            logger.info(u'Precision: {}'.format(result['precision']))
            logger.info(u'Recall: {}'.format(result['recall']))

        return knowledge