Exemple #1
0
    def filter_facts(self, facts):
        """
        Stage 6 of pipeline.
        facts is [((a, b, relation), confidence), ...]
        """
        logger.debug(u'running filter_facts')
        if self.drop_guesses_each_round:
            logger.info(u'Discarding previously auto-accepted evidence.')
            # Rebuild knowledge from scratch, keeping only the evidence the
            # user explicitly confirmed (truthy answers).
            self.knowledge = Knowledge(
                (evidence, label)
                for (evidence, label) in self.answers.items() if label)
        size_at_start = len(self.knowledge)
        # Accept every fact the classifiers scored above the threshold.
        self.knowledge.update(
            (evidence, score) for evidence, score in facts.items()
            if score > self.fact_threshold)
        logger.debug(u'  classifiers accepted {} new evidences'.format(
            len(self.knowledge) - size_at_start))
        # unlearn user negative answers:
        size_before_unlearn = len(self.knowledge)
        rejected = [evidence for evidence, label in self.answers.items()
                    if label == 0 and evidence in self.knowledge]
        for evidence in rejected:
            del self.knowledge[evidence]
        logger.debug(
            u'  user answers removed {} evidences'.format(
                size_before_unlearn - len(self.knowledge)))

        logger.info(
            u'Learnt {} new evidences this iteration (adding to a total '
            u'of {} evidences)'.format(
                len(self.knowledge) - size_at_start, len(self.knowledge)))

        return self.knowledge
Exemple #2
0
def main(options):
    """
    Run k-fold cross-validation of the fact extractor against a gold
    standard, returning (accuracy, precision, recall).

    options is a docopt-style dict with '<dbname>', '<gold_standard>'
    and '--k' keys. Any metric whose denominator is zero is None.
    """
    logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
    connection = db.connect(options['<dbname>'])
    standard = Knowledge.load_from_csv(options['<gold_standard>'])
    logging.info("Loaded %d samples from gold standard", len(standard))
    k = int(options['--k'])

    success = total = 0
    # confusion_matrix[predicted][actual] collects the evidences per outcome
    confusion_matrix = [[[], []], [[], []]]
    logging.info("Splitting into %d subsamples", k)
    for subsample in range(k):
        logging.debug("Subsample = %d", subsample)
        train_data = Knowledge()
        test_data = []
        test_labels = []
        # Every k-th sample (offset by `subsample`) forms the test fold.
        for i, (e, s) in enumerate(standard.items()):
            if i % k == subsample:
                test_data.append(e)
                test_labels.append(int(s))
            else:
                train_data[e] = s
        extractor = FactExtractorFactory(config, train_data)
        prediction = extractor.predict(test_data)
        assert len(prediction) == len(test_data)
        total += len(prediction)
        # Single pass: count hits and fill the confusion matrix together.
        for i, (p, e) in enumerate(zip(prediction, test_labels)):
            if p == e:
                success += 1
            confusion_matrix[p][e].append(test_data[i])
    logging.info("%d values evaluated;", total)
    logging.info("%d accurate predictions (%d negative, %d positive)", success,
                 len(confusion_matrix[0][0]), len(confusion_matrix[1][1]))
    logging.info(
        "%d inaccurate predictions (%d actual positive, %d actual negative)",
        total - success, len(confusion_matrix[0][1]),
        len(confusion_matrix[1][0]))
    for e in confusion_matrix[0][1][:3]:
        logging.info("Predicted negative, actually positive: %s", e)
    for e in confusion_matrix[1][0][:3]:
        logging.info("Predicted positive, actually negative: %s", e)

    # Count directly instead of concatenating lists just to len() them.
    true_positives = len(confusion_matrix[1][1])
    predicted_positives = true_positives + len(confusion_matrix[1][0])
    actual_positives = true_positives + len(confusion_matrix[0][1])
    if predicted_positives:
        precision = true_positives / predicted_positives
    else:
        precision = None
    if actual_positives:
        recall = true_positives / actual_positives
    else:
        recall = None
    # Guard against an empty gold standard (total == 0), mirroring the
    # None convention used for precision/recall.
    accuracy = success / total if total else None
    return accuracy, precision, recall
Exemple #3
0
    def generate_questions(self, knowledge_evidence):
        """
        Stage 2.1 of pipeline.

        Stores unanswered questions in self.questions and stops. Questions come
        from generalized evidence for known facts (knowledge_evidence), with
        high scores, and from undecided evidence scored by the last classifier
        in step 5 (self.evidence).
        """
        logger.debug(u'running generate_questions')
        # first add all evidence, then override scores for fact_evidence.
        pending = Knowledge()
        for evidence, score in self.evidence.items():
            if evidence not in self.answers:
                pending[evidence] = score
        for evidence, score in knowledge_evidence.items():
            if evidence not in self.answers:
                pending[evidence] = score
        self.questions = pending
Exemple #4
0
 def build_training_knowledge(self, relations_dict):
     """Build a Knowledge with `count` evidences per relation name,
     labels alternating True/False as entries are added."""
     knowledge = Knowledge()
     for relation_name, count in relations_dict.items():
         for _ in range(count):
             evidence = self.get_evidence(relation_name)
             # Label alternates: an even-sized knowledge gets True next.
             knowledge[evidence] = (len(knowledge) % 2 == 0)
     return knowledge
Exemple #5
0
    def extract_facts(self, classifiers):
        """
        Stage 5 of pipeline.
        classifiers is a dict {relation: classifier, ...}
        """
        # TODO: this probably is smarter as an outer iteration through segments
        # and then an inner iteration over relations
        logger.debug(u'running extract_facts')
        result = Knowledge()

        for relation, relation_evidence in self.evidence.per_relation().items():
            relation_evidence = list(relation_evidence)
            scores = self._score_evidence(
                relation, classifiers.get(relation), relation_evidence)
            for evidence, score in zip(relation_evidence, scores):
                result[evidence] = score
        # save scores for later use (e.g. in generate_questions, stage 2.1)
        self.evidence.update(result)
        return result
Exemple #6
0
    def filter_evidence(self, _):
        """
        Stage 2.2 of pipeline.

        Build evidence for training the classifiers, from user answers
        (self.answers) and unanswered evidence (self.evidence) with last
        classification score certainty over self.evidence_threshold.
        """
        logger.debug(u'running filter_evidence')
        evidence = Knowledge(self.answers)
        n = len(evidence)
        # Auto-label unanswered evidence whose score is certain enough:
        # scores above 0.5 become positive (1), the rest negative (0).
        # (Conditional expression replaces the fragile `and/or` idiom.)
        evidence.update((e, 1 if score > 0.5 else 0)
                        for e, score in self.evidence.items()
                        if certainty(score) > self.evidence_threshold
                        and e not in self.answers)
        logger.info(u'Filtering returns {} human-built evidences and {} '
                    u'over-threshold evidences'.format(n,
                                                       len(evidence) - n))
        return evidence
Exemple #7
0
 def load_data(self, config):
     """
     Load (or reload) the Knowledge CSV at self.path, caching the result.

     Nothing happens when dbname, path and the configured MD5 all match
     the previously loaded ones. Raises ValueError when the file on disk
     does not match the checksum declared in config[u"input_file_md5"].
     """
     if self.last_dbname != self.dbname or self.last_path != self.path or \
        self.last_hash != config[u"input_file_md5"]:
         # Hash the file first so we fail fast -- and without leaking a
         # file handle or touching cached state -- on checksum mismatch.
         with open(self.path, "rb") as input_file:
             actual_hash = hashlib.md5(input_file.read()).hexdigest()
         if actual_hash != config[u"input_file_md5"]:
             raise ValueError("Configured input file and actual input "
                              "file have different MD5 checksums")
         iepy.db.connect(self.dbname)
         self.data = Knowledge.load_from_csv(self.path)
         # Only commit the cache keys once everything above succeeded.
         self.last_dbname = self.dbname
         self.last_path = self.path
         self.last_hash = actual_hash
Exemple #8
0
    def generalize_knowledge(self, knowledge):
        """
        Stage 1 of pipeline.

        Based on the known facts (knowledge), generates all possible
        evidences of them. The generated evidence is scored using the scores
        given to the facts.
        """
        logger.debug(u'running generalize_knowledge')
        # XXX: there may be several scores for the same fact in knowledge.
        fact_scores = {evidence.fact: score
                       for evidence, score in knowledge.items()}
        knowledge_evidence = Knowledge()
        for candidate in self.evidence:
            if candidate.fact in fact_scores:
                knowledge_evidence[candidate] = fact_scores[candidate.fact]
        logger.info(u'Found {} potential evidences where the known facts could'
                    u' manifest'.format(len(knowledge_evidence)))
        return knowledge_evidence
Exemple #9
0
 def learn_fact_extractors(self, evidence):
     """
     Stage 3 of pipeline.
     evidence is a Knowledge instance of {evidence: is_good_evidence}
     """
     logger.debug(u'running learn_fact_extractors')
     classifiers = {}
     for rel, rel_knowledge in evidence.per_relation().items():
         labels = set(rel_knowledge.values())
         # Training needs at least one positive and one negative example.
         if not (True in labels and False in labels):
             logger.warning(u'Not enough evidence to train a fact extractor'
                            u' for the "{}" relation'.format(rel))
             continue  # Not enough data to train a classifier
         assert len(labels) == 2, "Evidence is not binary!"
         logger.info(u'Training "{}" relation with {} '
                     u'evidences'.format(rel, len(rel_knowledge)))
         classifiers[rel] = self._build_extractor(
             rel, Knowledge(rel_knowledge))
     return classifiers
Exemple #10
0
import logging

from iepy.core import BootstrappedIEPipeline
from iepy import db
from iepy.human_validation import TerminalInterviewer
from iepy.knowledge import Knowledge
from iepy.utils import load_facts_from_csv

if __name__ == u'__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts[u'<dbname>'])
    seed_facts = load_facts_from_csv(opts[u'<seeds_file>'])
    output_file = opts[u'<output_file>']
    gold_standard_file = opts[u'--gold']
    if gold_standard_file:
        gold_standard = Knowledge.load_from_csv(gold_standard_file)
    else:
        gold_standard = None

    p = BootstrappedIEPipeline(connection, seed_facts, gold_standard)

    logging.basicConfig(
        level=logging.DEBUG,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    STOP = u'STOP'

    p.start()  # blocking
    keep_looping = True
    while keep_looping:
        qs = list(p.questions_available())
Exemple #11
0
    def __init__(self,
                 db_connector,
                 seed_facts,
                 gold_standard=None,
                 extractor_config=None,
                 prediction_config=None,
                 evidence_threshold=defaults.evidence_threshold,
                 fact_threshold=defaults.fact_threshold,
                 sort_questions_by=defaults.questions_sorting,
                 drop_guesses_each_round=defaults.drop_guesses_each_round):
        """
        Not blocking.

        db_connector: database access object, used here to enumerate the
            text segments from which candidate evidence is precomputed.
        seed_facts: iterable of facts taken as known-true; each is stored
            as a segment-less Evidence with score 1.
        gold_standard: optional knowledge used by the evaluation step.
        extractor_config / prediction_config: dicts overriding the
            defaults; deep-copied so the originals are never mutated.
        evidence_threshold / fact_threshold: score cut-offs used by the
            filtering pipeline stages.
        sort_questions_by: 'score' or 'certainty'.
        drop_guesses_each_round: when true, auto-accepted knowledge is
            discarded again at each filter_facts round.
        """
        self.db_con = db_connector
        # Seed facts enter the knowledge as Evidence without a segment.
        self.knowledge = Knowledge(
            {Evidence(f, None, None, None): 1
             for f in seed_facts})
        self.evidence_threshold = evidence_threshold
        self.fact_threshold = fact_threshold
        self.questions = Knowledge()
        self.answers = {}
        self.gold_standard = gold_standard
        # deepcopy so later in-place tweaks don't leak into the module
        # defaults or into the caller-provided dicts.
        self.extractor_config = deepcopy(extractor_config
                                         or defaults.extractor_config)
        self.prediction_config = deepcopy(prediction_config
                                          or defaults.prediction_config)
        self.sort_questions_by = sort_questions_by
        self.drop_guesses_each_round = drop_guesses_each_round

        # Ordered pipeline stages; None marks the pause where the user is
        # asked questions. Cycling lets each iteration resume after the
        # previous pause point.
        self.steps = [
            self.generalize_knowledge,  # Step 1
            self.generate_questions,  # Step 2, first half
            None,  # Pause to wait question answers
            self.filter_evidence,  # Step 2, second half
            self.learn_fact_extractors,  # Step 3
            self.extract_facts,  # Step 5
            self.filter_facts,  # Step 6
            self.evaluate  # Optional evaluation step
        ]
        self.step_iterator = itertools.cycle(self.steps)

        # Build relation description: a map from relation labels to pairs of entity kinds
        self.relations = {}
        for e in self.knowledge:
            t1 = e.fact.e1.kind
            t2 = e.fact.e2.kind
            # A relation label must always connect the same pair of kinds.
            if e.fact.relation in self.relations and (
                    t1, t2) != self.relations[e.fact.relation]:
                raise ValueError("Ambiguous kinds for relation %r" %
                                 e.fact.relation)
            self.relations[e.fact.relation] = (t1, t2)
        # Precompute all the evidence that must be classified
        self.evidence = evidence = Knowledge()
        for r, (lkind, rkind) in self.relations.items():
            for segment in self.db_con.segments.segments_with_both_kinds(
                    lkind, rkind):
                for o1, o2 in segment.kind_occurrence_pairs(lkind, rkind):
                    e1 = db.get_entity(segment.entities[o1].kind,
                                       segment.entities[o1].key)
                    e2 = db.get_entity(segment.entities[o2].kind,
                                       segment.entities[o2].key)
                    f = Fact(e1, r, e2)
                    e = Evidence(f, segment, o1, o2)
                    # Every candidate starts maximally uncertain (0.5).
                    evidence[e] = 0.5
Exemple #12
0
class BootstrappedIEPipeline(object):
    """
    Iepy's main class. Implements a bootstrapped information extraction pipeline.

    From the user's point of view this class is meant to be used like this::

        p = BootstrappedIEPipeline(db_connector, seed_facts)
        p.start()  # blocking
        while UserIsNotTired:
            for question in p.questions_available():
                # Ask user
                # ...
                p.add_answer(question, answer)
            p.force_process()
        facts = p.get_facts()  # profit
    """
    def __init__(self,
                 db_connector,
                 seed_facts,
                 gold_standard=None,
                 extractor_config=None,
                 prediction_config=None,
                 evidence_threshold=defaults.evidence_threshold,
                 fact_threshold=defaults.fact_threshold,
                 sort_questions_by=defaults.questions_sorting,
                 drop_guesses_each_round=defaults.drop_guesses_each_round):
        """
        Not blocking.

        seed_facts are stored as segment-less Evidence with score 1;
        extractor_config / prediction_config are deep-copied so the
        defaults and caller dicts are never mutated.
        """
        self.db_con = db_connector
        self.knowledge = Knowledge(
            {Evidence(f, None, None, None): 1
             for f in seed_facts})
        self.evidence_threshold = evidence_threshold
        self.fact_threshold = fact_threshold
        self.questions = Knowledge()
        self.answers = {}
        self.gold_standard = gold_standard
        self.extractor_config = deepcopy(extractor_config
                                         or defaults.extractor_config)
        self.prediction_config = deepcopy(prediction_config
                                          or defaults.prediction_config)
        self.sort_questions_by = sort_questions_by
        self.drop_guesses_each_round = drop_guesses_each_round

        # Ordered pipeline stages; None marks the pause where the user is
        # asked questions. Cycling lets each iteration resume after the
        # previous pause point.
        self.steps = [
            self.generalize_knowledge,  # Step 1
            self.generate_questions,  # Step 2, first half
            None,  # Pause to wait question answers
            self.filter_evidence,  # Step 2, second half
            self.learn_fact_extractors,  # Step 3
            self.extract_facts,  # Step 5
            self.filter_facts,  # Step 6
            self.evaluate  # Optional evaluation step
        ]
        self.step_iterator = itertools.cycle(self.steps)

        # Build relation description: a map from relation labels to pairs of entity kinds
        self.relations = {}
        for e in self.knowledge:
            t1 = e.fact.e1.kind
            t2 = e.fact.e2.kind
            # A relation label must always connect the same pair of kinds.
            if e.fact.relation in self.relations and (
                    t1, t2) != self.relations[e.fact.relation]:
                raise ValueError("Ambiguous kinds for relation %r" %
                                 e.fact.relation)
            self.relations[e.fact.relation] = (t1, t2)
        # Precompute all the evidence that must be classified
        self.evidence = evidence = Knowledge()
        for r, (lkind, rkind) in self.relations.items():
            for segment in self.db_con.segments.segments_with_both_kinds(
                    lkind, rkind):
                for o1, o2 in segment.kind_occurrence_pairs(lkind, rkind):
                    e1 = db.get_entity(segment.entities[o1].kind,
                                       segment.entities[o1].key)
                    e2 = db.get_entity(segment.entities[o2].kind,
                                       segment.entities[o2].key)
                    f = Fact(e1, r, e2)
                    e = Evidence(f, segment, o1, o2)
                    # Every candidate starts maximally uncertain (0.5).
                    evidence[e] = 0.5

    def do_iteration(self, data):
        """Run pipeline steps in order until the next pause point (a None
        step), feeding each step's return value into the next."""
        for step in self.step_iterator:
            if step is None:
                return
            data = step(data)

    ###
    ### IEPY User API
    ###

    def start(self):
        """
        Blocking.
        """
        logger.info(u'Starting pipeline with {} seed '
                    u'facts'.format(len(self.knowledge)))
        self.do_iteration(self.knowledge)

    def questions_available(self):
        """
        Not blocking.
        Returned value won't change until a call to `add_answer` or
        `force_process`.
        If `id` of the returned value hasn't changed the returned value is the
        same.
        The available questions are a list of evidence.
        """
        if self.sort_questions_by == 'score':
            return self.questions.by_score(reverse=True)
        else:
            assert self.sort_questions_by == 'certainty'
            # TODO: Check: latest changes on generate_questions probably demand
            # some extra work on the following line to have back the usual
            # sort by certainty
            return self.questions.by_certainty()

    def add_answer(self, evidence, answer):
        """
        Blocking (potentially).
        After calling this method the values returned by `questions_available`
        and `known_facts` might change.
        """
        self.answers[evidence] = int(answer)

    def force_process(self):
        """
        Blocking.
        After calling this method the values returned by `questions_available`
        and `known_facts` might change.
        """
        self.do_iteration(None)

    def known_facts(self):
        """
        Not blocking.
        Returned value won't change until a call to `add_answer` or
        `force_process`.
        If `len` of the returned value hasn't changed the returned value is the
        same.
        """
        return self.knowledge

    ###
    ### Pipeline steps
    ###

    def generalize_knowledge(self, knowledge):
        """
        Stage 1 of pipeline.

        Based on the known facts (knowledge), generates all possible
        evidences of them. The generated evidence is scored using the scores
        given to the facts.
        """
        logger.debug(u'running generalize_knowledge')
        # XXX: there may be several scores for the same fact in knowledge.
        fact_knowledge = dict((e.fact, s) for e, s in knowledge.items())
        knowledge_evidence = Knowledge((e, fact_knowledge[e.fact])
                                       for e, _ in self.evidence.items()
                                       if e.fact in fact_knowledge)
        logger.info(u'Found {} potential evidences where the known facts could'
                    u' manifest'.format(len(knowledge_evidence)))
        return knowledge_evidence

    def generate_questions(self, knowledge_evidence):
        """
        Stage 2.1 of pipeline.

        Stores unanswered questions in self.questions and stops. Questions come
        from generalized evidence for known facts (knowledge_evidence), with
        high scores, and from undecided evidence scored by the last classifier
        in step 5 (self.evidence).
        """
        logger.debug(u'running generate_questions')
        # first add all evidence, then override scores for fact_evidence.
        self.questions = Knowledge(
            (e, s) for e, s in self.evidence.items() if e not in self.answers)
        self.questions.update((e, s) for e, s in knowledge_evidence.items()
                              if e not in self.answers)

    def filter_evidence(self, _):
        """
        Stage 2.2 of pipeline.

        Build evidence for training the classifiers, from user answers
        (self.answers) and unanswered evidence (self.evidence) with last
        classification score certainty over self.evidence_threshold.
        """
        logger.debug(u'running filter_evidence')
        evidence = Knowledge(self.answers)
        n = len(evidence)
        # Conditional expression instead of the fragile `and/or` idiom.
        evidence.update((e, 1 if score > 0.5 else 0)
                        for e, score in self.evidence.items()
                        if certainty(score) > self.evidence_threshold
                        and e not in self.answers)
        logger.info(u'Filtering returns {} human-built evidences and {} '
                    u'over-threshold evidences'.format(n,
                                                       len(evidence) - n))
        return evidence

    def learn_fact_extractors(self, evidence):
        """
        Stage 3 of pipeline.
        evidence is a Knowledge instance of {evidence: is_good_evidence}
        """
        logger.debug(u'running learn_fact_extractors')
        classifiers = {}
        for rel, k in evidence.per_relation().items():
            yesno = set(k.values())
            if True not in yesno or False not in yesno:
                logger.warning(u'Not enough evidence to train a fact extractor'
                               u' for the "{}" relation'.format(rel))
                continue  # Not enough data to train a classifier
            assert len(yesno) == 2, "Evidence is not binary!"
            logger.info(u'Training "{}" relation with {} '
                        u'evidences'.format(rel, len(k)))
            classifiers[rel] = self._build_extractor(rel, Knowledge(k))
        return classifiers

    def _build_extractor(self, relation, data):
        """Actual invocation of classifier"""
        if self.extractor_config['classifier'] == 'labelspreading':
            # semi-supervised learning: add unlabeled data
            data.update((e, -1) for e in self.evidence if e not in data)
        return FactExtractorFactory(self.extractor_config, data)

    def _score_evidence(self, relation, classifier, evidence_list):
        """Given a classifier and a list of evidences, predict if they
        are positive evidences or not.
        Depending on the settings, prediction can be:
            - probabilistic or binary
            - scaled to a range, or not
        """
        # TODO: Is probably cleaner if this logic is inside FactExtractorFactory
        if classifier:
            method = self.prediction_config['method']
            ps = getattr(classifier, method)(evidence_list)
            if self.prediction_config['scale_to_range']:
                # scale scores to a given range
                range_min, range_max = sorted(
                    self.prediction_config['scale_to_range'])
                range_delta = range_max - range_min
                max_score = max(ps)
                min_score = min(ps)
                score_range = max_score - min_score
                if score_range:
                    # Materialize a list (not a lazy map object) so the
                    # len() in the log line below works on Python 3 too.
                    ps = [(x - min_score) * range_delta / score_range +
                          range_min for x in ps]
                else:
                    # All scores are identical: scaling is undefined, so
                    # pin everything to the bottom of the requested range
                    # instead of dividing by zero.
                    ps = [range_min] * len(ps)
        else:
            # There was no evidence to train this classifier
            ps = [0.5] * len(evidence_list)  # Maximum uncertainty
        logger.info(u'Estimated fact manifestation probabilities for {} '
                    u'potential evidences for "{}" '
                    u'relation'.format(len(ps), relation))
        return ps

    def extract_facts(self, classifiers):
        """
        Stage 5 of pipeline.
        classifiers is a dict {relation: classifier, ...}
        """
        # TODO: this probably is smarter as an outer iteration through segments
        # and then an inner iteration over relations
        logger.debug(u'running extract_facts')
        result = Knowledge()

        for r, evidence in self.evidence.per_relation().items():
            evidence = list(evidence)
            ps = self._score_evidence(r, classifiers.get(r, None), evidence)
            result.update(zip(evidence, ps))
        # save scores for later use (e.g. in generate_questions, stage 2.1)
        self.evidence.update(result)
        return result

    def filter_facts(self, facts):
        """
        Stage 6 of pipeline.
        facts is [((a, b, relation), confidence), ...]
        """
        logger.debug(u'running filter_facts')
        if self.drop_guesses_each_round:
            logger.info(u'Discarding previously auto-accepted evidence.')
            self.knowledge = Knowledge(
                (e, answer) for (e, answer) in self.answers.items() if answer)
        n = len(self.knowledge)
        self.knowledge.update(
            (e, s) for e, s in facts.items() if s > self.fact_threshold)
        logger.debug(u'  classifiers accepted {} new evidences'.format(
            len(self.knowledge) - n))
        # unlearn user negative answers:
        m = len(self.knowledge)
        for e, s in self.answers.items():
            if s == 0 and e in self.knowledge:
                del self.knowledge[e]
        logger.debug(
            u'  user answers removed {} evidences'.format(m -
                                                          len(self.knowledge)))

        logger.info(
            u'Learnt {} new evidences this iteration (adding to a total '
            u'of {} evidences)'.format(
                len(self.knowledge) - n, len(self.knowledge)))

        return self.knowledge

    def evaluate(self, knowledge):
        """
        If a gold standard was given, compute precision and recall for current
        knowledge.
        """
        if self.gold_standard:
            logger.debug(u'running evaluate')
            result = evaluate(knowledge, self.gold_standard)
            logger.info(u'Precision: {}'.format(result['precision']))
            logger.info(u'Recall: {}'.format(result['recall']))

        return knowledge
Exemple #13
0
IEPY's seed generation utility.

Usage:
    generate_seeds.py <dbname> <relation_name> <kind_a> <kind_b> <output_filename>
    generate_seeds.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
from docopt import docopt

from iepy.db import connect
from iepy.human_validation import human_oracle
from iepy.knowledge import Knowledge
from iepy.utils import save_facts_to_csv

if __name__ == u'__main__':
    # Parse command-line arguments and open the target database.
    arguments = docopt(__doc__, version=0.1)
    connect(arguments[u'<dbname>'])

    relation = arguments[u'<relation_name>']
    left_kind = arguments[u'<kind_a>']
    right_kind = arguments[u'<kind_b>']
    destination = arguments[u'<output_filename>']

    # Interactively collect labeled evidence from the human oracle.
    knowledge = Knowledge()
    knowledge.extend_from_oracle(left_kind, right_kind, relation, human_oracle)
    # Keep only the facts backing positively-labeled evidence.
    positive_facts = {ev.fact for (ev, label) in knowledge.items()
                      if label == 1}
    save_facts_to_csv(sorted(positive_facts), destination)
Exemple #14
0
"""
IEPY's result evaluator w.r.t. a reference corpus.

Usage:
    eval.py <dbname> <proposed_csv> <reference_csv>
    eval.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
from docopt import docopt

from iepy.db import connect
from iepy.knowledge import Knowledge
from iepy.utils import evaluate

if __name__ == '__main__':
    # Compare a proposed extraction CSV against a reference (gold) CSV
    # and report precision and recall.
    arguments = docopt(__doc__, version=0.1)
    _connection = connect(arguments['<dbname>'])

    proposed_knowledge = Knowledge.load_from_csv(arguments['<proposed_csv>'])
    reference_knowledge = Knowledge.load_from_csv(arguments['<reference_csv>'])
    scores = evaluate(proposed_knowledge, reference_knowledge)

    print("Precision: %.2f" % scores['precision'])
    print("Recall: %.2f" % scores['recall'])
Exemple #15
0
  --version             Version number
  --with-score          Shows colored scores
  --with-line-number    Shows each item numbered sequentially
"""
from docopt import docopt

from colorama import Back, Style

from iepy import db
from iepy.knowledge import Knowledge

if __name__ == '__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts['<dbname>'])
    csv_file = opts['<csv_file>']
    evidence = Knowledge.load_from_csv(csv_file)

    for nr, (e, score) in enumerate(evidence.items()):
        fact = e.colored_fact()
        fact_line = []
        if opts['--with-line-number']:
            fact_line.append(str(nr + 1))
        if opts['--with-score']:
            if score == 0:
                score_color = Back.YELLOW
            elif score == 1:
                score_color = Back.MAGENTA
            else:
                score_color = Back.CYAN
            colored_score = u''.join(
                [score_color, str(score), Style.RESET_ALL])