def main(options):
    logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
    connection = db.connect(options['<dbname>'])
    standard = Knowledge.load_from_csv(options['<gold_standard>'])
    logging.info("Loaded %d samples from gold standard", len(standard))
    k = int(options['--k'])
    success = total = 0
    # confusion_matrix[predicted][actual] accumulates the evidences per cell
    confusion_matrix = [[[], []], [[], []]]
    logging.info("Splitting into %d subsamples", k)
    for subsample in range(k):
        logging.debug("Subsample = %d", subsample)
        train_data = Knowledge()
        test_data = []
        test_labels = []
        for i, (e, s) in enumerate(standard.items()):
            if i % k == subsample:
                test_data.append(e)
                test_labels.append(int(s))
            else:
                train_data[e] = s
        # `config` (the extractor configuration) is expected to be defined at
        # module level in the surrounding script; it is not shown here.
        extractor = FactExtractorFactory(config, train_data)
        prediction = extractor.predict(test_data)
        assert len(prediction) == len(test_data)
        total += len(prediction)
        success += sum(1 for (p, e) in zip(prediction, test_labels) if p == e)
        for i, (p, e) in enumerate(zip(prediction, test_labels)):
            confusion_matrix[p][e].append(test_data[i])
    logging.info("%d values evaluated", total)
    logging.info("%d accurate predictions (%d negative, %d positive)",
                 success,
                 len(confusion_matrix[0][0]), len(confusion_matrix[1][1]))
    logging.info(
        "%d inaccurate predictions (%d actual positive, %d actual negative)",
        total - success,
        len(confusion_matrix[0][1]), len(confusion_matrix[1][0]))
    for e in confusion_matrix[0][1][:3]:
        logging.info("Predicted negative, actually positive: %s", e)
    for e in confusion_matrix[1][0][:3]:
        logging.info("Predicted positive, actually negative: %s", e)
    try:
        precision = len(confusion_matrix[1][1]) / len(confusion_matrix[1][0] +
                                                      confusion_matrix[1][1])
    except ZeroDivisionError:
        precision = None
    try:
        recall = len(confusion_matrix[1][1]) / len(confusion_matrix[0][1] +
                                                   confusion_matrix[1][1])
    except ZeroDivisionError:
        recall = None
    accuracy = success / total
    return accuracy, precision, recall
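The split above assigns item i to test fold i % k and trains on the remaining items. A minimal, self-contained sketch of that round-robin partitioning (the helper name round_robin_split is ours, not part of iepy):

def round_robin_split(items, k, subsample):
    """Return (train, test): every k-th item, offset by `subsample`, goes to
    the test split; the rest go to the train split."""
    train, test = [], []
    for i, item in enumerate(items):
        (test if i % k == subsample else train).append(item)
    return train, test


# Example: 10 labelled items, 5 folds; fold 0 takes items 0 and 5.
items = list(range(10))
train, test = round_robin_split(items, k=5, subsample=0)
assert test == [0, 5]
assert len(train) + len(test) == len(items)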
def build_training_knowledge(self, relations_dict):
    k = Knowledge()
    for relation_name, number in relations_dict.items():
        for i in range(number):
            ev = self.get_evidence(relation_name)
            # Alternate positive/negative labels so both classes are present
            k[ev] = (len(k) % 2 == 0)
    return k
def load_data(self, config):
    if self.last_dbname != self.dbname or self.last_path != self.path or \
            self.last_hash != config[u"input_file_md5"]:
        iepy.db.connect(self.dbname)
        data = Knowledge.load_from_csv(self.path)
        self.last_dbname = self.dbname
        self.last_path = self.path
        hasher = hashlib.md5(open(self.path, "rb").read())
        self.last_hash = hasher.hexdigest()
        if self.last_hash != config[u"input_file_md5"]:
            raise ValueError("Configured input file and actual input "
                             "file have different MD5 checksums")
        self.data = data
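The guard above only reloads the CSV when the database name, the path, or the file contents change, and it rejects an input file whose checksum no longer matches the configuration. A self-contained sketch of just the checksum check, using nothing beyond the standard library (the name check_input_md5 is illustrative):

import hashlib


def check_input_md5(path, expected_md5):
    """Raise ValueError if the file at `path` does not match the MD5
    recorded in the experiment configuration; return the actual digest."""
    with open(path, "rb") as f:
        actual_md5 = hashlib.md5(f.read()).hexdigest()
    if actual_md5 != expected_md5:
        raise ValueError("Configured input file and actual input file "
                         "have different MD5 checksums")
    return actual_md5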
import logging

from docopt import docopt

from iepy.core import BootstrappedIEPipeline
from iepy import db
from iepy.human_validation import TerminalInterviewer
from iepy.knowledge import Knowledge
from iepy.utils import load_facts_from_csv


if __name__ == u'__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts[u'<dbname>'])
    seed_facts = load_facts_from_csv(opts[u'<seeds_file>'])
    output_file = opts[u'<output_file>']
    gold_standard_file = opts[u'--gold']
    if gold_standard_file:
        gold_standard = Knowledge.load_from_csv(gold_standard_file)
    else:
        gold_standard = None
    p = BootstrappedIEPipeline(connection, seed_facts, gold_standard)

    logging.basicConfig(
        level=logging.DEBUG,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    STOP = u'STOP'

    p.start()  # blocking
    keep_looping = True
    while keep_looping:
        qs = list(p.questions_available())
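The runner is cut off inside its answer loop and the original continuation is not shown here. Purely as an illustrative sketch of how such a loop can drive the pipeline using only the API documented on BootstrappedIEPipeline, with a hypothetical ask_user callable standing in for the real terminal interview:

def answer_loop(pipeline, ask_user, stop_token=u'STOP'):
    """Drive the bootstrapped pipeline until the user stops.

    `ask_user` is a hypothetical callable taking one evidence question and
    returning 1 (yes), 0 (no), or `stop_token` to quit early.
    """
    keep_looping = True
    while keep_looping:
        questions = list(pipeline.questions_available())
        if not questions:
            break
        for question in questions:
            answer = ask_user(question)
            if answer == stop_token:
                keep_looping = False
                break
            pipeline.add_answer(question, answer)
        pipeline.force_process()
    return pipeline.known_facts()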
class BootstrappedIEPipeline(object):
    """
    IEPY's main class. Implements a bootstrapped information extraction
    pipeline.

    From the user's point of view this class is meant to be used like this::

        p = BootstrappedIEPipeline(db_connector, seed_facts)
        p.start()  # blocking
        while UserIsNotTired:
            for question in p.questions_available():
                # Ask user
                # ...
                p.add_answer(question, answer)
            p.force_process()
        facts = p.known_facts()  # profit
    """

    def __init__(self, db_connector, seed_facts, gold_standard=None,
                 extractor_config=None, prediction_config=None,
                 evidence_threshold=defaults.evidence_threshold,
                 fact_threshold=defaults.fact_threshold,
                 sort_questions_by=defaults.questions_sorting,
                 drop_guesses_each_round=defaults.drop_guesses_each_round):
        """
        Not blocking.
        """
        self.db_con = db_connector
        self.knowledge = Knowledge(
            {Evidence(f, None, None, None): 1 for f in seed_facts})
        self.evidence_threshold = evidence_threshold
        self.fact_threshold = fact_threshold
        self.questions = Knowledge()
        self.answers = {}
        self.gold_standard = gold_standard
        self.extractor_config = deepcopy(extractor_config or
                                         defaults.extractor_config)
        self.prediction_config = deepcopy(prediction_config or
                                          defaults.prediction_config)
        self.sort_questions_by = sort_questions_by
        self.drop_guesses_each_round = drop_guesses_each_round

        self.steps = [
            self.generalize_knowledge,   # Step 1
            self.generate_questions,     # Step 2, first half
            None,                        # Pause to wait question answers
            self.filter_evidence,        # Step 2, second half
            self.learn_fact_extractors,  # Step 3
            self.extract_facts,          # Step 5
            self.filter_facts,           # Step 6
            self.evaluate                # Optional evaluation step
        ]
        self.step_iterator = itertools.cycle(self.steps)

        # Build relation description: a map from relation labels to pairs of
        # entity kinds
        self.relations = {}
        for e in self.knowledge:
            t1 = e.fact.e1.kind
            t2 = e.fact.e2.kind
            if e.fact.relation in self.relations and \
                    (t1, t2) != self.relations[e.fact.relation]:
                raise ValueError("Ambiguous kinds for relation %r" %
                                 e.fact.relation)
            self.relations[e.fact.relation] = (t1, t2)

        # Precompute all the evidence that must be classified
        self.evidence = evidence = Knowledge()
        for r, (lkind, rkind) in self.relations.items():
            for segment in self.db_con.segments.segments_with_both_kinds(
                    lkind, rkind):
                for o1, o2 in segment.kind_occurrence_pairs(lkind, rkind):
                    e1 = db.get_entity(segment.entities[o1].kind,
                                       segment.entities[o1].key)
                    e2 = db.get_entity(segment.entities[o2].kind,
                                       segment.entities[o2].key)
                    f = Fact(e1, r, e2)
                    e = Evidence(f, segment, o1, o2)
                    evidence[e] = 0.5

    def do_iteration(self, data):
        for step in self.step_iterator:
            if step is None:
                return
            data = step(data)

    ###
    ### IEPY User API
    ###

    def start(self):
        """
        Blocking.
        """
        logger.info(u'Starting pipeline with {} seed '
                    u'facts'.format(len(self.knowledge)))
        self.do_iteration(self.knowledge)

    def questions_available(self):
        """
        Not blocking.
        Returned value won't change until a call to `add_answer` or
        `force_process`.
        If `id` of the returned value hasn't changed the returned value is
        the same.
        The available questions are a list of evidence.
        """
        if self.sort_questions_by == 'score':
            return self.questions.by_score(reverse=True)
        else:
            assert self.sort_questions_by == 'certainty'
            # TODO: Check: latest changes on generate_questions probably
            # demand some extra work on the following line to have back the
            # usual sort by certainty
            return self.questions.by_certainty()

    def add_answer(self, evidence, answer):
        """
        Blocking (potentially).
        After calling this method the values returned by
        `questions_available` and `known_facts` might change.
        """
        self.answers[evidence] = int(answer)

    def force_process(self):
        """
        Blocking.
        After calling this method the values returned by
        `questions_available` and `known_facts` might change.
        """
        self.do_iteration(None)

    def known_facts(self):
        """
        Not blocking.
        Returned value won't change until a call to `add_answer` or
        `force_process`.
        If `len` of the returned value hasn't changed the returned value is
        the same.
        """
        return self.knowledge

    ###
    ### Pipeline steps
    ###

    def generalize_knowledge(self, knowledge):
        """
        Stage 1 of pipeline.

        Based on the known facts (knowledge), generates all possible
        evidences of them. The generated evidence is scored using the scores
        given to the facts.
        """
        logger.debug(u'running generalize_knowledge')
        # XXX: there may be several scores for the same fact in knowledge.
        fact_knowledge = dict((e.fact, s) for e, s in knowledge.items())
        knowledge_evidence = Knowledge((e, fact_knowledge[e.fact])
                                       for e, _ in self.evidence.items()
                                       if e.fact in fact_knowledge)
        logger.info(u'Found {} potential evidences where the known facts '
                    u'could manifest'.format(len(knowledge_evidence)))
        return knowledge_evidence

    def generate_questions(self, knowledge_evidence):
        """
        Stage 2.1 of pipeline.

        Stores unanswered questions in self.questions and stops. Questions
        come from generalized evidence for known facts (knowledge_evidence),
        with high scores, and from undecided evidence scored by the last
        classifier in step 5 (self.evidence).
        """
        logger.debug(u'running generate_questions')
        # first add all evidence, then override scores for knowledge_evidence
        self.questions = Knowledge((e, s) for e, s in self.evidence.items()
                                   if e not in self.answers)
        self.questions.update((e, s) for e, s in knowledge_evidence.items()
                              if e not in self.answers)

    def filter_evidence(self, _):
        """
        Stage 2.2 of pipeline.

        Build evidence for training the classifiers, from user answers
        (self.answers) and unanswered evidence (self.evidence) with last
        classification score certainty over self.evidence_threshold.
        """
        logger.debug(u'running filter_evidence')
        evidence = Knowledge(self.answers)
        n = len(evidence)
        evidence.update(
            (e, score > 0.5 and 1 or 0)
            for e, score in self.evidence.items()
            if certainty(score) > self.evidence_threshold and
            e not in self.answers)
        logger.info(u'Filtering returns {} human-built evidences and {} '
                    u'over-threshold evidences'.format(n, len(evidence) - n))
        return evidence

    def learn_fact_extractors(self, evidence):
        """
        Stage 3 of pipeline.

        evidence is a Knowledge instance of {evidence: is_good_evidence}
        """
        logger.debug(u'running learn_fact_extractors')
        classifiers = {}
        for rel, k in evidence.per_relation().items():
            yesno = set(k.values())
            if True not in yesno or False not in yesno:
                logger.warning(u'Not enough evidence to train a fact '
                               u'extractor for the "{}" relation'.format(rel))
                continue  # Not enough data to train a classifier
            assert len(yesno) == 2, "Evidence is not binary!"
            logger.info(u'Training "{}" relation with {} '
                        u'evidences'.format(rel, len(k)))
            classifiers[rel] = self._build_extractor(rel, Knowledge(k))
        return classifiers

    def _build_extractor(self, relation, data):
        """Actual invocation of classifier"""
        if self.extractor_config['classifier'] == 'labelspreading':
            # semi-supervised learning: add unlabeled data
            data.update((e, -1) for e in self.evidence if e not in data)
        return FactExtractorFactory(self.extractor_config, data)

    def _score_evidence(self, relation, classifier, evidence_list):
        """Given a classifier and a list of evidences, predict if they are
        positive evidences or not.

        Depending on the settings, prediction can be:
            - probabilistic or binary
            - scaled to a range, or not
        """
        # TODO: Is probably cleaner if this logic is inside
        # FactExtractorFactory
        if classifier:
            method = self.prediction_config['method']
            ps = getattr(classifier, method)(evidence_list)
            if self.prediction_config['scale_to_range']:
                # scale scores to a given range
                range_min, range_max = sorted(
                    self.prediction_config['scale_to_range'])
                range_delta = range_max - range_min
                max_score = max(ps)
                min_score = min(ps)
                score_range = max_score - min_score
                scale = lambda x: ((x - min_score) * range_delta /
                                   score_range + range_min)
                # materialize as a list so len(ps) below works on Python 3
                ps = [scale(x) for x in ps]
        else:
            # There was no evidence to train this classifier
            ps = [0.5] * len(evidence_list)  # Maximum uncertainty
        logger.info(u'Estimated fact manifestation probabilities for {} '
                    u'potential evidences for "{}" '
                    u'relation'.format(len(ps), relation))
        return ps

    def extract_facts(self, classifiers):
        """
        Stage 5 of pipeline.

        classifiers is a dict {relation: classifier, ...}
        """
        # TODO: this probably is smarter as an outer iteration through
        # segments and then an inner iteration over relations
        logger.debug(u'running extract_facts')
        result = Knowledge()

        for r, evidence in self.evidence.per_relation().items():
            evidence = list(evidence)
            ps = self._score_evidence(r, classifiers.get(r, None), evidence)
            result.update(zip(evidence, ps))

        # save scores for later use (e.g. in generate_questions, stage 2.1)
        self.evidence.update(result)
        return result

    def filter_facts(self, facts):
        """
        Stage 6 of pipeline.

        facts is [((a, b, relation), confidence), ...]
        """
        logger.debug(u'running filter_facts')
        if self.drop_guesses_each_round:
            logger.info(u'Discarding previously auto-accepted evidence.')
            self.knowledge = Knowledge(
                (e, answer) for (e, answer) in self.answers.items() if answer)
        n = len(self.knowledge)
        self.knowledge.update((e, s) for e, s in facts.items()
                              if s > self.fact_threshold)
        logger.debug(u' classifiers accepted {} new evidences'.format(
            len(self.knowledge) - n))
        # unlearn user negative answers:
        m = len(self.knowledge)
        for e, s in self.answers.items():
            if s == 0 and e in self.knowledge:
                del self.knowledge[e]
        logger.debug(u' user answers removed {} evidences'.format(
            m - len(self.knowledge)))
        logger.info(u'Learnt {} new evidences this iteration (adding to a '
                    u'total of {} evidences)'.format(
                        len(self.knowledge) - n, len(self.knowledge)))
        return self.knowledge

    def evaluate(self, knowledge):
        """
        If a gold standard was given, compute precision and recall for
        current knowledge.
        """
        if self.gold_standard:
            logger.debug(u'running evaluate')
            result = evaluate(knowledge, self.gold_standard)
            logger.info(u'Precision: {}'.format(result['precision']))
            logger.info(u'Recall: {}'.format(result['recall']))
        return knowledge
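Two of the numeric details above can be isolated for clarity. First, the scale_to_range branch of _score_evidence is a plain min-max rescaling; a standalone sketch (the scale_scores helper is ours, for illustration only):

def scale_scores(scores, range_min, range_max):
    """Linearly map `scores` onto [range_min, range_max] (min-max scaling).

    Assumes at least two distinct score values; a constant score list would
    divide by zero, just as in the pipeline code above.
    """
    lo, hi = min(scores), max(scores)
    span = hi - lo
    delta = range_max - range_min
    return [(s - lo) * delta / span + range_min for s in scores]


# Example: raw classifier scores mapped onto [0.1, 0.9]
print(scale_scores([0.0, 0.5, 1.0], 0.1, 0.9))  # -> [0.1, 0.5, 0.9]

Second, the evaluate step (and the eval.py script below) relies on iepy.utils.evaluate, which is not shown here. As a rough sketch of what a precision/recall computation over proposed and reference facts can look like, assuming facts are hashable (this is our illustration, not iepy's actual implementation):

def precision_recall(proposed, reference):
    """Compute precision and recall of `proposed` w.r.t. `reference`,
    treating both as sets of facts."""
    proposed, reference = set(proposed), set(reference)
    true_positives = len(proposed & reference)
    precision = true_positives / float(len(proposed)) if proposed else None
    recall = true_positives / float(len(reference)) if reference else None
    return {'precision': precision, 'recall': recall}


# Example with plain tuples standing in for facts:
print(precision_recall({('a', 'rel', 'b'), ('c', 'rel', 'd')},
                       {('a', 'rel', 'b')}))
# -> {'precision': 0.5, 'recall': 1.0}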
"""
IEPY's seed generation utility.

Usage:
    generate_seeds.py <dbname> <relation_name> <kind_a> <kind_b> <output_filename>
    generate_seeds.py -h | --help | --version

Options:
  -h --help     Show this screen
  --version     Version number
"""
from docopt import docopt

from iepy.db import connect
from iepy.human_validation import human_oracle
from iepy.knowledge import Knowledge
from iepy.utils import save_facts_to_csv


if __name__ == u'__main__':
    opts = docopt(__doc__, version=0.1)
    connect(opts[u'<dbname>'])
    relation_name = opts[u'<relation_name>']
    kind_a = opts[u'<kind_a>']
    kind_b = opts[u'<kind_b>']
    output_filename = opts[u'<output_filename>']

    kn = Knowledge()
    kn.extend_from_oracle(kind_a, kind_b, relation_name, human_oracle)

    facts = set([ev.fact for (ev, value) in kn.items() if value == 1])
    save_facts_to_csv(sorted(facts), output_filename)


"""
IEPY's result evaluator w.r.t. a reference corpus.

Usage:
    eval.py <dbname> <proposed_csv> <reference_csv>
    eval.py -h | --help | --version

Options:
  -h --help     Show this screen
  --version     Version number
"""
from docopt import docopt

from iepy.db import connect
from iepy.knowledge import Knowledge
from iepy.utils import evaluate


if __name__ == '__main__':
    opts = docopt(__doc__, version=0.1)
    connector = connect(opts['<dbname>'])
    proposed_csv = opts['<proposed_csv>']
    reference_csv = opts['<reference_csv>']

    proposed = Knowledge.load_from_csv(proposed_csv)
    reference = Knowledge.load_from_csv(reference_csv)

    result = evaluate(proposed, reference)
    print("Precision: %.2f" % result['precision'])
    print("Recall: %.2f" % result['recall'])
  --version              Version number
  --with-score           Shows colored scores
  --with-line-number     Shows each item numbered sequentially
"""
from docopt import docopt
from colorama import Back, Style

from iepy import db
from iepy.knowledge import Knowledge


if __name__ == '__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts['<dbname>'])
    csv_file = opts['<csv_file>']

    evidence = Knowledge.load_from_csv(csv_file)
    for nr, (e, score) in enumerate(evidence.items()):
        fact = e.colored_fact()
        fact_line = []
        if opts['--with-line-number']:
            fact_line.append(str(nr + 1))
        if opts['--with-score']:
            if score == 0:
                score_color = Back.YELLOW
            elif score == 1:
                score_color = Back.MAGENTA
            else:
                score_color = Back.CYAN
            colored_score = u''.join(
                [score_color, str(score), Style.RESET_ALL])