import logging
import sys

from iepy import db
from iepy.fact_extractor import FactExtractorFactory
from iepy.knowledge import Knowledge


def main(options):
    # `config` (the extractor configuration) is expected to be provided by
    # the surrounding script; it is not defined in this excerpt.
    logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
    connection = db.connect(options['<dbname>'])
    standard = Knowledge.load_from_csv(options['<gold_standard>'])
    logging.info("Loaded %d samples from gold standard", len(standard))
    k = int(options['--k'])
    success = total = 0
    # confusion_matrix[predicted][actual]: each cell collects the evidence
    # items that fell into it, so the counts are just the cell lengths.
    confusion_matrix = [[[], []], [[], []]]
    logging.info("Splitting into %d subsamples", k)
    for subsample in range(k):
        logging.debug("Subsample = %d", subsample)
        # k-fold split: every k-th sample goes to the test set,
        # the rest is used for training.
        train_data = Knowledge()
        test_data = []
        test_labels = []
        for i, (e, s) in enumerate(standard.items()):
            if i % k == subsample:
                test_data.append(e)
                test_labels.append(int(s))
            else:
                train_data[e] = s
        extractor = FactExtractorFactory(config, train_data)
        prediction = extractor.predict(test_data)
        assert len(prediction) == len(test_data)
        total += len(prediction)
        success += sum(1 for (p, e) in zip(prediction, test_labels) if p == e)
        for i, (p, e) in enumerate(zip(prediction, test_labels)):
            confusion_matrix[p][e].append(test_data[i])
    logging.info("%d values evaluated", total)
    logging.info("%d accurate predictions (%d negative, %d positive)",
                 success,
                 len(confusion_matrix[0][0]),
                 len(confusion_matrix[1][1]))
    logging.info(
        "%d inaccurate predictions (%d actual positive, %d actual negative)",
        total - success,
        len(confusion_matrix[0][1]),
        len(confusion_matrix[1][0]))
    # Show a few errors of each kind for manual inspection.
    for e in confusion_matrix[0][1][:3]:
        logging.info("Predicted negative, actually positive: %s", e)
    for e in confusion_matrix[1][0][:3]:
        logging.info("Predicted positive, actually negative: %s", e)
    # precision = TP / (TP + FP); recall = TP / (TP + FN)
    try:
        precision = len(confusion_matrix[1][1]) / \
            len(confusion_matrix[1][0] + confusion_matrix[1][1])
    except ZeroDivisionError:
        precision = None
    try:
        recall = len(confusion_matrix[1][1]) / \
            len(confusion_matrix[0][1] + confusion_matrix[1][1])
    except ZeroDivisionError:
        recall = None
    accuracy = success / total
    return accuracy, precision, recall
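# --- Illustration (not part of the script above) ---------------------------
# A minimal, self-contained sketch of the confusion_matrix[predicted][actual]
# convention used by main(), on toy data, showing how precision and recall
# fall out of the cell sizes. All names here are local to the example.
predictions = [1, 1, 0, 0, 1]
labels = [1, 0, 0, 1, 1]
matrix = [[[], []], [[], []]]
for i, (p, e) in enumerate(zip(predictions, labels)):
    matrix[p][e].append(i)      # matrix[predicted][actual]
tp = len(matrix[1][1])          # predicted positive, actually positive
fp = len(matrix[1][0])          # predicted positive, actually negative
fn = len(matrix[0][1])          # predicted negative, actually positive
print(tp / (tp + fp))           # precision: 2 / 3
print(tp / (tp + fn))           # recall:    2 / 3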
def load_data(self, config):
    # Reload only when the database, the input path, or the configured
    # checksum changed since the last call.
    if self.last_dbname != self.dbname or self.last_path != self.path or \
            self.last_hash != config[u"input_file_md5"]:
        iepy.db.connect(self.dbname)
        data = Knowledge.load_from_csv(self.path)
        self.last_dbname = self.dbname
        self.last_path = self.path
        with open(self.path, "rb") as f:
            hasher = hashlib.md5(f.read())
        self.last_hash = hasher.hexdigest()
        if self.last_hash != config[u"input_file_md5"]:
            raise ValueError("Configured input file and actual input "
                             "file have different MD5 checksums")
        self.data = data
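# --- Illustration (not part of the method above) ---------------------------
# The same MD5 check in isolation, as a standard-library sketch. Reading in
# chunks avoids holding the whole file in memory; file_md5 and "input.csv"
# are hypothetical names used only for this example.
import hashlib

def file_md5(path, chunk_size=2 ** 20):
    hasher = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

# load_data would then compare file_md5("input.csv") against the
# config[u"input_file_md5"] value.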
import logging

from docopt import docopt

from iepy.core import BootstrappedIEPipeline
from iepy import db
from iepy.human_validation import TerminalInterviewer
from iepy.knowledge import Knowledge
from iepy.utils import load_facts_from_csv


if __name__ == u'__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts[u'<dbname>'])
    seed_facts = load_facts_from_csv(opts[u'<seeds_file>'])
    output_file = opts[u'<output_file>']
    gold_standard_file = opts[u'--gold']
    if gold_standard_file:
        gold_standard = Knowledge.load_from_csv(gold_standard_file)
    else:
        gold_standard = None

    p = BootstrappedIEPipeline(connection, seed_facts, gold_standard)

    logging.basicConfig(
        level=logging.DEBUG,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    STOP = u'STOP'

    p.start()  # blocking
    keep_looping = True
    while keep_looping:
        qs = list(p.questions_available())
""" IEPY's result evaluator w.r.t. a reference corpus. Usage: eval.py <dbname> <proposed_csv> <reference_csv> eval.py -h | --help | --version Options: -h --help Show this screen --version Version number """ from docopt import docopt from iepy.db import connect from iepy.knowledge import Knowledge from iepy.utils import evaluate if __name__ == '__main__': opts = docopt(__doc__, version=0.1) connector = connect(opts['<dbname>']) proposed_csv = opts['<proposed_csv>'] reference_csv = opts['<reference_csv>'] proposed = Knowledge.load_from_csv(proposed_csv) reference = Knowledge.load_from_csv(reference_csv) result = evaluate(proposed, reference) print("Precision: %.2f" % result['precision']) print("Recall: %.2f" % result['recall'])
  --version             Version number
  --with-score          Shows colored scores
  --with-line-number    Shows each item numbered sequentially
"""
from docopt import docopt
from colorama import Back, Style

from iepy import db
from iepy.knowledge import Knowledge


if __name__ == '__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts['<dbname>'])
    csv_file = opts['<csv_file>']
    evidence = Knowledge.load_from_csv(csv_file)
    for nr, (e, score) in enumerate(evidence.items()):
        fact = e.colored_fact()
        fact_line = []
        if opts['--with-line-number']:
            fact_line.append(str(nr + 1))
        if opts['--with-score']:
            if score == 0:
                score_color = Back.YELLOW
            elif score == 1:
                score_color = Back.MAGENTA
            else:
                score_color = Back.CYAN
            colored_score = u''.join(
                [score_color, str(score), Style.RESET_ALL])
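# --- Illustration (not part of the script above) ---------------------------
# Toy demonstration of the score -> background-color mapping used above;
# colored() is a hypothetical helper written only for this example.
from colorama import Back, Style

def colored(score):
    color = {0: Back.YELLOW, 1: Back.MAGENTA}.get(score, Back.CYAN)
    return u''.join([color, str(score), Style.RESET_ALL])

for s in (0, 0.5, 1):
    print(colored(s))  # 0 -> yellow background, 1 -> magenta, others -> cyan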