Example #1
def main(options):
    logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
    connection = db.connect(options['<dbname>'])
    standard = Knowledge.load_from_csv(options['<gold_standard>'])
    logging.info("Loaded %d samples from gold standard", len(standard))
    k = int(options['--k'])

    success = total = 0
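    # confusion_matrix[predicted][actual] collects the evidence items that
    # fall into each cell (0 = negative label, 1 = positive label).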
    confusion_matrix = [[[], []], [[], []]]
    logging.info("Splitting into %d subsamples", k)
    for subsample in range(k):
        logging.debug("Subsample = %d", subsample)
        train_data = Knowledge()
        test_data = []
        test_labels = []
        for i, (e, s) in enumerate(standard.items()):
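            # Every k-th item (offset by the current fold index) goes to the
            # test split; everything else stays in the training Knowledge.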
            if i % k == subsample:
                test_data.append(e)
                test_labels.append(int(s))
            else:
                train_data[e] = s
        extractor = FactExtractorFactory(config, train_data)
        prediction = extractor.predict(test_data)
        assert len(prediction) == len(test_data)
        total += len(prediction)
        success += sum(1 for (p, e) in zip(prediction, test_labels) if p == e)
        for i, (p, e) in enumerate(zip(prediction, test_labels)):
            confusion_matrix[p][e].append(test_data[i])
    logging.info("%d values evaluated;", total)
    logging.info("%d accurate predictions (%d negative, %d positive)", success,
                 len(confusion_matrix[0][0]), len(confusion_matrix[1][1]))
    logging.info(
        "%d inaccurate predictions (%d actual positive, %d actual negative)",
        total - success, len(confusion_matrix[0][1]),
        len(confusion_matrix[1][0]))
    for e in confusion_matrix[0][1][:3]:
        logging.info("Predicted negative, actually positive: %s", e)
    for e in confusion_matrix[1][0][:3]:
        logging.info("Predicted positive, actually negative: %s", e)

    try:
        precision = len(confusion_matrix[1][1]) / len(confusion_matrix[1][0] +
                                                      confusion_matrix[1][1])
    except ZeroDivisionError:
        precision = None
    try:
        recall = len(confusion_matrix[1][1]) / len(confusion_matrix[0][1] +
                                                   confusion_matrix[1][1])
    except ZeroDivisionError:
        recall = None
    accuracy = success / total
    return accuracy, precision, recall
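
A minimal way to drive main() above, sketched under a few assumptions: the usage string below is illustrative (only the keys <dbname>, <gold_standard> and --k are taken from what main() actually reads), and the sketch ignores the module-level config object that main() also relies on.

"""
Usage:
    cross_validation.py <dbname> <gold_standard> [--k=<folds>]

Options:
    --k=<folds>    Number of cross-validation folds [default: 10]
"""
from docopt import docopt

if __name__ == '__main__':
    # Keys mirror the ones main() reads: '<dbname>', '<gold_standard>', '--k'.
    opts = docopt(__doc__, version=0.1)
    accuracy, precision, recall = main(opts)
    print("accuracy=%r precision=%r recall=%r" % (accuracy, precision, recall))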
Example #2
def load_data(self, config):
    # Reload only when the database, the path, or the configured checksum
    # changed since the last call.
    if self.last_dbname != self.dbname or self.last_path != self.path or \
            self.last_hash != config[u"input_file_md5"]:
        iepy.db.connect(self.dbname)
        data = Knowledge.load_from_csv(self.path)
        self.last_dbname = self.dbname
        self.last_path = self.path
        # Hash the file that was actually read and check it against the
        # checksum declared in the configuration.
        with open(self.path, "rb") as f:
            hasher = hashlib.md5(f.read())
        self.last_hash = hasher.hexdigest()
        if self.last_hash != config[u"input_file_md5"]:
            raise ValueError("Configured input file and actual input "
                             "file have different MD5 checksums")
        self.data = data
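
The checksum guard in isolation, as a standalone sketch (the helper name verify_md5 is hypothetical, not part of iepy):

import hashlib

def verify_md5(path, expected_md5):
    # Hash the file exactly as load_data() does and fail loudly on mismatch.
    with open(path, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()
    if digest != expected_md5:
        raise ValueError("Configured input file and actual input "
                         "file have different MD5 checksums")
    return digest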
Example #3
import logging

from docopt import docopt

from iepy.core import BootstrappedIEPipeline
from iepy import db
from iepy.human_validation import TerminalInterviewer
from iepy.knowledge import Knowledge
from iepy.utils import load_facts_from_csv

if __name__ == u'__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts[u'<dbname>'])
    seed_facts = load_facts_from_csv(opts[u'<seeds_file>'])
    output_file = opts[u'<output_file>']
    gold_standard_file = opts[u'--gold']
    if gold_standard_file:
        gold_standard = Knowledge.load_from_csv(gold_standard_file)
    else:
        gold_standard = None

    p = BootstrappedIEPipeline(connection, seed_facts, gold_standard)

    logging.basicConfig(
        level=logging.DEBUG,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    STOP = u'STOP'

    p.start()  # blocking
    keep_looping = True
    while keep_looping:
        qs = list(p.questions_available())
Example #4
File: eval.py Project: imclab/iepy
"""
IEPY's result evaluator w.r.t. a reference corpus.

Usage:
    eval.py <dbname> <proposed_csv> <reference_csv>
    eval.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
from docopt import docopt

from iepy.db import connect
from iepy.knowledge import Knowledge
from iepy.utils import evaluate

if __name__ == '__main__':
    opts = docopt(__doc__, version=0.1)
    connector = connect(opts['<dbname>'])
    proposed_csv = opts['<proposed_csv>']
    reference_csv = opts['<reference_csv>']

    proposed = Knowledge.load_from_csv(proposed_csv)
    reference = Knowledge.load_from_csv(reference_csv)
    result = evaluate(proposed, reference)

    print("Precision: %.2f" % result['precision'])
    print("Recall: %.2f" % result['recall'])
Example #5
  --version             Version number
  --with-score          Shows colored scores
  --with-line-number    Shows each item numbered sequentially
"""
from docopt import docopt

from colorama import Back, Style

from iepy import db
from iepy.knowledge import Knowledge

if __name__ == '__main__':
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts['<dbname>'])
    csv_file = opts['<csv_file>']
    evidence = Knowledge.load_from_csv(csv_file)

    for nr, (e, score) in enumerate(evidence.items()):
        fact = e.colored_fact()
        fact_line = []
        if opts['--with-line-number']:
            fact_line.append(str(nr + 1))
        if opts['--with-score']:
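            # Pick a background color for the score: 0 and 1 get fixed
            # colors, any other value (e.g. a fractional score) gets cyan.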
            if score == 0:
                score_color = Back.YELLOW
            elif score == 1:
                score_color = Back.MAGENTA
            else:
                score_color = Back.CYAN
            colored_score = u''.join(
                [score_color, str(score), Style.RESET_ALL])