Ejemplo n.º 1
0
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or None when the denominator is 0."""
    if not denominator:
        return None
    return numerator / denominator


def main(options):
    """Run a k-fold cross-validation of the fact extractor on a gold standard.

    The gold standard is split into ``k`` subsamples: the sample at position
    ``i`` belongs to the test set of subsample ``i % k`` and to the training
    set of every other subsample. For each subsample an extractor is built
    from the training data and asked to predict the test evidence; the
    predictions are accumulated in a 2x2 confusion matrix and summarized
    through ``logging``.

    Args:
        options: mapping of command line options. The keys read here are
            ``'<dbname>'``, ``'<gold_standard>'`` and ``'--k'``.

    Returns:
        A tuple ``(accuracy, precision, recall)``. Each value is ``None``
        when its denominator is zero (for accuracy: nothing was evaluated,
        e.g. an empty gold standard).
    """
    logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
    # The object returned by connect() was never used by this function; the
    # call itself is retained.
    db.connect(options['<dbname>'])
    standard = Knowledge.load_from_csv(options['<gold_standard>'])
    logging.info("Loaded %d samples from gold standard", len(standard))
    k = int(options['--k'])

    # Snapshot the samples once so every fold iterates the same sequence
    # instead of rebuilding the items view k times.
    samples = list(standard.items())

    success = total = 0
    # confusion_matrix[predicted][actual] -> list of evidence in that cell.
    confusion_matrix = [[[], []], [[], []]]
    logging.info("Splitting into %d subsamples", k)
    for subsample in range(k):
        logging.debug("Subsample = %d", subsample)
        train_data = Knowledge()
        test_data = []
        test_labels = []
        for i, (e, s) in enumerate(samples):
            if i % k == subsample:
                test_data.append(e)
                test_labels.append(int(s))
            else:
                train_data[e] = s
        extractor = FactExtractorFactory(config, train_data)
        prediction = extractor.predict(test_data)
        # zip() below would silently truncate on a length mismatch.
        assert len(prediction) == len(test_data)
        total += len(prediction)
        for evidence, p, e in zip(test_data, prediction, test_labels):
            if p == e:
                success += 1
            confusion_matrix[p][e].append(evidence)

    true_negatives = len(confusion_matrix[0][0])
    false_negatives = len(confusion_matrix[0][1])
    false_positives = len(confusion_matrix[1][0])
    true_positives = len(confusion_matrix[1][1])

    logging.info("%d values evaluated;", total)
    logging.info("%d accurate predictions (%d negative, %d positive)", success,
                 true_negatives, true_positives)
    logging.info(
        "%d inaccurate predictions (%d actual positive, %d actual negative)",
        total - success, false_negatives, false_positives)
    # Show at most three examples of each kind of mistake.
    for e in confusion_matrix[0][1][:3]:
        logging.info("Predicted negative, actually positive: %s", e)
    for e in confusion_matrix[1][0][:3]:
        logging.info("Predicted positive, actually negative: %s", e)

    if not total:
        logging.warning("No samples were evaluated; accuracy is undefined")

    # All three metrics degrade to None instead of raising ZeroDivisionError.
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    accuracy = _safe_ratio(success, total)
    return accuracy, precision, recall
Ejemplo n.º 2
0
Options:
  -h --help             Show this screen
  --version             Version number
"""
from docopt import docopt
import logging

from iepy.core import BootstrappedIEPipeline
from iepy import db
from iepy.human_validation import TerminalInterviewer
from iepy.knowledge import Knowledge
from iepy.utils import load_facts_from_csv

if __name__ == u'__main__':
    # Parse the command line using the module docstring as the usage text.
    opts = docopt(__doc__, version=0.1)
    connection = db.connect(opts[u'<dbname>'])
    seed_facts = load_facts_from_csv(opts[u'<seeds_file>'])
    output_file = opts[u'<output_file>']
    # The gold standard is optional: it is loaded only when --gold has a
    # truthy value, otherwise the pipeline receives None.
    gold_standard_file = opts[u'--gold']
    if gold_standard_file:
        gold_standard = Knowledge.load_from_csv(gold_standard_file)
    else:
        gold_standard = None

    p = BootstrappedIEPipeline(connection, seed_facts, gold_standard)

    # NOTE(review): logging is configured only after the pipeline has been
    # constructed; if the constructor logs anything, those records will not
    # use this level/format -- confirm whether that is intended.
    logging.basicConfig(
        level=logging.DEBUG,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # NOTE(review): STOP is not used in the visible part of the script;
    # presumably a sentinel consumed further down -- verify.
    STOP = u'STOP'
Ejemplo n.º 3
0
"""
IEPY's result evaluator w.r.t. a reference corpus.

Usage:
    eval.py <dbname> <proposed_csv> <reference_csv>
    eval.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
from docopt import docopt

from iepy.db import connect
from iepy.utils import load_evidence_from_csv, evaluate

if __name__ == '__main__':
    # Command line arguments are described by the module docstring.
    arguments = docopt(__doc__, version=0.1)
    db_connection = connect(arguments['<dbname>'])

    # Load both evidence sets through the same connection, proposed first.
    proposed, reference = (
        load_evidence_from_csv(arguments[csv_option], db_connection)
        for csv_option in ('<proposed_csv>', '<reference_csv>')
    )
    scores = evaluate(proposed, reference)

    # Report the two metrics read from the evaluation result.
    print("Precision: %.2f" % scores['precision'])
    print("Recall: %.2f" % scores['recall'])
Ejemplo n.º 4
0
"""
IEPY's seed generation utility.

Usage:
    generate_seeds.py <dbname> <relation_name> <kind_a> <kind_b> <output_filename>
    generate_seeds.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
from docopt import docopt

from iepy.db import connect
from iepy.human_validation import human_oracle
from iepy.knowledge import Knowledge
from iepy.utils import save_facts_to_csv

if __name__ == u'__main__':
    # Command line arguments are described by the module docstring.
    arguments = docopt(__doc__, version=0.1)
    connect(arguments[u'<dbname>'])

    # Populate a fresh Knowledge through the human oracle for the requested
    # entity kinds and relation.
    knowledge = Knowledge()
    knowledge.extend_from_oracle(
        arguments[u'<kind_a>'],
        arguments[u'<kind_b>'],
        arguments[u'<relation_name>'],
        human_oracle,
    )

    # Keep only the facts whose evidence was labelled 1, de-duplicated and
    # written out in sorted order.
    positive_facts = {
        evidence.fact
        for evidence, label in knowledge.items()
        if label == 1
    }
    save_facts_to_csv(sorted(positive_facts), arguments[u'<output_filename>'])
Ejemplo n.º 5
0
 def setUpClass(cls):
     """Reconnect to the class database and create the manager under test.

     Calls ``disconnect()`` first, then ``connect(cls.mongodb_name)``, and
     stores a new ``cls.ManagerClass()`` instance on ``cls.manager``.
     """
     # NOTE(review): presumably drops any connection left over from earlier
     # tests before connecting -- confirm against disconnect()'s definition.
     disconnect()
     connect(cls.mongodb_name)
     cls.manager = cls.ManagerClass()