Example #1
    def __call__(self, config):
        # Prepare data
        if self.data is None or self.relname != config["relation"]:
            self.relname = config["relation"]
            self.relation = iepy.data.models.Relation.objects.get(
                name=config["relation"])

            candidates = CEM.candidates_for_relation(self.relation)
            self.data = CEM.labels_for(self.relation, candidates,
                                       CEM.conflict_resolution_newest_wins)
            self.evidences = []
            self.labels = []
            for evidence, label in self.data.items():
                if label is not None:
                    self.labels.append(label)
                    self.evidences.append(evidence)

        if not self.data:
            raise NotEnoughLabeledData(
                "There is no labeled data for training!")

        result = {
            "dataset_size": len(self.data),
            "start_time": time.time(),
        }

        # Load rules in the config
        if config["rules"] == "<all>":
            rules = self.rules.values()
        else:
            for rule_name in config["rules"]:
                if rule_name not in self.rules:
                    raise RuleNotFound(rule_name)
            rules = [
                rule for rule_name, rule in self.rules.items()
                if rule_name in config["rules"]
            ]

        # Run the rule based pipeline
        pipeline = RuleBasedCore(self.relation, self.evidences, rules)
        pipeline.start()
        matched = pipeline.known_facts()
        predicted_labels = [e in matched for e in self.evidences]

        # Evaluate prediction
        result.update(
            result_dict_from_predictions(self.evidences, self.labels,
                                         predicted_labels))

        return result
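The runner above reads only two keys from its config argument: "relation" and "rules". A minimal sketch of such a config and of invoking the runner, assuming a hypothetical RuleBasedRunner class that exposes the __call__ shown above (the class name, relation name, and rule name are illustrative):

config = {
    "relation": "was born in",   # must match an existing Relation name
    "rules": "<all>",            # or an explicit list, e.g. ["rule_birth_date"]
}

runner = RuleBasedRunner()       # hypothetical wrapper exposing __call__
result = runner(config)
print(result["dataset_size"], result["start_time"])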
Example #2
def run_from_command_line():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    try:
        relation_name = iepy.instance.rules.RELATION
    except AttributeError:
        logging.error("RELATION not defined in rules file")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules
    rules = load_rules()

    # Load evidences
    evidences = CandidateEvidenceManager.candidates_for_relation(relation)

    # Run the pipeline
    iextractor = RuleBasedCore(relation, rules)
    iextractor.start()
    iextractor.process()
    predictions = iextractor.predict(evidences)
    output.dump_output_loop(predictions)
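This script and the next one read RELATION from iepy.instance.rules and collect the rule functions with load_rules(). A hedged sketch of what such a rules file can look like, using the refo-based pattern style that IEPY's documentation describes; the relation name and the pattern itself are illustrative, not taken from these examples:

from refo import Star, Any
from iepy.extraction.rules import rule, Token

RELATION = "was born"  # relation the rules below apply to

@rule(True)  # evidences matched by this rule are labeled as positive
def born_date_in_parenthesis(Subject, Object):
    """Ex: Shmuel Yosef Agnon (July 17, 1888 - February 17, 1970)"""
    anything = Star(Any())
    return Subject + Token("(") + Object + Token("-") + anything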
Example #3
def run_from_command_line():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    try:
        relation_name = iepy.instance.rules.RELATION
    except AttributeError:
        logging.error("RELATION not defined in rules file")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules
    rules = load_rules()

    # Load evidences
    evidences = CandidateEvidenceManager.candidates_for_relation(relation)

    # Run the pipeline
    iextractor = RuleBasedCore(relation, evidences, rules)
    iextractor.start()
    iextractor.process()
    predictions = iextractor.predict()
    output.dump_output_loop(predictions)
Example #4
def _e(markup, **kwargs):
    base_pos = kwargs.pop('base_pos', ["DT", u"JJ", u"NN"])
    evidence = EvidenceFactory(markup=markup, **kwargs)
    evidence = CandidateEvidenceManager.hydrate(evidence)
    n = len(evidence.segment.tokens)
    pos = (base_pos * n)[:n]
    evidence.segment.postags = pos
    return evidence
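The only non-obvious step in this test helper is the POS assignment: repeating base_pos and slicing to the token count yields exactly one tag per token, cycling through the base pattern, so the test does not depend on a real POS tagger. A quick illustration of the slicing trick:

base_pos = ["DT", "JJ", "NN"]
n = 5  # pretend the hydrated segment has five tokens
print((base_pos * n)[:n])  # -> ['DT', 'JJ', 'NN', 'DT', 'JJ']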
Example #5
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    tuning_mode = _get_tuning_mode(opts)
    relation = _get_relation(opts)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if opts.get('--trained-extractor'):
        iextractor = _load_extractor(opts, relation, labeled_evidences)
        was_ever_trained = True
        opts["--no-questions"] = True
    else:
        iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode)
        iextractor.start()
        was_ever_trained = False

    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Candidates generator was consumed when generating labeled_evidences, so we'll
    # define it fresh again
    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    # Predict and store output
    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
    if not predictions:
        print("Nothing was predicted")
        exit(1)

    if opts.get("--db-store"):
        output.dump_predictions_to_database(relation, predictions)

    output_file = opts.get("<output>")
    if output_file:
        output.dump_runner_output_to_csv(predictions, output_file)

    classifier_output = opts.get("--store-extractor")
    if classifier_output:
        iextractor.save(classifier_output)
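This runner pulls several options out of the docopt result (--trained-extractor, --no-questions, --db-store, --store-extractor and the <output> positional). A hedged sketch of a docopt interface that would produce those keys; the script name, usage wording, and the --tune-for option (inferred from _get_tuning_mode and the later examples) are assumptions rather than quotes from the source:

"""Sketch of a docopt interface for the runner above; option names come from
the code, everything else is illustrative.

Usage:
    iepy_runner.py [options] <relation_name> [<output>]

Options:
  --trained-extractor=<path>  Load an already trained extractor (skips training)
  --no-questions              Skip the interactive labeling loop
  --db-store                  Store predictions in the database
  --store-extractor=<path>    Save the trained extractor for later reuse
  --tune-for=<goal>           Either high-prec or high-recall [default: high-prec]
"""
from docopt import docopt

if __name__ == "__main__":
    print(docopt(__doc__))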
Example #6
def _e(markup, **kwargs):
    base_pos = kwargs.pop('base_pos', ["DT", u"JJ", u"NN"])
    base_lemmas = kwargs.pop('base_lemmas', None)
    evidence = EvidenceFactory(markup=markup, **kwargs)
    evidence = CandidateEvidenceManager.hydrate(evidence)

    if base_lemmas is None:
        base_lemmas = [x.lower() for x in evidence.segment.tokens]
    n = len(evidence.segment.tokens)
    pos = (base_pos * n)[:n]
    evidence.segment.postags = pos
    evidence.segment.lemmas = base_lemmas
    return evidence
Example #7
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)
    relation_name = opts.get("<relation>")
    limit = opts.get("--limit")
    rule_name = opts.get("--rule")
    shuffle = opts.get("--shuffle")
    create_evidences = opts.get("--create-evidences")

    if limit is None:
        limit = -1

    try:
        limit = int(limit)
    except ValueError:
        logging.error("Invalid limit value, it must be a number")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules
    rules = get_rules(rule_name)
    rule_regexes = [(rule.__name__, compile_rule(rule, relation), rule.answer)
                    for rule in rules]

    # Load evidences
    if EvidenceCandidate.objects.all().count() == 0:
        create_evidences = True
    evidences = CandidateEvidenceManager.candidates_for_relation(
        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle)
    conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins
    answers = CandidateEvidenceManager.labels_for(relation, evidences,
                                                  conflict_solver)
    run_tests(rule_regexes, evidences, answers)
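Judging from how its result is consumed here and in Example #1, labels_for appears to return a mapping from each candidate evidence to a label that is True, False, or None when no answer has been recorded yet. A small sketch of reducing such a mapping to just the answered candidates, mirroring the filtering loop in Example #1 (the variable name "answers" is the one used above):

labeled = {evidence: label for evidence, label in answers.items()
           if label is not None}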
Example #8
        print(__doc__)
        exit(1)

    try:
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    extractor_config = opts.get("--extractor-config")
    if extractor_config:
        with open(extractor_config) as filehandler:
            extractor_config = json.load(filehandler)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)
    iextractor = ActiveLearningCore(relation, labeled_evidences, extractor_config,
                                    performance_tradeoff=tuning_mode)
    iextractor.start()

    STOP = u'STOP'
    term = TerminalAdministration(relation,
                                  extra_options=[(STOP, u'Stop execution')])
    was_ever_trained = False
    while iextractor.questions:
        questions = list(iextractor.questions)  # copying the list
        term.update_candidate_evidences_to_label(questions)
        result = term()
        i = 0
        for c, label_value in load_labeled_evidences(relation, questions).items():
Example #9
    def get_candidates(self, relation):
        return CandidateEvidenceManager.candidates_for_relation(relation)
Example #10
    logging.basicConfig(
        level=logging.DEBUG,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    try:
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    extractor_config = opts.get("--extractor-config")
    if extractor_config:
        with open(extractor_config) as filehandler:
            extractor_config = json.load(filehandler)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)
    iextractor = ActiveLearningCore(relation, labeled_evidences,
                                    extractor_config)
    iextractor.start()

    STOP = u'STOP'
    term = TerminalAdministration(relation,
                                  extra_options=[(STOP, u'Stop execution ASAP')
                                                 ])

    while iextractor.questions:
        questions = list(iextractor.questions)  # copying the list
        term.update_candidate_evidences_to_label(questions)
        result = term()
        if result == STOP:
Example #11
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)
    relation = opts['<relation_name>']
    classifier_path = opts.get('--classifier')

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    if opts['--tune-for'] == 'high-prec':
        tuning_mode = HIPREC
    elif opts['--tune-for'] == 'high-recall':
        tuning_mode = HIREC
    else:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)

    try:
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if classifier_path:
        try:
            loaded_classifier = output.load_classifier(classifier_path)
        except ValueError:
            print("Error: unable to load classifier, invalid file")
            exit(1)

        iextractor = ActiveLearningCore(
            relation, labeled_evidences, performance_tradeoff=tuning_mode,
            classifier=loaded_classifier
        )
        was_ever_trained = True
    else:
        extractor_config = opts.get("--extractor-config")
        if extractor_config:
            with open(extractor_config) as filehandler:
                extractor_config = json.load(filehandler)

        iextractor = ActiveLearningCore(
            relation, labeled_evidences, extractor_config,
            performance_tradeoff=tuning_mode
        )
        iextractor.start()
        was_ever_trained = False


    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Predict and store output
    predictions = iextractor.predict()
    if predictions:
        output.dump_output_loop(predictions)
        output.dump_classifier_loop(iextractor)
Example #12
    def refresh_info(self):
        c = CandidateEvidenceManager.value_labeled_candidates_count_for_relation(
            self.relation)
        print('There are %s labels with yes/no answers' % c)
Example #13
    def __call__(self, config):
        if u"class_weight" in config[u"classifier_args"]:
            d = config[u"classifier_args"][u"class_weight"]
            assert "true" in d and "false" in d and len(d) == 2
            config[u"classifier_args"][u"class_weight"] = {
                True: d["true"],
                False: d["false"]
            }

        # Prepare data
        if self.data is None or self.relname != config["relation"]:
            relation = iepy.data.models.Relation.objects.get(
                name=config["relation"])
            c_evidences = CEM.candidates_for_relation(relation)
            self.data = CEM.labels_for(relation, c_evidences,
                                       CEM.conflict_resolution_newest_wins)
            self.data = [(x, label) for x, label in self.data.items()
                         if label is not None]
            self.relname = config["relation"]
        data = self.data
        testset = {x: label for x, label in data}
        candidate_evidences = {x: None for x, _ in data}
        if not data:
            raise NotEnoughData("There is no labeled data for training")
        oracle_answers = config["oracle_answers"]
        N = len(data)
        M = N - oracle_answers  # test set size
        if M / N < 0.1:  # if there is less than 10% left for testing
            raise NotEnoughData("There is not enough data for evaluation")

        result = {
            "train_size": oracle_answers,
            "test_size": M,
            "dataset_size": N,
            "start_time": time.time(),
        }

        # Interact with oracle
        alcore = ActiveLearningCore(config["relation"],
                                    candidate_evidences,
                                    extractor_config=config,
                                    performance_tradeoff=config["tradeoff"])
        alcore.start()
        # ^ Is against greenhouse emissions
        for _ in range(oracle_answers):
            q = alcore.questions[0]
            alcore.add_answer(q, testset[q])
            del testset[q]  # Once given for training cannot be part of testset
            alcore.process()

        extractor = alcore.relation_classifier

        # Evaluate prediction
        predicted_dict = alcore.predict()
        test_evidences = list(testset)
        test_labels = [testset[x] for x in test_evidences]
        predicted_labels = [predicted_dict[x] for x in test_evidences]
        result.update(
            result_dict_from_predictions(test_evidences, test_labels,
                                         predicted_labels))

        # Evaluate ranking
        predicted_scores = extractor.decision_function(test_evidences)
        auroc = roc_auc_score(test_labels, predicted_scores)
        avgprec = average_precision_score(test_labels, predicted_scores)

        result.update({
            "auROC": auroc,
            "average_precision": avgprec,
        })
        return result
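For reference, the keys this experiment actually reads from config are "relation", "oracle_answers", "tradeoff", and "classifier_args" (whose "class_weight" entry uses string keys that the top of __call__ converts to booleans). A minimal sketch with illustrative values; the same dict is also passed through as extractor_config, so a real configuration would typically carry additional extractor settings not shown here:

config = {
    "relation": "was born in",     # existing Relation name (illustrative)
    "oracle_answers": 50,          # how many simulated oracle interactions to run
    "tradeoff": None,              # forwarded as performance_tradeoff
    "classifier_args": {
        "class_weight": {"true": 1, "false": 1},  # becomes {True: 1, False: 1}
    },
}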
Example #14
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)
    relation = opts['<relation_name>']
    classifier_path = opts.get('--classifier')

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    if opts['--tune-for'] == 'high-prec':
        tuning_mode = HIPREC
    elif opts['--tune-for'] == 'high-recall':
        tuning_mode = HIREC
    else:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)

    try:
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if classifier_path:
        try:
            loaded_classifier = output.load_classifier(classifier_path)
        except ValueError:
            print("Error: unable to load classifier, invalid file")
            exit(1)

        iextractor = ActiveLearningCore(relation,
                                        labeled_evidences,
                                        performance_tradeoff=tuning_mode,
                                        classifier=loaded_classifier)
        was_ever_trained = True
    else:
        config_filepath = opts.get("--extractor-config")
        if not config_filepath:
            config_filepath = os.path.join(INSTANCE_PATH,
                                           "extractor_config.json")

        if not os.path.exists(config_filepath):
            print("Error: extractor config does not exist; please create the "
                  "file extractor_config.json or use the --extractor-config option")
            exit(1)

        with open(config_filepath) as filehandler:
            try:
                extractor_config = json.load(filehandler)
            except Exception as error:
                print(
                    "Error: unable to load extractor config: {}".format(error))
                exit(1)

        iextractor = ActiveLearningCore(relation,
                                        labeled_evidences,
                                        extractor_config,
                                        performance_tradeoff=tuning_mode)
        iextractor.start()
        was_ever_trained = False

    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Predict and store output
    predictions = iextractor.predict()
    if predictions:
        output.dump_output_loop(predictions)
        output.dump_classifier_loop(iextractor)