Example #1
    def __call__(self, config):
        # Prepare data
        if self.data is None or self.relname != config["relation"]:
            self.relname = config["relation"]
            self.relation = iepy.data.models.Relation.objects.get(
                name=config["relation"])

            candidates = CEM.candidates_for_relation(self.relation)
            self.data = CEM.labels_for(self.relation, candidates,
                                       CEM.conflict_resolution_newest_wins)
            self.evidences = []
            self.labels = []
            for evidence, label in self.data.items():
                if label is not None:
                    self.labels.append(label)
                    self.evidences.append(evidence)

        if not self.data:
            raise NotEnoughLabeledData(
                "There is no labeled data for training!")

        result = {
            "dataset_size": len(self.data),
            "start_time": time.time(),
        }

        # Load rules in the config
        if config["rules"] == "<all>":
            rules = self.rules.values()
        else:
            for rule_name in config["rules"]:
                if rule_name not in self.rules:
                    raise RuleNotFound(rule_name)
            rules = [
                rule for rule_name, rule in self.rules.items()
                if rule_name in config["rules"]
            ]

        # Run the rule based pipeline
        pipeline = RuleBasedCore(self.relation, self.evidences, rules)
        pipeline.start()
        matched = pipeline.known_facts()
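        # An evidence is predicted positive iff the rule pipeline matched it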
        predicted_labels = [e in matched for e in self.evidences]

        # Evaluate prediction
        result.update(
            result_dict_from_predictions(self.evidences, self.labels,
                                         predicted_labels))

        return result
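
For context, a minimal config accepted by this runner might look like the
sketch below. Only the "relation" and "rules" keys are read directly by
__call__ above; the concrete values (and the `runner` name) are hypothetical.

# Hypothetical usage of the rules-evaluation runner defined above
config = {
    "relation": "was born in",   # name of an existing iepy Relation
    "rules": "<all>",            # or a list of rule names to run
}
result = runner(config)          # `runner` is an instance of the class above
print(result["dataset_size"])    # plus the metrics added by
                                 # result_dict_from_predictions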
Example #2
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)
    relation_name = opts.get("<relation>")
    limit = opts.get("--limit")
    rule_name = opts.get("--rule")
    shuffle = opts.get("--shuffle")
    create_evidences = opts.get("--create-evidences")

    if limit is None:
        limit = -1

    try:
        limit = int(limit)
    except ValueError:
        logging.error("Invalid limit value, it must be a number")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules
    rules = get_rules(rule_name)
    rule_regexes = [(rule.__name__, compile_rule(rule, relation), rule.answer)
                    for rule in rules]

    # Load evidences
    if EvidenceCandidate.objects.all().count() == 0:
        create_evidences = True
    evidences = CandidateEvidenceManager.candidates_for_relation(
        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle)
    conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins
    answers = CandidateEvidenceManager.labels_for(relation, evidences,
                                                  conflict_solver)
    run_tests(rule_regexes, evidences, answers)
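
The opts.get calls above imply a docopt usage string roughly like the
following sketch. The script name, option descriptions, and layout are
assumptions; only the option names are taken from the code.

"""
Hypothetical __doc__ for the rule runner above.

Usage:
    rules_runner.py <relation> [--limit=<n>] [--rule=<name>] [--shuffle] [--create-evidences]
    rules_runner.py --version

Options:
    --limit=<n>          Maximum number of segments to use
    --rule=<name>        Run only the named rule
    --shuffle            Shuffle segments before applying the limit
    --create-evidences   Force creation of evidence candidates
"""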
Example #3
    def __call__(self, config):
        if u"class_weight" in config[u"classifier_args"]:
            d = config[u"classifier_args"][u"class_weight"]
            assert "true" in d and "false" in d and len(d) == 2
            config[u"classifier_args"][u"class_weight"] = {
                True: d["true"],
                False: d["false"]
            }

        # Prepare data
        if self.data is None or self.relname != config["relation"]:
            relation = iepy.data.models.Relation.objects.get(
                name=config["relation"])
            c_evidences = CEM.candidates_for_relation(relation)
            self.data = CEM.labels_for(relation, c_evidences,
                                       CEM.conflict_resolution_newest_wins)
            self.data = [(x, label) for x, label in self.data.items()
                         if label is not None]
            self.relname = config["relation"]
        data = self.data
        testset = {x: label for x, label in data}
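        # The core receives the evidences unlabeled; true labels stay in testset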
        candidate_evidences = {x: None for x, _ in data}
        if not data:
            raise NotEnoughData("There is no labeled data for training")
        oracle_answers = config["oracle_answers"]
        N = len(data)
        M = N - oracle_answers  # test set size
        if M / N < 0.1:  # if less than 10% is left for testing
            raise NotEnoughData("There is not enough data for evaluation")

        result = {
            "train_size": oracle_answers,
            "test_size": M,
            "dataset_size": N,
            "start_time": time.time(),
        }

        # Interact with oracle
        alcore = ActiveLearningCore(config["relation"],
                                    candidate_evidences,
                                    extractor_config=config,
                                    performance_tradeoff=config["tradeoff"])
        alcore.start()
        for _ in range(oracle_answers):
            q = alcore.questions[0]
            alcore.add_answer(q, testset[q])
            del testset[q]  # Once given for training cannot be part of testset
            alcore.process()

        extractor = alcore.relation_classifier

        # Evaluate prediction
        predicted_dict = alcore.predict()
        test_evidences = list(testset)
        test_labels = [testset[x] for x in test_evidences]
        predicted_labels = [predicted_dict[x] for x in test_evidences]
        result.update(
            result_dict_from_predictions(test_evidences, test_labels,
                                         predicted_labels))

        # Evaluate ranking
        predicted_scores = extractor.decision_function(test_evidences)
        auroc = roc_auc_score(test_labels, predicted_scores)
        avgprec = average_precision_score(test_labels, predicted_scores)

        result.update({
            "auROC": auroc,
            "average_precision": avgprec,
        })
        return result
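
Collecting the keys this runner reads into one place, its config might look
like the sketch below. The values are hypothetical, and since the whole dict
is also forwarded as extractor_config, the real config would carry additional
extractor settings not shown here.

# Hypothetical config for the active-learning runner above
config = {
    "relation": "was born in",       # name of an existing iepy Relation
    "oracle_answers": 50,            # labels the simulated oracle provides
    "tradeoff": None,                # forwarded as performance_tradeoff
    "classifier_args": {
        # "true"/"false" keys are remapped to True/False by __call__
        "class_weight": {"true": 10, "false": 1},
    },
}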