def __call__(self, config):
    # Prepare data
    if self.data is None or self.relname != config["relation"]:
        self.relname = config["relation"]
        self.relation = iepy.data.models.Relation.objects.get(
            name=config["relation"])
        candidates = CEM.candidates_for_relation(self.relation)
        self.data = CEM.labels_for(self.relation, candidates,
                                   CEM.conflict_resolution_newest_wins)
        self.evidences = []
        self.labels = []
        for evidence, label in self.data.items():
            if label is not None:
                self.labels.append(label)
                self.evidences.append(evidence)

    if not self.data:
        raise NotEnoughLabeledData("There is no labeled data for training!")

    result = {
        "dataset_size": len(self.data),
        "start_time": time.time(),
    }

    # Load the rules named in the config
    if config["rules"] == "<all>":
        rules = self.rules.values()
    else:
        for rule_name in config["rules"]:
            if rule_name not in self.rules.keys():
                raise RuleNotFound(rule_name)
        rules = [rule for rule_name, rule in self.rules.items()
                 if rule_name in config["rules"]]

    # Run the rule based pipeline
    pipeline = RuleBasedCore(self.relation, self.evidences, rules)
    pipeline.start()
    matched = pipeline.known_facts()
    predicted_labels = [e in matched for e in self.evidences]

    # Evaluate prediction
    result.update(result_dict_from_predictions(
        self.evidences, self.labels, predicted_labels))

    return result
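# A minimal sketch (not taken from the project) of the config dict that the
# __call__ above reads. Only the "relation" and "rules" keys are required by
# the code shown; the relation name used here is a hypothetical placeholder.
example_rules_config = {
    "relation": "was born in",   # hypothetical relation name
    "rules": "<all>",            # or an explicit list of rule names, e.g. ["some_rule"]
}
# result = evaluator(example_rules_config)  # `evaluator` being an instance of
#                                           # the class that defines __call__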
def run_from_command_line():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    try:
        relation_name = iepy.instance.rules.RELATION
    except AttributeError:
        logging.error("RELATION not defined in rules file")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules
    rules = load_rules()

    # Load evidences
    evidences = CandidateEvidenceManager.candidates_for_relation(relation)

    # Run the pipeline
    iextractor = RuleBasedCore(relation, rules)
    iextractor.start()
    iextractor.process()
    predictions = iextractor.predict(evidences)
    output.dump_output_loop(predictions)
def run_from_command_line():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    try:
        relation_name = iepy.instance.rules.RELATION
    except AttributeError:
        logging.error("RELATION not defined in rules file")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules
    rules = load_rules()

    # Load evidences
    evidences = CandidateEvidenceManager.candidates_for_relation(relation)

    # Run the pipeline
    iextractor = RuleBasedCore(relation, evidences, rules)
    iextractor.start()
    iextractor.process()
    predictions = iextractor.predict()
    output.dump_output_loop(predictions)
def _e(markup, **kwargs):
    base_pos = kwargs.pop('base_pos', ["DT", u"JJ", u"NN"])
    evidence = EvidenceFactory(markup=markup, **kwargs)
    evidence = CandidateEvidenceManager.hydrate(evidence)
    n = len(evidence.segment.tokens)
    pos = (base_pos * n)[:n]
    evidence.segment.postags = pos
    return evidence
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    tuning_mode = _get_tuning_mode(opts)
    relation = _get_relation(opts)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if opts.get('--trained-extractor'):
        iextractor = _load_extractor(opts, relation, labeled_evidences)
        was_ever_trained = True
        opts["--no-questions"] = True
    else:
        iextractor = _construct_extractor(opts, relation, labeled_evidences,
                                          tuning_mode)
        iextractor.start()
        was_ever_trained = False

    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # The candidates generator was consumed when generating labeled_evidences,
    # so we'll define it fresh again
    candidates = CandidateEvidenceManager.candidates_for_relation(relation)

    # Predict and store output
    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
    if not predictions:
        print("Nothing was predicted")
        exit(1)

    if opts.get("--db-store"):
        output.dump_predictions_to_database(relation, predictions)

    output_file = opts.get("<output>")
    if output_file:
        output.dump_runner_output_to_csv(predictions, output_file)

    classifier_output = opts.get("--store-extractor")
    if classifier_output:
        iextractor.save(classifier_output)
def _e(markup, **kwargs):
    base_pos = kwargs.pop('base_pos', ["DT", u"JJ", u"NN"])
    base_lemmas = kwargs.pop('base_lemmas', None)
    evidence = EvidenceFactory(markup=markup, **kwargs)
    evidence = CandidateEvidenceManager.hydrate(evidence)
    if base_lemmas is None:
        base_lemmas = [x.lower() for x in evidence.segment.tokens]
    n = len(evidence.segment.tokens)
    pos = (base_pos * n)[:n]
    evidence.segment.postags = pos
    evidence.segment.lemmas = base_lemmas
    return evidence
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)
    relation_name = opts.get("<relation>")
    limit = opts.get("--limit")
    rule_name = opts.get("--rule")
    shuffle = opts.get("--shuffle")
    create_evidences = opts.get("--create-evidences")

    if limit is None:
        limit = -1

    try:
        limit = int(limit)
    except ValueError:
        logging.error("Invalid limit value, it must be a number")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules
    rules = get_rules(rule_name)
    rule_regexes = [(rule.__name__, compile_rule(rule, relation), rule.answer)
                    for rule in rules]

    # Load evidences
    if EvidenceCandidate.objects.all().count() == 0:
        create_evidences = True
    evidences = CandidateEvidenceManager.candidates_for_relation(
        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle)

    conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins
    answers = CandidateEvidenceManager.labels_for(relation, evidences,
                                                  conflict_solver)

    run_tests(rule_regexes, evidences, answers)
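# A hedged sketch of the kind of rules module the runners above consume,
# inferred only from what they read: a module-level RELATION plus decorated
# rule functions exposing __name__ and an .answer flag. The refo pattern and
# relation name below are illustrative assumptions, not taken from the project.
from refo import Star, Any
from iepy.extraction.rules import rule, Token

RELATION = "was born"  # hypothetical relation name

@rule(True)  # the decorator records the expected answer on the rule function
def born_date_in_parenthesis(Subject, Object):
    """Ex: 'Gary Sykes (born 13 February 1984) is a British boxer.'"""
    anything = Star(Any())
    born = Token("(") + Token("born")
    return anything + Subject + born + Object + Token(")") + anything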
    print(__doc__)
    exit(1)

try:
    relation = Relation.objects.get(name=relation)
except Relation.DoesNotExist:
    print("Relation {!r} non existent".format(relation))
    print_all_relations()
    exit(1)

extractor_config = opts.get("--extractor-config")
if extractor_config:
    with open(extractor_config) as filehandler:
        extractor_config = json.load(filehandler)

candidates = CandidateEvidenceManager.candidates_for_relation(relation)
labeled_evidences = load_labeled_evidences(relation, candidates)
iextractor = ActiveLearningCore(relation, labeled_evidences, extractor_config,
                                performance_tradeoff=tuning_mode)
iextractor.start()

STOP = u'STOP'
term = TerminalAdministration(relation,
                              extra_options=[(STOP, u'Stop execution')])

was_ever_trained = False
while iextractor.questions:
    questions = list(iextractor.questions)  # copying the list
    term.update_candidate_evidences_to_label(questions)
    result = term()
    i = 0
    for c, label_value in load_labeled_evidences(relation, questions).items():
def get_candidates(self, relation):
    return CandidateEvidenceManager.candidates_for_relation(relation)
        level=logging.DEBUG,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

try:
    relation = Relation.objects.get(name=relation)
except Relation.DoesNotExist:
    print("Relation {!r} non existent".format(relation))
    print_all_relations()
    exit(1)

extractor_config = opts.get("--extractor-config")
if extractor_config:
    with open(extractor_config) as filehandler:
        extractor_config = json.load(filehandler)

candidates = CandidateEvidenceManager.candidates_for_relation(relation)
labeled_evidences = load_labeled_evidences(relation, candidates)
iextractor = ActiveLearningCore(relation, labeled_evidences, extractor_config)
iextractor.start()

STOP = u'STOP'
term = TerminalAdministration(
    relation, extra_options=[(STOP, u'Stop execution ASAP')])

while iextractor.questions:
    questions = list(iextractor.questions)  # copying the list
    term.update_candidate_evidences_to_label(questions)
    result = term()
    if result == STOP:
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)
    relation = opts['<relation_name>']
    classifier_path = opts.get('--classifier')

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    if opts['--tune-for'] == 'high-prec':
        tuning_mode = HIPREC
    elif opts['--tune-for'] == 'high-recall':
        tuning_mode = HIREC
    else:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)

    try:
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if classifier_path:
        try:
            loaded_classifier = output.load_classifier(classifier_path)
        except ValueError:
            print("Error: unable to load classifier, invalid file")
            exit(1)
        iextractor = ActiveLearningCore(
            relation, labeled_evidences,
            performance_tradeoff=tuning_mode,
            classifier=loaded_classifier,
        )
        was_ever_trained = True
    else:
        extractor_config = opts.get("--extractor-config")
        if extractor_config:
            with open(extractor_config) as filehandler:
                extractor_config = json.load(filehandler)
        iextractor = ActiveLearningCore(
            relation, labeled_evidences, extractor_config,
            performance_tradeoff=tuning_mode,
        )
        iextractor.start()
        was_ever_trained = False

    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Predict and store output
    predictions = iextractor.predict()
    if predictions:
        output.dump_output_loop(predictions)
        output.dump_classifier_loop(iextractor)
def refresh_info(self):
    c = CandidateEvidenceManager.value_labeled_candidates_count_for_relation(
        self.relation)
    print('There are %s labels with yes/no answers' % c)
def __call__(self, config):
    if u"class_weight" in config[u"classifier_args"]:
        d = config[u"classifier_args"][u"class_weight"]
        assert "true" in d and "false" in d and len(d) == 2
        config[u"classifier_args"][u"class_weight"] = {
            True: d["true"],
            False: d["false"],
        }

    # Prepare data
    if self.data is None or self.relname != config["relation"]:
        relation = iepy.data.models.Relation.objects.get(
            name=config["relation"])
        c_evidences = CEM.candidates_for_relation(relation)
        self.data = CEM.labels_for(relation, c_evidences,
                                   CEM.conflict_resolution_newest_wins)
        self.data = [(x, label) for x, label in self.data.items()
                     if label is not None]
        self.relname = config["relation"]
    data = self.data
    testset = {x: label for x, label in data}
    candidate_evidences = {x: None for x, _ in data}
    if not data:
        raise NotEnoughData("There is no labeled data for training")

    oracle_answers = config["oracle_answers"]
    N = len(data)
    M = N - oracle_answers  # test set size
    if M / N < 0.1:  # if less than 10% is left for testing
        raise NotEnoughData("There is not enough data for evaluation")

    result = {
        "train_size": oracle_answers,
        "test_size": M,
        "dataset_size": N,
        "start_time": time.time(),
    }

    # Interact with oracle
    alcore = ActiveLearningCore(config["relation"], candidate_evidences,
                                extractor_config=config,
                                performance_tradeoff=config["tradeoff"])
    alcore.start()
    for _ in range(oracle_answers):
        q = alcore.questions[0]
        alcore.add_answer(q, testset[q])
        del testset[q]  # once given for training it cannot be part of the test set
        alcore.process()

    test_evidences, test_labels = zip(*list(testset.items()))
    extractor = alcore.relation_classifier

    # Evaluate prediction
    predicted_dict = alcore.predict()
    test_evidences = list(testset)
    test_labels = [testset[x] for x in test_evidences]
    predicted_labels = [predicted_dict[x] for x in test_evidences]
    result.update(result_dict_from_predictions(
        test_evidences, test_labels, predicted_labels))

    # Evaluate ranking
    predicted_scores = extractor.decision_function(test_evidences)
    auroc = roc_auc_score(test_labels, predicted_scores)
    avgprec = average_precision_score(test_labels, predicted_scores)
    result.update({
        "auROC": auroc,
        "average_precision": avgprec,
    })

    return result
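# A minimal sketch of an experiment config for the __call__ above, restricted
# to the keys that method actually reads: "relation", "oracle_answers",
# "tradeoff", and "classifier_args" (whose optional "class_weight" uses
# "true"/"false" keys that get remapped to booleans). The values here are
# illustrative assumptions, not project defaults.
example_experiment_config = {
    "relation": "was born in",      # hypothetical relation name
    "oracle_answers": 100,          # labels simulated through the oracle loop
    "tradeoff": None,               # forwarded as performance_tradeoff
    "classifier_args": {
        "class_weight": {"true": 1, "false": 1},
    },
}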
def run_from_command_line():
    opts = docopt(__doc__, version=iepy.__version__)
    relation = opts['<relation_name>']
    classifier_path = opts.get('--classifier')

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    if opts['--tune-for'] == 'high-prec':
        tuning_mode = HIPREC
    elif opts['--tune-for'] == 'high-recall':
        tuning_mode = HIREC
    else:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)

    try:
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if classifier_path:
        try:
            loaded_classifier = output.load_classifier(classifier_path)
        except ValueError:
            print("Error: unable to load classifier, invalid file")
            exit(1)
        iextractor = ActiveLearningCore(relation, labeled_evidences,
                                        performance_tradeoff=tuning_mode,
                                        classifier=loaded_classifier)
        was_ever_trained = True
    else:
        config_filepath = opts.get("--extractor-config")
        if not config_filepath:
            config_filepath = os.path.join(INSTANCE_PATH,
                                           "extractor_config.json")
        if not os.path.exists(config_filepath):
            print("Error: extractor config does not exist, please create the "
                  "file extractor_config.json or use the --extractor-config option")
            exit(1)
        with open(config_filepath) as filehandler:
            try:
                extractor_config = json.load(filehandler)
            except Exception as error:
                print("Error: unable to load extractor config: {}".format(error))
                exit(1)
        iextractor = ActiveLearningCore(relation, labeled_evidences,
                                        extractor_config,
                                        performance_tradeoff=tuning_mode)
        iextractor.start()
        was_ever_trained = False

    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Predict and store output
    predictions = iextractor.predict()
    if predictions:
        output.dump_output_loop(predictions)
        output.dump_classifier_loop(iextractor)