def learn_new_triples(predicate):
    """Learn new triples for *predicate* and store them in a file.

    Candidate entities come from ``CandidatesSelector.get_candidates`` and
    are capped at ``candidates_limit``. They are processed in batches of
    1000: the sentence classifier extracts sentences for a batch and a
    ``ValueExtractor`` turns them into an entity -> value mapping. Every
    entity with a truthy value is written as one line to the file
    ``results_path + 'triples-<predicate>'``.

    Predicates listed in ``numeric_predicates`` are written as literals
    typed ``XMLSchema#int``; all others as literals tagged ``@pl``.

    Nothing is returned; the output file is the only result.
    """
    sc = get_sentence_classifier(predicate)
    if predicate in numeric_predicates:
        pattern = '<%s> <%s> "%s"^^<http://www.w3.org/2001/XMLSchema#int> .'
    else:
        pattern = '<%s> <%s> "%s"@pl .'

    entities = CandidatesSelector.get_candidates(predicate)
    entities = entities[:candidates_limit]
    if verbose:
        print('%s candidates identified' % len(entities))

    batch_size = 1000
    # The context manager guarantees the file is flushed and closed, even
    # when extraction raises part-way through.
    with open(results_path + 'triples-%s' % predicate, 'w') as out:
        ve = ValueExtractor(predicate, sc.extractor_training_data)
        # Stepping through the start offsets covers the trailing partial
        # batch too. Computing the batch count as ceil(len / n) drops it
        # whenever "/" is integer division, because the quotient is floored
        # before ceil() is applied.
        for start in range(0, len(entities), batch_size):
            batch = entities[start:start + batch_size]
            extracted_sentences = sc.extract_sentences(batch)
            values = ve.extract_values(extracted_sentences)
            for e, v in values.items():
                if v:
                    out.write(pattern % (
                        full_resource_name(e).encode('utf-8'),
                        full_predicate_name(predicate).encode('utf-8'),
                        v,
                    ) + '\n')
def learn_new_triples(predicate):
    """Learn new triples for *predicate* and store them in a file.

    Candidate entities come from ``CandidatesSelector.get_candidates`` and
    are capped at ``candidates_limit``. They are processed in batches of
    1000: the sentence classifier extracts sentences for a batch and a
    ``ValueExtractor`` turns them into an entity -> value mapping. Every
    entity with a truthy value is written as one line to the file
    ``results_path + 'triples-<predicate>'``.

    Predicates listed in ``numeric_predicates`` are written as literals
    typed ``XMLSchema#int``; all others as literals tagged ``@pl``.

    Nothing is returned; the output file is the only result.
    """
    sc = get_sentence_classifier(predicate)
    if predicate in numeric_predicates:
        pattern = '<%s> <%s> "%s"^^<http://www.w3.org/2001/XMLSchema#int> .'
    else:
        pattern = '<%s> <%s> "%s"@pl .'

    entities = CandidatesSelector.get_candidates(predicate)
    entities = entities[:candidates_limit]
    if verbose:
        print('%s candidates identified' % len(entities))

    batch_size = 1000
    # The context manager guarantees the file is flushed and closed, even
    # when extraction raises part-way through.
    with open(results_path + 'triples-%s' % predicate, 'w') as out:
        ve = ValueExtractor(predicate, sc.extractor_training_data)
        # Stepping through the start offsets covers the trailing partial
        # batch too. Computing the batch count as ceil(len / n) drops it
        # whenever "/" is integer division, because the quotient is floored
        # before ceil() is applied.
        for start in range(0, len(entities), batch_size):
            batch = entities[start:start + batch_size]
            extracted_sentences = sc.extract_sentences(batch)
            values = ve.extract_values(extracted_sentences)
            for e, v in values.items():
                if v:
                    out.write(pattern % (
                        full_resource_name(e).encode('utf-8'),
                        full_predicate_name(predicate).encode('utf-8'),
                        v,
                    ) + '\n')
def __init__(self, predicate, training_data):
    """Initialise the instance for *predicate*, training if no model is cached.

    Stores the predicate, the short names of its predominant types (the
    part after the last ``/`` of each value returned by
    ``CandidatesSelector.get_predominant_types``), and the file names used
    for the model and feature files. If the model file at
    ``models_cache_path % self.model_filename`` cannot be opened,
    ``self.train(training_data)`` is called.

    NOTE(review): the meaning of the ``False`` argument passed to
    ``get_predominant_types`` is not visible here - confirm against its
    definition.
    """
    self.predicate = predicate
    self.predominant_types = [
        t.split('/')[-1]
        for t in CandidatesSelector.get_predominant_types(predicate, False)
    ]
    self.model_filename = 'crfmodel-%s' % predicate
    self.features_train_filename = 'features_train'
    self.features_tag_filename = 'features_tag'
    try:
        # Opening is only a probe for a readable cached model; the "with"
        # closes the handle immediately instead of leaking it.
        with open(models_cache_path % self.model_filename):
            pass
    except IOError:
        self.train(training_data)