def learn_new_triples(predicate):
    """Extract new triples for ``predicate`` and write them to a results file.

    Candidates returned by ``CandidatesSelector.get_candidates`` are truncated
    to ``candidates_limit`` and processed in batches of 1000. For each batch
    the predicate's sentence classifier extracts sentences, a
    ``ValueExtractor`` pulls values out of them, and every entity with a
    truthy value is written as one line to
    ``results_path + 'triples-<predicate>'``.

    Predicates listed in ``numeric_predicates`` are written as
    ``XMLSchema#int`` typed literals; all others as ``@pl`` tagged literals.
    """
    # NOTE(review): this file defines learn_new_triples a second time further
    # down; that later definition replaces this one when the module is loaded.
    sc = get_sentence_classifier(predicate)
    if predicate in numeric_predicates:
        pattern = '<%s> <%s> "%s"^^<http://www.w3.org/2001/XMLSchema#int> .'
    else:
        pattern = '<%s> <%s> "%s"@pl .'
    entities = CandidatesSelector.get_candidates(predicate)
    entities = entities[:candidates_limit]
    if verbose:
        print('%s candidates identified' % len(entities))
    batch_size = 1000
    # Step through the list by start offset instead of computing a batch
    # count with ceil(len / n): under Python 2 integer division the quotient
    # is truncated before ceil() sees it, which drops the final partial
    # batch (and every candidate when there are fewer than batch_size).
    batches = [
        entities[start:start + batch_size]
        for start in range(0, len(entities), batch_size)
    ]
    # The context manager guarantees the results file is flushed and closed,
    # even if extraction raises part-way through.
    with open(results_path + 'triples-%s' % predicate, 'w') as out:
        ve = ValueExtractor(predicate, sc.extractor_training_data)
        for batch in batches:
            extracted_sentences = sc.extract_sentences(batch)
            values = ve.extract_values(extracted_sentences)
            for e, v in values.iteritems():
                if v:
                    line = pattern % (
                        full_resource_name(e).encode('utf-8'),
                        full_predicate_name(predicate).encode('utf-8'),
                        v,
                    )
                    out.write(line + '\n')
def learn_new_triples(predicate):
    """Extract new triples for ``predicate`` and write them to a results file.

    Candidates returned by ``CandidatesSelector.get_candidates`` are truncated
    to ``candidates_limit`` and processed in batches of 1000. For each batch
    the predicate's sentence classifier extracts sentences, a
    ``ValueExtractor`` pulls values out of them, and every entity with a
    truthy value is written as one line to
    ``results_path + 'triples-<predicate>'``.

    Predicates listed in ``numeric_predicates`` are written as
    ``XMLSchema#int`` typed literals; all others as ``@pl`` tagged literals.
    """
    # NOTE(review): an earlier definition of learn_new_triples in this file
    # is replaced by this one when the module is loaded.
    sc = get_sentence_classifier(predicate)
    if predicate in numeric_predicates:
        pattern = '<%s> <%s> "%s"^^<http://www.w3.org/2001/XMLSchema#int> .'
    else:
        pattern = '<%s> <%s> "%s"@pl .'
    entities = CandidatesSelector.get_candidates(predicate)
    entities = entities[:candidates_limit]
    if verbose:
        print('%s candidates identified' % len(entities))
    batch_size = 1000
    # Step through the list by start offset instead of computing a batch
    # count with ceil(len / n): under Python 2 integer division the quotient
    # is truncated before ceil() sees it, which drops the final partial
    # batch (and every candidate when there are fewer than batch_size).
    batches = [
        entities[start:start + batch_size]
        for start in range(0, len(entities), batch_size)
    ]
    # The context manager guarantees the results file is flushed and closed,
    # even if extraction raises part-way through.
    with open(results_path + 'triples-%s' % predicate, 'w') as out:
        ve = ValueExtractor(predicate, sc.extractor_training_data)
        for batch in batches:
            extracted_sentences = sc.extract_sentences(batch)
            values = ve.extract_values(extracted_sentences)
            for e, v in values.iteritems():
                if v:
                    line = pattern % (
                        full_resource_name(e).encode('utf-8'),
                        full_predicate_name(predicate).encode('utf-8'),
                        v,
                    )
                    out.write(line + '\n')
 def __init__(self, predicate, training_data):
     """Set up the extractor for ``predicate``, training a model if needed.

     Stores the predicate, the short names of its predominant types (the
     last ``/``-separated segment of each value returned by
     ``CandidatesSelector.get_predominant_types(predicate, False)``) and the
     file names used for the model and feature files.

     If the file at ``models_cache_path % self.model_filename`` cannot be
     opened, ``self.train(training_data)`` is called; otherwise
     ``training_data`` is not used here.
     """
     self.predicate = predicate
     self.predominant_types = [
         t.split('/')[-1]
         for t in CandidatesSelector.get_predominant_types(predicate, False)
     ]
     self.model_filename = 'crfmodel-%s' % predicate
     self.features_train_filename = 'features_train'
     self.features_tag_filename = 'features_tag'
     try:
         # Only probing whether the cached model file is readable; the
         # context manager closes the handle immediately instead of
         # leaving it open until garbage collection.
         with open(models_cache_path % self.model_filename):
             pass
     except IOError:
         self.train(training_data)