Ejemplo n.º 1
0
 def collect_sentences(self, names):
     '''Collect positive and negative training sentences from articles.

     For every (subject, value) pair the subject's article is fetched and
     each sentence is classified: sentences containing a lemma of the
     prepared value are candidates, and among them the one sharing the
     most lemmas with the value is kept as the single positive example for
     that article; sentences with no value lemma become negatives.  For
     administrative-division predicates a predicate word must also occur
     in the sentence, since the value alone is too ambiguous there.

     :param names: iterable of (subject, value) pairs
     :return: (positive, negative) -- positive holds (sentence, value)
         tuples (the sentence together with the prepared value), negative
         holds plain sentences
     '''
     positive, negative = [], []
     # Predicates denoting administrative divisions: for these, matching
     # the value alone is not enough; a predicate word is also required.
     types = [
         'hrabstwo', 'gmina', 'prowincja',
         quote_plus('województwo'), 'powiat', 'region'
     ]
     for subject, value in names:
         try:
             article = get_article(subject)
         except Exception:
             # No article could be retrieved for this subject -- skip it.
             continue
         value = lt.prepare_value(value, self.predicate)
         best_match = (0, '')
         for sentence in article:
             lemmas = [word.lemma for word in sentence]
             if any(v in lemmas for v in value):
                 if self.predicate not in types or any(
                         p in lemmas for p in self.predicate_words):
                     # Prefer the sentence sharing the most lemmas with
                     # the value.
                     num_matches = len(set(lemmas) & set(value))
                     if num_matches > best_match[0]:
                         best_match = (num_matches, (sentence, value))
             else:
                 negative.append(sentence)
         if best_match[0]:
             positive.append(best_match[1])
     assert len(positive) > 10, 'Too little training examples.'
     return positive, negative
Ejemplo n.º 2
0
 def extract_sentences(self, entities):
     articles = prepare_articles(entities)
     extracted_sentences = defaultdict(list)
     if verbose:
         print 'Classifying sentences:'
     for entity in entities:
         try:
             article = get_article(entity)
         except:
             continue
         if not article:
             continue
         if verbose:
             print entity
         probabilities = [
             prob[1] for prob in self.classifier.predict_proba(
                 map(self.get_features, article))
         ]
         #for each article return all sentences with scores > confidence_level
         for sentence, p in izip(article, probabilities):
             if p > self.confidence_level:
                 extracted_sentences[entity].append(sentence)
                 if verbose:
                     print '***', '%.2f' % p, ' '.join(
                         [w.segment for w in sentence])
             elif verbose:
                 print '%.2f' % p, ' '.join([w.segment for w in sentence])
         if verbose:
             print
     return extracted_sentences
Ejemplo n.º 3
0
 def collect_sentences(self, names):
     """Collect positive and negative training sentences from articles.

     For every (subject, value) pair the subject's article is fetched and
     each sentence is classified: sentences containing a lemma of the
     prepared value are candidates, and the one sharing the most lemmas
     with the value becomes the article's single positive example;
     sentences with no value lemma become negatives.  For
     administrative-division predicates a predicate word must also occur
     in the sentence.

     :param names: iterable of (subject, value) pairs
     :return: (positive, negative) -- positive holds (sentence, value)
         tuples, negative holds plain sentences
     """
     positive, negative = [], []
     # Administrative-division predicates need a predicate word in the
     # sentence as well; the value alone is too ambiguous.
     types = ["hrabstwo", "gmina", "prowincja", quote_plus("województwo"), "powiat", "region"]
     for subject, value in names:
         try:
             article = get_article(subject)
         except Exception:
             # No article could be retrieved for this subject -- skip it.
             continue
         value = lt.prepare_value(value, self.predicate)
         best_match = (0, "")
         for sentence in article:
             lemmas = [word.lemma for word in sentence]
             if any(v in lemmas for v in value):
                 if self.predicate not in types or any(p in lemmas for p in self.predicate_words):
                     # Keep the sentence sharing the most lemmas with the value.
                     num_matches = len(set(lemmas) & set(value))
                     if num_matches > best_match[0]:
                         best_match = (num_matches, (sentence, value))
             else:
                 negative.append(sentence)
         if best_match[0]:
             positive.append(best_match[1])
     assert len(positive) > 10, "Too little training examples."
     return positive, negative
Ejemplo n.º 4
0
 def extract_sentences(self, entities):
     articles = prepare_articles(entities)
     extracted_sentences = defaultdict(list)
     if verbose:
         print "Classifying sentences:"
     for entity in entities:
         try:
             article = get_article(entity)
         except:
             continue
         if not article:
             continue
         if verbose:
             print entity
         probabilities = [prob[1] for prob in self.classifier.predict_proba(map(self.get_features, article))]
         # for each article return all sentences with scores > confidence_level
         for sentence, p in izip(article, probabilities):
             if p > self.confidence_level:
                 extracted_sentences[entity].append(sentence)
                 if verbose:
                     print "***", "%.2f" % p, " ".join([w.segment for w in sentence])
             elif verbose:
                 print "%.2f" % p, " ".join([w.segment for w in sentence])
         if verbose:
             print
     return extracted_sentences
            raise
    # NOTE(review): this span starts mid-function; the enclosing `try`
    # (presumably around a makedirs-style call) is outside the visible
    # chunk, so the bare `raise` above re-raises that handler's exception.
    # Open output files for this predicate's gold-standard test data.
    entities_f = open(tests_path + '%s/entities' % predicate, 'w')
    values_f = open(tests_path + '%s/values' % predicate, 'w')
    articles_f = open(tests_path + '%s/articles' % predicate, 'w')
    # Some predicates only make sense for entities of a given type;
    # restrict the candidate (subject, value) pairs accordingly.
    if predicate in type_restrictions:
        names = select_entities_of_type_in_relation(
            type_restrictions[predicate], predicate
        )
    else:
        names = select_all({'p': predicate})
    # Random sample of at most test_data_limit pairs.
    shuffle(names)
    names = names[: test_data_limit]
    subjects, objects = zip(*list(names))
    # Group all values by subject (a subject may have several values).
    values = defaultdict(list)
    for subject, value in names:
        values[subject].append(value)
    # Side effect only: fetch/cache the subjects' articles.
    prepare_articles(subjects)
    for subject, value in values.iteritems():
        try:
            article = get_article(subject)
        except:
            continue
        # Article file: header line (subject + prepared first value),
        # then one sentence per line, then a blank separator line.
        print >>articles_f, subject, lt.prepare_value(value[0], predicate)
        for sentence in article:
            sentence = [word.segment for word in sentence]
            print >>articles_f, ' '.join(sentence)
        print >>articles_f
        print >>entities_f, subject
        # Values file: subject plus the first value, spaces underscored.
        print >>values_f, subject, value[0].replace(' ', '_').encode('utf-8')

    except OSError as e:
        # NOTE(review): the matching `try` is outside the visible chunk
        # (presumably an os.makedirs call); an already-existing directory
        # is fine, anything else is re-raised.
        if e.errno != errno.EEXIST:
            raise
    # Open output files for this predicate's gold-standard test data.
    entities_f = open(tests_path + '%s/entities' % predicate, 'w')
    values_f = open(tests_path + '%s/values' % predicate, 'w')
    articles_f = open(tests_path + '%s/articles' % predicate, 'w')
    # Some predicates only make sense for entities of a given type;
    # restrict the candidate (subject, value) pairs accordingly.
    if predicate in type_restrictions:
        names = select_entities_of_type_in_relation(
            type_restrictions[predicate], predicate)
    else:
        names = select_all({'p': predicate})
    # Random sample of at most test_data_limit pairs.
    shuffle(names)
    names = names[:test_data_limit]
    subjects, objects = zip(*list(names))
    # Group all values by subject (a subject may have several values).
    values = defaultdict(list)
    for subject, value in names:
        values[subject].append(value)
    # Side effect only: fetch/cache the subjects' articles.
    prepare_articles(subjects)
    for subject, value in values.iteritems():
        try:
            article = get_article(subject)
        except:
            continue
        # Article file: header line (subject + prepared first value),
        # then one sentence per line, then a blank separator line.
        print >> articles_f, subject, lt.prepare_value(value[0], predicate)
        for sentence in article:
            sentence = [word.segment for word in sentence]
            print >> articles_f, ' '.join(sentence)
        print >> articles_f
        print >> entities_f, subject
        # Values file: subject plus the first value, spaces underscored.
        print >> values_f, subject, value[0].replace(' ', '_').encode('utf-8')