Ejemplo n.º 1
0
    def test_main_verbs(self):
        """Print every computed sentence followed by the set of its main-verb lemmas.

        Iterates over the 3-tuples in ``self.computed_sentences``; only the
        third element (the sentence) is used. Nothing is asserted: the output
        is meant for manual inspection.
        """

        def lemma_of(token):
            # Main verbs are reported by lemma rather than by surface form.
            return token.features["lemma"]

        for _first, _second, current_sentence in self.computed_sentences:
            print()
            print(current_sentence)
            main_verbs = Part.get_main_verbs(current_sentence, token_map=lemma_of)
            unique_verbs = set(main_verbs)
            print("\t", unique_verbs)
Ejemplo n.º 2
0
    def generate(self, corpus, f_set, use_gold, use_pred):
        """
        Add sentence-level features to every edge of every document in `corpus`.

        For each edge the features are emitted in this fixed order (through
        `self.add` / `self.add_with_value`):

        1. per-class and total counts of the other entities found in the
           edge's sentence(s),
        2. per-class and total counts of the entities located between the
           edge's two entities,
        3. `f_order` when entity1's class id is smaller than entity2's,
        4. bag-of-words (masked lemma) and coarse POS for every token,
        5. token counts: whole sentence, before entity1, after entity2,
        6. `f_sentence_is_negated` when `Part.is_negated` says so,
        7. main verbs (lemmas), or the `NO_MAIN_VERB` marker,
        8. how often each edge entity's key occurs in the whole document,
        9. in how many different sentences the edge's pair of keys occurs
           together (only emitted when > 0).

        :param corpus: object whose `documents` mapping yields (docid, document)
        :param f_set: feature set handed over to `self.add*` unchanged
        :param use_gold: count entities from `part.annotations` when true,
            otherwise from `part.predicted_annotations`
        :param use_pred: forwarded as `predicted=` to the edge entity queries
        :raises AssertionError: if both `use_gold` and `use_pred` are set, if a
            per-class sentence count is negative, or if entity1 does not start
            before the end of entity2
        """
        assert not (use_gold and use_pred), "No support for both"

        # Loop-invariant token mappers, defined once instead of per token/edge.
        def lemma_of(token):
            return token.features['lemma']

        def number_mask(_):
            return "NUM"

        self.extract_abbreviation_synonyms(corpus, use_gold, use_pred)

        for docid, document in corpus.documents.items():
            edges = list(document.edges())
            if not edges:
                # As before: nothing at all is computed for edge-less documents
                continue

            # These statistics describe the whole document, not a single edge,
            # so they are computed once here rather than once per edge.
            # NOTE(review): assumes entity2key/edge2key and the sentence ids do
            # not depend on features added while processing edges -- confirm.
            mention_counters = self._count_entity_mentions(document, use_gold)
            together_counter = self._count_sentences_together(edges)

            for edge in edges:

                sentence = edge.get_combined_sentence()

                entities_in_sentences = edge.get_any_entities_in_sentences(predicted=use_pred)
                total_count = 0
                # We sort to have a deterministic order creation of the features
                for e_class_id in sorted(entities_in_sentences):
                    entities = entities_in_sentences[e_class_id]
                    # TODO this is wrong for other entity types not appearing in the edge
                    # TODO also what about if the same entity type appears in both ends of the same edge? as in a protein-protein relation --> Just subtract the counts of the edge
                    individual_count = len(entities) - 1  # subtract 1, as one is already one of the edge's entities
                    assert individual_count >= 0
                    total_count += individual_count
                    self.add_with_value(f_set, edge, 'f_counts_individual', individual_count, 'int', 'individual', e_class_id)

                self.add_with_value(f_set, edge, 'f_counts_total', total_count, 'int', 'total (all classes)')

                entities_between_entities = edge.get_any_entities_between_entities(predicted=use_pred)
                total_count = 0
                # We sort to have a deterministic order creation of the features
                for e_class_id in sorted(entities_between_entities):
                    individual_count = len(entities_between_entities[e_class_id])
                    total_count += individual_count
                    self.add_with_value(f_set, edge, 'f_counts_in_between_individual', individual_count, 'int', 'individual', e_class_id)

                self.add_with_value(f_set, edge, 'f_counts_in_between_total', total_count, 'int', 'total (all classes)')

                if edge.entity1.class_id < edge.entity2.class_id:
                    self.add(f_set, edge, 'f_order')

                for token in sentence:
                    self.add(f_set, edge, 'f_bow', masked_text(token, edge.same_part, use_gold, use_pred, token_map=lemma_of, token_is_number_fun=number_mask))
                    self.add(f_set, edge, 'f_pos', token.features['coarsed_pos'])

                self.add_with_value(f_set, edge, 'f_tokens_count', len(sentence))

                # Remember, the edge's entities are sorted, i.e. e1.offset < e2.offset
                _e1_first_token_index = edge.entity1.tokens[0].features['tmp_id']
                _e2_last_token_index = edge.entity2.tokens[-1].features['tmp_id']
                assert _e1_first_token_index < _e2_last_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_first_token_index, _e2_last_token_index)

                self.add_with_value(f_set, edge, 'f_tokens_count_before', len(sentence[:_e1_first_token_index]))
                self.add_with_value(f_set, edge, 'f_tokens_count_after', len(sentence[(_e2_last_token_index+1):]))

                if Part.is_negated(sentence):
                    self.add(f_set, edge, "f_sentence_is_negated")

                verbs = set(Part.get_main_verbs(sentence, token_map=lemma_of))

                if not verbs:
                    self.add(f_set, edge, "f_main_verbs", "NO_MAIN_VERB")
                else:
                    for v in verbs:
                        self.add(f_set, edge, "f_main_verbs", v)

                # A class id absent from the counted annotations still raises
                # KeyError here, exactly as the previous implementation did.
                e1_key = __class__.entity2key(edge.entity1)
                e1_count = mention_counters[edge.entity1.class_id][e1_key]
                self.add_with_value(f_set, edge, 'f_entity1_count', e1_count)

                e2_key = __class__.entity2key(edge.entity2)
                e2_count = mention_counters[edge.entity2.class_id][e2_key]
                self.add_with_value(f_set, edge, 'f_entity2_count', e2_count)

                together_count = together_counter[__class__.edge2key(edge)]
                if together_count > 0:
                    self.add_with_value(f_set, edge, 'f_diff_sents_together_count', together_count)

    def _count_entity_mentions(self, document, use_gold):
        """Return {class_id: Counter(entity key -> occurrences)} over all parts of `document`."""
        counters = {}
        for part in document:
            for entity in (part.annotations if use_gold else part.predicted_annotations):
                class_counter = counters.setdefault(entity.class_id, Counter())
                class_counter[__class__.entity2key(entity)] += 1
        return counters

    def _count_sentences_together(self, edges):
        """Return Counter(edge key -> number of different sentences holding both entities of that key)."""
        sentences_by_key = {}
        for aux_edge in edges:
            # Only edges whose two entities share a sentence are considered
            if aux_edge.e1_sentence_id == aux_edge.e2_sentence_id:
                seen_sentences = sentences_by_key.setdefault(__class__.edge2key(aux_edge), [])
                if aux_edge.e1_sentence_id not in seen_sentences:
                    seen_sentences.append(aux_edge.e1_sentence_id)
        # A Counter so that keys never seen together read as 0
        return Counter({key: len(sents) for key, sents in sentences_by_key.items()})