Example #1
    def generate(self, corpus, f_set, use_gold, use_pred):
        assert not (use_gold and use_pred), "No support for both"

        self.extract_abbreviation_synonyms(corpus, use_gold, use_pred)

        for docid, document in corpus.documents.items():
            for edge in document.edges():

                sentence = edge.get_combined_sentence()

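                # All entities found in the edge's sentence(s), grouped by entity class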
                entities_in_sentences = edge.get_any_entities_in_sentences(predicted=use_pred)
                total_count = 0
                # Sort so that the features are created in a deterministic order
                for e_class_id in sorted(entities_in_sentences):
                    entities = entities_in_sentences[e_class_id]
                    # TODO this is wrong for other entity types not appearing in the edge
                    # TODO also, what if the same entity type appears at both ends of the same edge, as in a protein-protein relation? --> Just subtract the edge's own counts
                    individual_count = len(entities) - 1  # subtract 1, as one entity is already one of the edge's entities
                    assert individual_count >= 0
                    total_count += individual_count
                    self.add_with_value(f_set, edge, 'f_counts_individual', individual_count, 'int', 'individual', e_class_id)

                self.add_with_value(f_set, edge, 'f_counts_total', total_count, 'int', 'total (all classes)')

                entities_between_entities = edge.get_any_entities_between_entities(predicted=use_pred)
                total_count = 0
                # Sort so that the features are created in a deterministic order
                for e_class_id in sorted(entities_between_entities):
                    entities = entities_between_entities[e_class_id]
                    individual_count = len(entities)
                    total_count += individual_count
                    self.add_with_value(f_set, edge, 'f_counts_in_between_individual', individual_count, 'int', 'individual', e_class_id)

                self.add_with_value(f_set, edge, 'f_counts_in_between_total', total_count, 'int', 'total (all classes)')

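                # f_order fires when the entities' class ids already sort in lexicographic order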
                order = edge.entity1.class_id < edge.entity2.class_id
                if order:
                    self.add(f_set, edge, 'f_order')

                for token in sentence:
                    self.add(f_set, edge, 'f_bow', masked_text(token, edge.same_part, use_gold, use_pred, token_map=lambda t: t.features['lemma'], token_is_number_fun=lambda _: "NUM"))
                    self.add(f_set, edge, 'f_pos', token.features['coarsed_pos'])

                self.add_with_value(f_set, edge, 'f_tokens_count', len(sentence))

                # Remember, the edge's entities are sorted, i.e. e1.offset < e2.offset
                _e1_first_token_index = edge.entity1.tokens[0].features['tmp_id']
                _e2_last_token_index = edge.entity2.tokens[-1].features['tmp_id']
                assert _e1_first_token_index < _e2_last_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_first_token_index, _e2_last_token_index)

                self.add_with_value(f_set, edge, 'f_tokens_count_before', len(sentence[:_e1_first_token_index]))
                self.add_with_value(f_set, edge, 'f_tokens_count_after', len(sentence[(_e2_last_token_index+1):]))

                #

                if Part.is_negated(sentence):
                    self.add(f_set, edge, "f_sentence_is_negated")

                #

                verbs = set(Part.get_main_verbs(sentence, token_map=lambda t: t.features["lemma"]))

                if len(verbs) == 0:
                    self.add(f_set, edge, "f_main_verbs", "NO_MAIN_VERB")
                else:
                    for v in verbs:
                        self.add(f_set, edge, "f_main_verbs", v)

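                # Count, per entity class, how often each entity (keyed via entity2key) occurs anywhere in the document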
                counters = {}
                for part in document:
                    for entity in (part.annotations if use_gold else part.predicted_annotations):
                        ent_type_counter = counters.get(entity.class_id, Counter())
                        ent_key = __class__.entity2key(entity)
                        ent_type_counter.update([ent_key])
                        counters[entity.class_id] = ent_type_counter

                e1_key = __class__.entity2key(edge.entity1)
                e1_count = counters[edge.entity1.class_id][e1_key]
                self.add_with_value(f_set, edge, 'f_entity1_count', e1_count)

                e2_key = __class__.entity2key(edge.entity2)
                e2_count = counters[edge.entity2.class_id][e2_key]
                self.add_with_value(f_set, edge, 'f_entity2_count', e2_count)

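                # For each entity pair (keyed via edge2key), count the distinct sentences in which both entities appear together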
                together_counter = Counter()
                diff_sentences = {}
                for aux_edge in document.edges():
                    if aux_edge.e1_sentence_id == aux_edge.e2_sentence_id:
                        together_key = __class__.edge2key(aux_edge)

                        sents = diff_sentences.get(together_key, [])
                        if aux_edge.e1_sentence_id not in sents:
                            sents.append(aux_edge.e1_sentence_id)
                            diff_sentences[together_key] = sents
                            together_counter.update([together_key])

                together_key = __class__.edge2key(edge)
                together_count = together_counter[together_key]
                if together_count > 0:
                    self.add_with_value(f_set, edge, 'f_diff_sents_together_count', together_count)
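
The last block above counts, for each entity pair, the number of distinct sentences in which the pair co-occurs, and emits f_diff_sents_together_count only when that count is positive. A minimal standalone sketch of the counting pattern, with hypothetical (pair_key, sentence_id) records standing in for edge2key(aux_edge) and the sentence ids:

    from collections import Counter

    # Hypothetical records; each pair_key stands in for __class__.edge2key(aux_edge).
    pairs = [
        ("BRCA1|cancer", 0),
        ("BRCA1|cancer", 0),  # same pair in the same sentence: counted only once
        ("BRCA1|cancer", 3),
        ("TP53|cancer", 1),
    ]

    together_counter = Counter()
    diff_sentences = {}
    for together_key, sentence_id in pairs:
        sents = diff_sentences.setdefault(together_key, [])
        if sentence_id not in sents:
            sents.append(sentence_id)
            together_counter.update([together_key])

    print(together_counter)  # Counter({'BRCA1|cancer': 2, 'TP53|cancer': 1})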
Example #2
    def generate(self, corpus, f_set, use_gold, use_pred):
        assert not (use_gold and use_pred), "No support for both"

        for docid, document in corpus.documents.items():
            for edge in document.edges():
                sentence = edge.get_combined_sentence()

                # Remember, the edge's entities are sorted, i.e. e1.offset < e2.offset
                _e1_last_token_index = edge.entity1.tokens[-1].features['tmp_id']
                _e2_first_token_index = edge.entity2.tokens[0].features['tmp_id']
                assert _e1_last_token_index < _e2_first_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_last_token_index, _e2_first_token_index)
                _e1_head_token_index = edge.entity1.head_token.features['tmp_id']
                _e2_head_token_index = edge.entity2.head_token.features['tmp_id']
                assert _e1_head_token_index < _e2_head_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_head_token_index, _e2_head_token_index)

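                # Token paths for the n-gram features below: LD covers the surface tokens
                # between the two entities; PD is the shortest path between the entity head
                # tokens (the expansions of the names 'LD'/'PD' are an assumption)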
                dependency_paths = [
                    #
                    # Commented out for now: they do not seem to provide any performance benefit, yet they add running time
                    #
                    # Path(
                    #     name='OW1',
                    #     tokens=edge.entity1.prev_tokens(sentence, n=self.h_ow_size, include_ent_first_token=True, mk_reversed=True),
                    #     is_edge_type_constant=True,
                    #     there_is_target=False,
                    #     default_n_grams=self.h_ow_grams,
                    # ),
                    # Path(
                    #     name='IW1',
                    #     tokens=edge.entity1.next_tokens(sentence, n=self.h_iw_size, include_ent_last_token=True),
                    #     is_edge_type_constant=True,
                    #     there_is_target=False,
                    #     default_n_grams=self.h_iw_grams,
                    # ),
                    #
                    # Path(
                    #     name='IW2',
                    #     tokens=edge.entity2.prev_tokens(sentence, n=self.h_iw_size, include_ent_first_token=True, mk_reversed=True),
                    #     is_edge_type_constant=True,
                    #     there_is_target=False,
                    #     default_n_grams=self.h_iw_grams,
                    # ),
                    # Path(
                    #     name='OW2',
                    #     tokens=edge.entity2.next_tokens(sentence, n=self.h_ow_size, include_ent_last_token=True),
                    #     is_edge_type_constant=True,
                    #     there_is_target=False,
                    #     default_n_grams=self.h_ow_grams,
                    # ),
                    Path(
                        name='LD',
                        tokens=sentence[_e1_last_token_index:_e2_first_token_index + 1],
                        is_edge_type_constant=True,
                        default_n_grams=self.h_ld_grams,
                    ),
                    compute_shortest_path(sentence, _e1_head_token_index, _e2_head_token_index)
                        .change_name('PD')
                        .change_default_n_grams(self.h_pd_grams),
                ]

                for dep_path in dependency_paths:
                    dep_type = dep_path.name

                    for n_gram in dep_path.default_n_grams:
                        self.add_n_grams(f_set, use_gold, use_pred, edge,
                                         dep_path, dep_type, n_gram)

                    count = len(dep_path.middle)
                    count_without_punct = len([node for node in dep_path.middle if not node.token.features['is_punct']])
                    self.add_with_value(f_set, edge, self.f('f_XX_tokens_count', dep_type), count, dep_type)
                    self.add_with_value(f_set, edge, self.f('f_XX_tokens_count_without_punct', dep_type), count_without_punct, dep_type)

                    if Part.is_negated([node.token for node in dep_path.middle]):
                        self.add(f_set, edge, self.f('f_XX_is_negated', dep_type), dep_type)
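
The 'LD' path above slices the combined sentence from entity1's last token through entity2's first token and then feeds that window to n-gram features. A minimal standalone sketch of the slice plus a plain n-gram helper, assuming a sentence is just a list of token strings and 'tmp_id' is a token's index (the token list and the n_grams helper here are hypothetical; the real Path/add_n_grams machinery is not shown):

    # Hypothetical token list; in the real code, tokens carry features such as 'tmp_id'.
    sentence = ["BRCA1", "strongly", "inhibits", "growth", "of", "tumors"]
    _e1_last_token_index = 0   # last token of entity1 ("BRCA1")
    _e2_first_token_index = 5  # first token of entity2 ("tumors")

    # Same slice as Path(name='LD', ...): both boundary tokens are included.
    ld_tokens = sentence[_e1_last_token_index:_e2_first_token_index + 1]

    def n_grams(tokens, n):
        # All contiguous n-grams over the window.
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    print(ld_tokens)                  # ['BRCA1', 'strongly', 'inhibits', 'growth', 'of', 'tumors']
    print(n_grams(ld_tokens, 2)[:3])  # [('BRCA1', 'strongly'), ('strongly', 'inhibits'), ('inhibits', 'growth')]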