Ejemplo n.º 1
0
 def generate(self, dataset):
     """
     Add features about entity tokens that contain the word "protein".

     For every edge in ``dataset.edges()`` the tokens of the edge's sentence
     are scanned. A token qualifies when ``token.is_entity_part(edge.part)``
     is truthy and its lower-cased word contains ``'protein'``. For each
     qualifying token:

     * feature 78 is added when the first element of the token's
       ``dependency_from`` feature is the head token of ``edge.entity1``;
     * for each entry of the token's ``dependency_to`` feature, feature 79 is
       added when the entry's first element is that head token, followed by
       features 80-85 derived from the path between the token and the head
       token (``get_path``) and from the walks over it (``build_walks``).

     After the scan exactly one of feature 86 (a qualifying token was seen)
     or feature 87 (none was seen) is added for the edge.

     :param dataset: object whose ``edges()`` yields the edges to process
     """
     for edge in dataset.edges():
         entity_head = edge.entity1.head_token
         # Not used below; the lookup is kept so that an edge without a
         # second head token fails exactly where it did before.
         _second_head = edge.entity2.head_token
         seen_protein_word = False

         for token in edge.part.sentences[edge.sentence_id]:
             if not token.is_entity_part(edge.part):
                 continue
             if 'protein' not in token.word.lower():
                 continue
             seen_protein_word = True

             if token.features['dependency_from'][0] == entity_head:
                 self.add_to_feature_set(
                     edge, '78_dependency_from_entity_to_protein_word_[0]')

             for outgoing in token.features['dependency_to']:
                 if outgoing[0] == entity_head:
                     self.add_to_feature_set(
                         edge, '79_dependency_from_protein_word_to_entity_[0]')

                 # The path does not depend on `outgoing`, but it is only
                 # evaluated when the token has outgoing dependencies.
                 path = get_path(token, entity_head, edge.part,
                                 edge.sentence_id, self.graphs)
                 if path == []:
                     # No path reported: fall back to the two end points.
                     path = [token, entity_head]

                 for step_token in path:
                     self.add_to_feature_set(
                         edge,
                         '80_PWPE_bow_masked_' + step_token.masked_text(edge.part) + '_[0]')
                     self.add_to_feature_set(
                         edge,
                         '81_PWPE_pos_' + step_token.features['pos'] + '_[0]')
                     self.add_to_feature_set(
                         edge,
                         '82_PWPE_bow_' + step_token.word + '_[0]')

                 walks = build_walks(path)

                 # One feature per dependency label, then one for the
                 # concatenation of all labels of the walk.
                 for walk in walks:
                     labels = []
                     for dep in walk:
                         self.add_to_feature_set(
                             edge, '83_PWPE_dep_' + dep[1] + '_[0]')
                         labels.append(dep[1])
                     self.add_to_feature_set(
                         edge, '84_PWPE_dep_full+' + ''.join(labels) + '_[0]')

                 # Direction string per walk: 'F' where the walk step's first
                 # element equals the path token at the same position,
                 # otherwise 'R'.
                 step_count = len(path) - 1
                 for walk in walks:
                     directions = ''.join(
                         'F' if walk[pos][0] == path[pos] else 'R'
                         for pos in range(step_count)
                     )
                     self.add_to_feature_set(
                         edge, '85_PWPE_dep_gram_' + directions + '_[0]')

         if seen_protein_word:
             summary_feature = '86_protein_word_found_[0]'
         else:
             summary_feature = '87_protein_not_word_found_[0]'
         self.add_to_feature_set(edge, summary_feature)
Ejemplo n.º 2
0
    def path_grams(self, n, path, edge):
        """
        Add n-gram features over the dependency walks of `path` to `edge`.

        For every walk returned by ``build_walks(path)`` a direction string is
        built step by step: 'F' when the first element of the
        ``dependency_from`` feature of the walk step's token equals the path
        token at the same position, 'R' otherwise. As soon as at least `n`
        steps have been read, the last `n` steps form a window for which the
        following are emitted:

        * token features (via ``token_feature_generator.token_features``) for
          the path tokens inside the window, tagged with the window's
          direction string;
        * feature 57 for each dependency label in the window, with its offset;
        * feature 58 for the direction string plus all labels of the window;
        * feature 59 combining feature 58's name with every pair of annotated
          types of the first and last path token.

        Feature 60 (the direction string read so far) is added after every
        step, so each prefix of a walk's direction string becomes a feature.

        All positional look-ups use the step position within the path, never
        the index of the walk, so the window is well defined regardless of how
        many walks exist.

        :param n: window length in walk steps (presumably >= 1; not validated)
        :param path: sequence of tokens; its first and last items provide the
            annotated types used in feature 59
        :param edge: edge that receives the features
        """
        token1_anns = self.token_feature_generator.annotated_types(path[0], edge)
        token2_anns = self.token_feature_generator.annotated_types(path[-1], edge)
        self.build_walk_paths(path, edge)
        all_walks = build_walks(path)

        sentence = edge.part.sentences[edge.sentence_id]
        step_count = len(path) - 1

        for current_walk in all_walks:
            dir_grams = ''
            for j in range(step_count):
                if current_walk[j][0].features['dependency_from'][0] == path[j]:
                    dir_grams += 'F'
                else:
                    dir_grams += 'R'

                if j >= n - 1:
                    # Window covers walk steps start..j (n steps in total).
                    start = j - (n - 1)
                    style_gram = dir_grams[start:j + 1]
                    edge_gram = 'dep_gram_' + style_gram

                    # Path tokens start+1..j; the 'id' feature is used as a
                    # 1-based index into the sentence.
                    for path_token in path[start + 1:j + 1]:
                        token = sentence[path_token.features['id'] - 1]
                        self.token_feature_generator.token_features(
                            token, 'tok_' + style_gram, edge)

                    for k, step in enumerate(current_walk[start:j + 1]):
                        dep = step[1]
                        feature_name = '57_dep_' + style_gram + '_' + str(k) + '_' + dep + '_[0]'
                        self.add_to_feature_set(edge, feature_name)
                        edge_gram += '_' + dep

                    feature_name = '58_' + edge_gram + '_[0]'
                    self.add_to_feature_set(edge, feature_name)

                    for ann1 in token1_anns:
                        for ann2 in token2_anns:
                            feature_name = '59_' + ann1 + '_' + edge_gram + '_' + ann2 + '_[0]'
                            self.add_to_feature_set(edge, feature_name)

                feature_name = '60_edge_directions_' + dir_grams + '_[0]'
                self.add_to_feature_set(edge, feature_name)