def generate(self, dataset):
    """Add protein-word dependency features (feature ids 78-87) to each edge.

    For every edge in *dataset*, scans the edge's sentence for entity tokens
    whose surface form contains the substring ``'protein'``.  For each such
    token it emits:

    * 78/79 - direct dependency links between the protein word and
      ``entity1``'s head token (either direction);
    * 80-82 - masked bag-of-words, POS and raw bag-of-words features for each
      token on the dependency path between the protein word and the head;
    * 83/84 - individual and concatenated dependency labels of every walk
      over that path;
    * 85 - an F/R direction string per walk;
    * 86/87 - a per-edge flag for whether any protein word was found.
    """
    for edge in dataset.edges():
        head1 = edge.entity1.head_token
        sentence = edge.part.sentences[edge.sentence_id]
        protein_word_found = False
        for token in sentence:
            # Only entity tokens whose surface form contains 'protein'.
            if not (token.is_entity_part(edge.part)
                    and 'protein' in token.word.lower()):
                continue
            protein_word_found = True

            # 78: the protein word's dependency head is entity1's head token.
            if token.features['dependency_from'][0] == head1:
                self.add_to_feature_set(
                    edge, '78_dependency_from_entity_to_protein_word_[0]')
            # 79: entity1's head token is a dependent of the protein word.
            for dependency_to in token.features['dependency_to']:
                if dependency_to[0] == head1:
                    self.add_to_feature_set(
                        edge, '79_dependency_from_protein_word_to_entity_[0]')

            path = get_path(token, head1, edge.part, edge.sentence_id,
                            self.graphs)
            if not path:
                # No path in the dependency graph: fall back to the two
                # endpoint tokens so the features below still fire.
                path = [token, head1]

            for tok in path:
                # 80-82: masked BOW, POS and raw BOW for each path token.
                self.add_to_feature_set(
                    edge,
                    '80_PWPE_bow_masked_' + tok.masked_text(edge.part) + '_[0]')
                self.add_to_feature_set(
                    edge, '81_PWPE_pos_' + tok.features['pos'] + '_[0]')
                self.add_to_feature_set(
                    edge, '82_PWPE_bow_' + tok.word + '_[0]')

            all_walks = build_walks(path)
            for dep_list in all_walks:
                dep_path = ''
                for dep in dep_list:
                    # 83: each individual dependency label on the walk.
                    self.add_to_feature_set(
                        edge, '83_' + 'PWPE_dep_' + dep[1] + '_[0]')
                    dep_path += dep[1]
                # 84: full concatenated label sequence of the walk.
                # NOTE(review): the '+' after 'full' looks like a typo for
                # '_', but the exact string is kept — changing it would
                # silently rename an existing feature.
                self.add_to_feature_set(
                    edge, '84_PWPE_dep_full+' + dep_path + '_[0]')

            for cur_walk in all_walks:
                # 85: F if the walk's i-th dependency starts at the i-th
                # path token (forward), else R.
                # NOTE(review): the sibling path_grams() compares against
                # features['dependency_from'][0] instead — confirm which
                # comparison is intended upstream.
                dir_grams = ''
                for i in range(len(path) - 1):
                    dir_grams += 'F' if cur_walk[i][0] == path[i] else 'R'
                self.add_to_feature_set(
                    edge, '85_PWPE_dep_gram_' + dir_grams + '_[0]')

        # 86/87: per-edge flag — did any protein word occur in the sentence?
        if protein_word_found:
            self.add_to_feature_set(edge, '86_protein_word_found_[0]')
        else:
            self.add_to_feature_set(edge, '87_protein_not_word_found_[0]')
def path_grams(self, n, path, edge):
    """Add dependency n-gram features (ids 57-60) for a dependency *path*.

    ``path`` is a list of tokens; ``n`` is the gram size.  For every walk
    over the path (via ``build_walks``) an F/R direction string is built,
    and n-gram windows of dependency labels / directions are emitted as
    features on *edge*.

    NOTE(review): the source of this block was whitespace-mangled; the
    nesting of the ``if i >= n-1`` suite is reconstructed and should be
    verified against the original.  Also, ``i`` indexes *walks* while ``j``
    indexes positions along the path, yet ``i`` is used in the
    position-dependent expressions below (``path[i]``, ``i >= n-1``,
    ``dir_grams[i-n+1:i+1]``, ``path[i-(n-1)+k]``) — this looks like i/j
    index confusion; confirm against upstream before relying on it.
    """
    token1 = path[0]
    token2 = path[-1]
    # Annotated entity types of the two path endpoints (used in feature 59).
    token1_anns = self.token_feature_generator.annotated_types(token1, edge)
    token2_anns = self.token_feature_generator.annotated_types(token2, edge)
    self.build_walk_paths(path, edge)
    all_walks = build_walks(path)
    for i in range(len(all_walks)):
        dir_grams = ''
        for j in range(len(path)-1):
            current_walk = all_walks[i]  # loop-invariant w.r.t. j
            # F = dependency points forward along the path, R = reverse.
            if current_walk[j][0].features['dependency_from'][0]==path[i]:
                dir_grams += 'F'
            else:
                dir_grams += 'R'
            if i>=n-1:
                # Direction sub-string for the current n-gram window.
                style_gram = ''
                style_gram = dir_grams[i-n+1:i+1]
                edge_gram = 'dep_gram_' + style_gram
                # Token-level features for the interior tokens of the window
                # (tokens are looked up by their 1-based 'id' feature).
                for k in range(1, n):
                    token = edge.part.sentences[edge.sentence_id][(path[i-(n-1)+k]).features['id']-1]
                    self.token_feature_generator.token_features(token, 'tok_'+style_gram, edge)
                # 57: each dependency label in the window, keyed by position.
                for k in range(n):
                    dep = current_walk[i-(n-1)+k][1]
                    feature_name = '57_dep_'+style_gram+'_'+str(k)+'_'+dep+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    edge_gram += '_' + dep
                # 58: the full direction+label n-gram.
                feature_name = '58_'+edge_gram+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                # 59: the n-gram bracketed by the endpoint annotation types.
                for ann1 in token1_anns:
                    for ann2 in token2_anns:
                        feature_name = '59_'+ann1+'_'+edge_gram+'_'+ann2+'_[0]'
                        self.add_to_feature_set(edge, feature_name)
        # 60: the complete direction string of this walk.
        feature_name = '60_edge_directions_' + dir_grams + '_[0]'
        self.add_to_feature_set(edge, feature_name)