Code example #1
File: sentence.py Project: wy692/relna
def __init__(self, feature_set, training_mode=True):
    self.feature_set = feature_set
    """the feature set for the dataset"""
    self.training_mode = training_mode
    """whether the mode is training or testing"""
    self.token_feature_generator = TokenFeatureGenerator(feature_set, training_mode)
    """an instance of TokenFeatureGenerator"""
Code example #2
def __init__(self, feature_set, depth=3, training_mode=True):
    self.feature_set = feature_set
    """the feature set for the dataset"""
    self.depth = depth
    """the depth of the chain to generate"""
    self.training_mode = training_mode
    """whether the mode is training or testing"""
    self.stemmer = PorterStemmer()
    """an instance of the PorterStemmer"""
    self.token_feature_generator = TokenFeatureGenerator(feature_set, training_mode)
    """an instance of TokenFeatureGenerator"""
Code example #3
def __init__(self, feature_set, graphs, training_mode=True):
    self.feature_set = feature_set
    """the feature set for the dataset"""
    self.graphs = graphs
    """a dictionary of graphs to avoid recomputation of path"""
    self.training_mode = training_mode
    """whether the mode is training or testing"""
    self.stemmer = PorterStemmer()
    """an instance of PorterStemmer"""
    self.token_feature_generator = TokenFeatureGenerator(feature_set, training_mode)
    """an instance of TokenFeatureGenerator"""
Code example #4
File: sentence.py Project: Rostlab/relna
class SentenceFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate features for each sentence containing an edge
    """

    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.token_feature_generator = TokenFeatureGenerator(
            feature_set, training_mode)
        """an instance of TokenFeatureGenerator"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            # count how many tokens of each annotated type the sentence contains
            text_count = {}
            for token in sentence:
                ann_types = self.token_feature_generator.annotated_types(token, edge)
                for ann in ann_types:
                    text_count[ann] = text_count.get(ann, 0) + 1
            # one count feature per annotated type, named '5_<type>_[0]'
            for key, value in text_count.items():
                feature_name = '5_' + key + '_[0]'
                self.add_to_feature_set(edge, feature_name, value=value)
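
The tallying loop in generate is a plain counting idiom. Below is a minimal, self-contained sketch of the same pattern, with hard-coded strings standing in for the annotated types that TokenFeatureGenerator.annotated_types would return (the relna dataset objects are not reproduced here):

from collections import Counter

# hypothetical per-token annotated types; in relna these come from
# TokenFeatureGenerator.annotated_types(token, edge)
per_token_types = [['protein'], ['protein', 'transcription_factor'], [], ['protein']]

text_count = {}
for ann_types in per_token_types:
    for ann in ann_types:
        text_count[ann] = text_count.get(ann, 0) + 1

# the same tally with collections.Counter
assert text_count == dict(Counter(ann for anns in per_token_types for ann in anns))

# feature names follow the generator's '5_<type>_[0]' scheme
for key, value in text_count.items():
    print('5_' + key + '_[0]', value)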
Code example #5
class EntityHeadTokenChainFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate chains of dependencies from the token of a given depth
    """

    def __init__(self, feature_set, depth=3, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.depth = depth
        """the depth of the chain to generate"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.token_feature_generator = TokenFeatureGenerator(feature_set, training_mode)
        """an instance of TokenFeatureGenerator"""

    def generate(self, dataset):
        for edge in dataset.edges():
            head1 = edge.entity1.head_token
            head2 = edge.entity2.head_token
            self.build_chains(head1, edge.part.sentences[edge.sentence_id], edge, 'entity1_', '', self.depth)
            self.build_chains(head2, edge.part.sentences[edge.sentence_id], edge, 'entity2_', '', self.depth)
            self.build_token_features(edge)
            self.entity_combination(edge)

    def build_token_features(self, edge):
        sentence = edge.part.sentences[edge.sentence_id]
        for token in sentence:
            if token.is_entity_part(edge.part):
                if token.get_entity(edge.part)==edge.entity1:
                    self.token_feature_generator.token_features(token, 'e1_', edge)
                if token.get_entity(edge.part)==edge.entity2:
                    self.token_feature_generator.token_features(token, 'e2_', edge)

    def build_chains(self, token, sentence, edge, prefix, chain, depth_left):
        if depth_left == 0:
            return
        depth_string = 'dist_'+str(depth_left)+'_'
        # forward step: follow the dependency from this token up to its governor
        feature_name_1 = '19_'+prefix+'dep_'+depth_string+'from_'+token.features['dep']+'_[0]'
        feature_name_2 = '20_'+prefix+'chain_dep_dist_'+str(depth_left)+'_'+chain+'-fw_'+token.features['dep']+'_[0]'
        self.add_to_feature_set(edge, feature_name_1)
        self.add_to_feature_set(edge, feature_name_2)
        self.linear_order_features(prefix+depth_string, token.features['dependency_from'][0], edge, sentence)
        self.build_chains(token.features['dependency_from'][0], sentence, edge, prefix, chain+'-fw', depth_left-1)

        # reverse steps: descend into each token that depends on this one
        for dependency in token.features['dependency_to']:
            feature_name_1 = '21_'+prefix+'dep_dist_dist_'+str(depth_left)+'_to_'+dependency[1]+'_[0]'
            feature_name_2 = '22_'+prefix+'chain_dep_dist_'+str(depth_left)+'_'+chain+'-rv_'+dependency[1]+'_[0]'
            self.add_to_feature_set(edge, feature_name_1)
            self.add_to_feature_set(edge, feature_name_2)
            self.linear_order_features(prefix+'dist_'+str(depth_left)+'_', dependency[0], edge, sentence)
            self.build_chains(dependency[0], sentence, edge, prefix, chain+'-rv', depth_left-1)

    def linear_order_features(self, prefix, token, edge, sentence):
        feature_name_1 = '23_' + prefix + 'txt_' + token.word + '_[0]'
        feature_name_2 = '24_' + prefix + 'pos_' + token.features['pos'] + '_[0]'
        feature_name_3 = '25_' + prefix + 'given_[0]'
        feature_name_4 = '26_' + prefix + 'txt_' + token.masked_text(edge.part) + '_[0]'
        feature_name_5 = '27_' + prefix + 'ann_type_entity_[0]'
        self.add_to_feature_set(edge, feature_name_1)
        self.add_to_feature_set(edge, feature_name_2)
        self.add_to_feature_set(edge, feature_name_3)
        self.add_to_feature_set(edge, feature_name_4)
        if token.is_entity_part(edge.part):
            entity = token.get_entity(edge.part)
            feature_name_6 = '28_' + prefix + 'ann_type_' + entity.class_id + '_[0]'
            self.add_to_feature_set(edge, feature_name_5)
            self.add_to_feature_set(edge, feature_name_6)

    def entity_combination(self, edge):
        feature_name = '29_entity1_'+edge.entity1.class_id+'_entity2_'+edge.entity2.class_id+'_[0]'
        self.add_to_feature_set(edge, feature_name)
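
To make the recursion in build_chains concrete, here is a minimal, self-contained sketch of the same forward/reverse walk over a toy dependency structure; ToyToken and its governor/dependents fields are illustrative stand-ins for relna's token API, not part of the project:

class ToyToken:
    """Stand-in for a parsed token: one incoming relation, one governor, many dependents."""
    def __init__(self, dep_label, governor=None):
        self.dep_label = dep_label      # analogue of token.features['dep']
        self.governor = governor        # analogue of token.features['dependency_from'][0]
        self.dependents = []            # analogue of token.features['dependency_to']

def build_chains(token, chain, depth_left, out):
    if depth_left == 0:
        return
    # forward step: record this token's incoming relation, then climb to its governor
    out.append(chain + '-fw_' + token.dep_label)
    if token.governor is not None:
        build_chains(token.governor, chain + '-fw', depth_left - 1, out)
    # reverse steps: record each dependent's relation, then descend into it
    for dep in token.dependents:
        out.append(chain + '-rv_' + dep.dep_label)
        build_chains(dep, chain + '-rv', depth_left - 1, out)

# toy parse: root <- verb <- head, with one modifier hanging off the head
root = ToyToken('root')
verb = ToyToken('nsubj', governor=root)
head = ToyToken('nmod', governor=verb)
head.dependents.append(ToyToken('amod', governor=head))

chains = []
build_chains(head, '', 3, chains)
print(chains)  # ['-fw_nmod', '-fw-fw_nsubj', '-fw-fw-fw_root', '-rv_amod', ...]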
Code example #6
class PathFeatureGenerator(EdgeFeatureGenerator):
    """
    The length of the path from entity 1 to entity 2 and token features for the
    two tokens at the terminal of the path
    """
    def __init__(self, feature_set, graphs, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.graphs = graphs
        """a dictionary of graphs to avoid recomputation of path"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of PorterStemmer"""
        self.token_feature_generator = TokenFeatureGenerator(feature_set, training_mode)
        """an instance of TokenFeatureGenerator"""

    def generate(self, dataset):
        for edge in dataset.edges():
            head1 = edge.entity1.head_token
            head2 = edge.entity2.head_token
            path = get_path(head1, head2, edge.part, edge.sentence_id, self.graphs)
            if not path:
                # fall back to the two entity head tokens when no path is found
                path = [head1, head2]
            self.path_length_features(path, edge)
            self.token_feature_generator.token_features(path[0], 'token_term_1_', edge)
            self.token_feature_generator.token_features(path[-1], 'token_term_2_', edge)
            self.path_dependency_features(path, edge)
            base_words = ['interact', 'bind', 'coactivator', 'complex', 'mediate']
            words = [self.stemmer.stem(word) for word in base_words]
            self.path_constituents(path, edge, words)
            self.path_grams(2, path, edge)
            self.path_grams(3, path, edge)
            self.path_grams(4, path, edge)
            self.path_edge_features(path, edge)

    def path_length_features(self, path, edge):
        feature_name_1 = '45_len_tokens_' + str(len(path)) + '_[0]'
        feature_name_2 = '46_len_[0]'
        self.add_to_feature_set(edge, feature_name_1)
        self.add_to_feature_set(edge, feature_name_2, len(path))

    def path_constituents(self, path, edge, words):
        for token in path:
            if self.stemmer.stem(token.word) in words:
                feature_name_1 = '47_word_in_path_' + self.stemmer.stem(token.word) + '_[0]'
                self.add_to_feature_set(edge, feature_name_1)

    def path_dependency_features(self, path, edge):
        for i in range(len(path)-1):
            token1 = path[i]
            token2 = path[i+1]
            for dep in token1.features['dependency_to']:
                if dep[0]==token2:
                    feature_name = '48_dep_'+dep[1]+'_forward_[0]'
                    self.add_to_feature_set(edge, feature_name)

            for dep in token2.features['dependency_to']:
                if dep[0]==token1:
                    feature_name = '49_dep_'+dep[1]+'_reverse_[0]'
                    self.add_to_feature_set(edge, feature_name)

        for i in range(1, len(path)-1):
            token = path[i]
            feature_name_1 = '50_internal_pos_' + token.features['pos'] + '_[0]'
            feature_name_2 = '51_internal_masked_txt_' + token.masked_text(edge.part) + '_[0]'
            feature_name_3 = '52_internal_txt_' + token.word + '_[0]'
            feature_name_4 = '53_internal_stem_' + self.stemmer.stem(token.word) + '_[0]'
            self.add_to_feature_set(edge, feature_name_1)
            self.add_to_feature_set(edge, feature_name_2)
            self.add_to_feature_set(edge, feature_name_3)
            self.add_to_feature_set(edge, feature_name_4)

        for i in range(2, len(path)-1):
            token1 = path[i]
            token2 = path[i+1]
            for dep in token1.features['dependency_to']:
                if dep[0]==token2:
                    feature_name = '54_internal_dep_'+dep[1]+'_forward_[0]'
                    self.add_to_feature_set(edge, feature_name)

            for dep in token2.features['dependency_to']:
                if dep[0]==token1:
                    feature_name = '55_internal_dep_'+dep[1]+'_reverse_[0]'
                    self.add_to_feature_set(edge, feature_name)

    def build_walk_paths(self, path, edge):
        internal_types = ''
        for token in path:
            ann_types = self.token_feature_generator.annotated_types(token, edge)
            for ann in ann_types:
                internal_types += '_'+ann
            internal_types += '_'
            feature_name = '56_token_path'+internal_types+'_[0]'
            self.add_to_feature_set(edge, feature_name)

    def path_grams(self, n, path, edge):
        token1 = path[0]
        token2 = path[-1]
        token1_anns = self.token_feature_generator.annotated_types(token1, edge)
        token2_anns = self.token_feature_generator.annotated_types(token2, edge)
        self.build_walk_paths(path, edge)
        all_walks = build_walks(path)

        for current_walk in all_walks:
            dir_grams = ''
            for j in range(len(path)-1):
                # 'F' if this hop follows the edge from dependent to governor,
                # 'R' if it runs against it
                if current_walk[j][0].features['dependency_from'][0] == path[j]:
                    dir_grams += 'F'
                else:
                    dir_grams += 'R'
                if j >= n-1:
                    # n-character window of hop directions ending at position j
                    style_gram = dir_grams[j-n+1:j+1]
                    edge_gram = 'dep_gram_' + style_gram

                    for k in range(1, n):
                        token = edge.part.sentences[edge.sentence_id][(path[j-(n-1)+k]).features['id']-1]
                        self.token_feature_generator.token_features(token, 'tok_'+style_gram, edge)

                    for k in range(n):
                        dep = current_walk[j-(n-1)+k][1]
                        feature_name = '57_dep_'+style_gram+'_'+str(k)+'_'+dep+'_[0]'
                        self.add_to_feature_set(edge, feature_name)
                        edge_gram += '_' + dep

                    feature_name = '58_'+edge_gram+'_[0]'
                    self.add_to_feature_set(edge, feature_name)

                    for ann1 in token1_anns:
                        for ann2 in token2_anns:
                            feature_name = '59_'+ann1+'_'+edge_gram+'_'+ann2+'_[0]'
                            self.add_to_feature_set(edge, feature_name)
            # full direction string for this walk
            feature_name = '60_edge_directions_' + dir_grams + '_[0]'
            self.add_to_feature_set(edge, feature_name)

    def path_edge_features(self, path, edge):
        # collect the incoming dependency of every token along the path
        dependency_list = []
        for i in range(len(path)-1):
            token1 = path[i]
            token2 = path[i+1]
            dependency_list.append(token2.features['dependency_from'])
            dependency_list.append(token1.features['dependency_from'])

        for dependency in dependency_list:
            feature_name = '61_dep_'+dependency[1]+'_[0]'
            self.add_to_feature_set(edge, feature_name)
            feature_name = '62_txt_'+dependency[0].masked_text(edge.part)+'_[0]'
            self.add_to_feature_set(edge, feature_name)
            feature_name = '63_pos_'+dependency[0].features['pos']+'_[0]'
            self.add_to_feature_set(edge, feature_name)

            token1 = dependency[0]
            ann_types_1 = self.token_feature_generator.annotated_types(token1, edge)
            for ann in ann_types_1:
                feature_name = '64_ann_type_'+ann+'_[0]'
                self.add_to_feature_set(edge, feature_name)

            g_text = dependency[0].masked_text(edge.part)
            g_pos = dependency[0].features['pos']
            g_at = 'no_ann_type'

            for dep in dependency[0].features['dependency_to']:
                feature_name = '65_'+dep[1]+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '66_txt_'+dep[0].masked_text(edge.part)+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '67_pos_'+dep[0].features['pos']+'_[0]'
                self.add_to_feature_set(edge, feature_name)

                token2 = dep[0]
                ann_types_2 = self.token_feature_generator.annotated_types(token2, edge)
                for ann in ann_types_2:
                    feature_name = '68_ann_type_'+ann+'_[0]'
                    self.add_to_feature_set(edge, feature_name)

                d_text = token2.masked_text(edge.part)
                d_pos = token2.features['pos']
                d_at = 'no_ann_type'

                feature_name = '69_gov_'+g_text+'_'+d_text+'_[0]'
                self.add_to_feature_set(edge, feature_name)

                feature_name = '70_gov_'+g_pos+'_'+d_pos+'_[0]'
                self.add_to_feature_set(edge, feature_name)

                for ann1 in ann_types_1:
                    for ann2 in ann_types_2:
                        feature_name = '71_gov_'+ann1+'_'+ann2+'_[0]'
                        self.add_to_feature_set(edge, feature_name)

                for ann1 in ann_types_1:
                    feature_name = '72_triple_'+ann1+'_'+dependency[1]+'_'+d_at+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
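
The dir_grams string built in path_grams records, hop by hop, whether the walk follows a dependency edge from dependent to governor ('F') or runs against it ('R'), and features 57-59 are emitted over sliding n-character windows of that string. A minimal sketch of just the windowing, with a plain string standing in for the walk structures:

def direction_ngrams(dir_grams, n):
    """All n-character windows of a direction string such as 'FFR'."""
    return [dir_grams[j-n+1:j+1] for j in range(n-1, len(dir_grams))]

# direction string for a four-token path: forward, forward, reverse
print(direction_ngrams('FFR', 2))  # ['FF', 'FR']
print(direction_ngrams('FFR', 3))  # ['FFR']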