def __init__(self, k_param, prime_feature_length=PRIME_FEATURE_LENGTH, add_prime_feature=False):
    """Configure the POS-context sequence feature extractor.

    params:
        k_param: context window size (number of tokens around the head NP)
        prime_feature_length: length of the POS-tag window that forms the
            prime feature -- default comes from PRIME_FEATURE_LENGTH
        add_prime_feature: when True, prime features are generated
    """
    self.k_param = k_param
    self.prime_feature_length = prime_feature_length
    self.add_prime_feature = add_prime_feature
    # trace the flag value so different runs can be told apart in the logs
    print >> sys.stderr, " add_prime_feature:", self.add_prime_feature
    self.debugFlag = 0  # debug tracing off by default; set to 1 to enable
    self.english_filter = EnglishTokenFilter()
    # NOTE(review): removed a dead trailing `pass` statement; no behavior change
def __init__(self, k_param, prime_feature_length=PRIME_FEATURE_LENGTH, add_prime_feature=False):
    """Configure the Malt-parsed POS-context sequence feature extractor.

    params:
        k_param: context window size (number of tokens around the head NP)
        prime_feature_length: length of the POS-tag window that forms the
            prime feature -- default comes from PRIME_FEATURE_LENGTH
        add_prime_feature: when True, prime features are generated
    """
    self.k_param = k_param
    self.prime_feature_length = prime_feature_length
    self.add_prime_feature = add_prime_feature
    self.debugFlag = 0  # debug tracing off by default; set to 1 to enable
    self.english_filter = EnglishTokenFilter()
    # bigram lexicalized feature extractor used alongside the POS features
    self.lex_fe = MaltParsedLexicalizedNgramsFeatureExtractor(n=2)
    # NOTE(review): removed a dead trailing `pass` statement; no behavior change
class MaltParsedPOSContextSequenceFeatureExtractor(BaseFeatureExtractor):
    """POS-context sequence features computed from a Malt-parsed sentence.

    Unlike POSContextSequenceFeatureExtractor, POS tags are taken from the
    Malt parse (self.word_pos_tuple_collection, copied from
    MaltParsedSentenceFeatureExtractor in extract_features) rather than
    from nltk.pos_tag.

    NOTE(review): this module is Python 2 (print statements, integer
    division); index arithmetic below relies on Py2 `/` semantics.
    """

    # params:
    #   k_param: context window size (number of tokens around the head NP)
    #   prime_feature_length: length of the POS-tag window that forms the
    #       "prime_feature" value
    #   add_prime_feature: whether to emit prime_feature/beg_prime/end_prime
    def __init__(self, k_param, prime_feature_length=PRIME_FEATURE_LENGTH, add_prime_feature=False):
        self.k_param = k_param
        self.prime_feature_length = prime_feature_length
        self.add_prime_feature = add_prime_feature
        self.debugFlag = 0  # off by default
        self.english_filter = EnglishTokenFilter()
        # bigram lexicalized extractor; only referenced by commented-out code below
        self.lex_fe = MaltParsedLexicalizedNgramsFeatureExtractor(n=2)
        pass

    def debug(self, s):
        # Print s only when debugging is switched on (debugFlag == 1).
        # NOTE(review): the `pass` is dead code kept verbatim; the sibling
        # POSContextSequenceFeatureExtractor.debug has the same body without it.
        if self.debugFlag == 1:
            pass
            print s

    # method that generates sequence model considering all words in the sentence
    # params- result_tuple with category, word, tokens
    def getFullSentenceSequenceModel(self, result_tuple):
        """Return {"W<i>": <pos_tag_i>} for every token of the sentence."""
        category, word, tokens, sentence, _old_word = result_tuple
        # (word, pos) pairs copied from the Malt parse in extract_features
        pos_tags = self.word_pos_tuple_collection
        feature_dict = {}
        for i, tpl in enumerate(pos_tags):
            key = "W" + str(i)
            wrd, p_tg = tpl
            val = p_tg
            feature_dict[key] = val
        return feature_dict

    # method that generates sequence model considering only k words from
    # beginning of the line
    # params- result_tuple with category, word, tokens
    # returns a dictionary with features
    def getKSequenceModel(self, result_tuple):
        """Return {"W<i>": <pos_tag_i>} for the first k_param+1 tokens."""
        category, word, tokens, sentence, _old_word = result_tuple
        pos_tags = self.word_pos_tuple_collection
        feature_dict = {}
        for i, tpl in enumerate(pos_tags):
            # read k features- then break
            # NOTE(review): `>` (not `>=`) means k_param + 1 tokens are kept
            if i > self.k_param:
                break
            key = "W" + str(i)
            wrd, p_tg = tpl
            val = p_tg
            feature_dict[key] = val
        return feature_dict

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in first half of the sentence
    def getIndicesFirstHalf(self, index, num_of_tokens):
        """Return (start_index, end_index) of a k_param-wide window at/after index.

        Short sentences (fewer than k_param tokens) get the whole range.
        """
        self.debug("num_of_tokens:" + str(num_of_tokens))
        self.debug("index got in getIndicesFirstHalf:" + str(index))
        self.debug("k_param:" + str(self.k_param))
        if num_of_tokens < self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index, end_index)
        # slide the window left until k_param tokens fit to the right of index
        while ((num_of_tokens) - index) < self.k_param:
            index = index - 1
        start_index = index
        end_index = index + self.k_param
        return (start_index, end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in second half of the sentence
    def getIndicesSecondHalf(self, index, num_of_tokens):
        """Return (start_index, end_index) of a k_param-wide window ending at index."""
        if num_of_tokens < self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index, end_index)
        # slide the window right until k_param tokens fit to the left of index
        while (index) < self.k_param:
            index = index + 1
        start_index = index - self.k_param + 1
        end_index = index + 1
        return (start_index, end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies at middle of the sentence
    def getIndicesForMiddleWord(self, index, num_of_tokens):
        """Return (start_index, end_index) of a window centered on index.

        NOTE(review): Py2 integer division; both branches compute the same
        start_index -- only end_index differs for odd k_param.
        """
        if num_of_tokens < self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index, end_index)
        if self.k_param % 2 == 0:
            start_index = index - (self.k_param / 2)
            end_index = index + (self.k_param / 2)
        else:
            start_index = index - (self.k_param / 2)
            end_index = index + (self.k_param / 2) + 1
        return (start_index, end_index)

    # method that reads tokens and returns feature_dict
    def getSequenceModelForIndexRange(self, result_tuple, index, start_index, end_index):
        """Build the prime-feature dict for the window [start_index, end_index).

        The head-NP position `index` is tagged "HNP" instead of its POS tag.
        Returns {} when add_prime_feature is False.
        """
        category, word, tokens, sentence, _old_word = result_tuple
        pos_tags = self.word_pos_tuple_collection
        self.debug(pos_tags)
        feature_dict = {}
        # (commented-out per-token W-k / W0 / W+k context features removed;
        # see version control history)
        # adding the prime feature if the add_prime_featue flag is set to True
        if self.add_prime_feature:
            print >> sys.stderr, " pos_tags:", pos_tags
            self.debug("got features, looking for prime_feature")
            key = "prime_feature"
            val = ""
            if start_index == index:
                # head NP opens the window: read forward from index
                for i in range(index, min(len(pos_tags), index + self.prime_feature_length)):
                    wrd, p_tg = pos_tags[i]
                    # if curr token is head NP use HNP as pos tag
                    if i == index:
                        p_tg = "HNP"
                    val = val + p_tg + " "
            elif end_index - 1 == index:
                # head NP closes the window: read backward-anchored range up to index
                self.debug("STR:" + str(index - self.prime_feature_length) + " END:" + str(index + 1))
                for i in range(max(0, index - self.prime_feature_length), index + 1):
                    wrd, p_tg = pos_tags[i]
                    # if curr token is head NP use HNP as pos tag
                    if i == index:
                        p_tg = "HNP"
                    val = val + p_tg + " "
            else:
                # head NP in the middle: shrink the window symmetrically
                # until it is narrower than prime_feature_length
                while (end_index - start_index >= self.prime_feature_length):
                    start_index = start_index + 1
                    end_index = end_index - 1
                self.debug("MIDDLE_start_index:" + str(start_index) + " end_index:" + str(end_index))
                for i in range(start_index, end_index):
                    wrd, p_tg = pos_tags[i]
                    # if curr token is head NP use HNP as pos tag
                    if i == index:
                        p_tg = "HNP"
                    val = val + p_tg + " "
            feature_dict[key] = val.strip()
            print >> sys.stderr, " start_index:", start_index, " end_index:", end_index
            print >> sys.stderr, " feature_dict so far: ", feature_dict
            # adding prime feature for beginning and ending of the sentence as well
            # for beginning
            beg_pattern = " "
            for k in range(0, min(self.prime_feature_length, len(pos_tags))):
                wrd, p_tg = pos_tags[k]
                # if curr token is head NP use HNP as pos tag
                if k == index:
                    p_tg = "HNP"
                beg_pattern = beg_pattern + p_tg + " "
            feature_dict["beg_prime"] = beg_pattern.strip()
            # for ending
            end_pattern = " "
            for k in range(max(0, len(pos_tags) - self.prime_feature_length), len(pos_tags)):
                wrd, p_tg = pos_tags[k]
                # if curr token is head NP use HNP as pos tag
                if k == index:
                    p_tg = "HNP"
                end_pattern = end_pattern + p_tg + " "
            feature_dict["end_prime"] = end_pattern.strip()
        return feature_dict

    # method that takes instance as input and returns feature vector and
    # optional category label
    # params: instance of format- <category> '<instance_name> | <instance>
    # returns: a tuple of (<feature_dict>, <category>, <word>, <sentence>) or
    # a list of such tuples
    def extract_features(self, instance):
        _sentence_feature_extractor = MaltParsedSentenceFeatureExtractor()
        result_tuple = _sentence_feature_extractor.extract_features(instance)
        # copying attributes
        self.token_collection = _sentence_feature_extractor.get_token_collection()
        self.word_pos_tuple_collection = _sentence_feature_extractor.get_word_pos_tuple_collection()
        # malformed instance
        # NOTE(review): 3-tuple here, while _get_seq_model returns 4-tuples --
        # callers must cope with both arities
        if result_tuple is None:
            return (None, None, None)
        # call _get_seq_model after copying attributes from MaltParsedFeatureExtactor
        # depending on if result_tuple is list or single tuple, take action
        if isinstance(result_tuple, list):
            result_list = []
            for _item in result_tuple:
                result_item = self._get_seq_model(_item)
                result_list.append(result_item)
            return result_list
        else:
            result = self._get_seq_model(result_tuple)
            return result

    # method to get the sequence model for malt parsed version of instance
    # returns a tuple: (feature_dict,category,word,sentence)
    def _get_seq_model(self, result_tuple):
        category, word, tokens, sentence, _old_word = result_tuple
        # if word is non english token then return None
        if not self.english_filter.filter(word):
            return (None, None, None, None)
        # if using test instance
        if category is None and word is None:
            if self.k_param == KPARAM:
                feature_dict = self.getFullSentenceSequenceModel(result_tuple)
            else:
                feature_dict = self.getKSequenceModel(result_tuple)
            return (feature_dict, None, None, None)
        tokens_set = set(tokens)
        num_of_tokens = len(tokens)
        # if sentence contains the NP
        if word.lower() in sentence.lower():
            # if the word is not present as complete word in token list 'tokens'
            if tokens.count(word) == 0:
                # iterate over tokens
                for token_index, token in enumerate(tokens):
                    # check if word is part of any token
                    if word in token:
                        index = token_index
                        break  # found the token containing this word
            else:
                # pick the first index of 'word' in token list 'tokens'
                index = tokens.index(word)
            # word lies in first half of the sentence
            # NOTE(review): Py2 integer division for the half-point tests below
            if index < num_of_tokens / 2:
                self.debug("lies in lower half")
                start_index, end_index = self.getIndicesFirstHalf(index, num_of_tokens)
                self.debug("start_index:" + str(start_index) + " end_index:" + str(end_index))
            # word lies in second half of the sentence
            if index > num_of_tokens / 2:
                start_index, end_index = self.getIndicesSecondHalf(index, num_of_tokens)
                self.debug("lies in second half")
                self.debug("start_index:" + str(start_index) + " end_index:" + str(end_index))
            # word lies in middle of the sentence
            if index == num_of_tokens / 2:
                start_index, end_index = self.getIndicesForMiddleWord(index, num_of_tokens)
                self.debug("lies at the middle")
                self.debug("start_index:" + str(start_index) + " end_index:" + str(end_index))
            # get sequence model for tokens in give index range
            feature_dict = self.getSequenceModelForIndexRange(result_tuple, index, start_index, end_index)
            # update feature for multi-word NP
            # feature_dict = self.update_feature_dict(feature_dict,word,_old_word,index)
        else:
            # sentence doesn't contains the head NP
            # NOTE(review): feature_dict is never assigned on this path, so the
            # return below raises UnboundLocalError -- confirm callers never
            # reach here, or initialize feature_dict = {} before the branch
            pass
        if not category is None:
            category = category.strip()
        # get the lexicalized features
        # lex_features, lex_category, lex_word = self.lex_fe._get_lexicalize_features(result_tuple)
        # update the feature dict
        # feature_dict.update(lex_features)
        return (feature_dict, category, word, sentence)

    # # method to place feature value for W0 as NP if its a multiword NP
    # def update_feature_dict(self, feature_dict, word, old_word`,index):
    #     if word!=old_word:
    #         _tag = "W"+str(index)
    #         feature_dict[_tag] = "NN"
    #     return feature_dict

# usage example (kept for reference):
# m = MaltParsedPOSContextSequenceFeatureExtractor(k_param=4)
# m.extract_features("Guests/NNS invited/VBD to/TO the/DT presidential/JJ ball/NN will/MD have/VB to/TO wear/VB tuxedoes/NNS ./. 1")
def __init__(self, slice_window_length=SLICE_WINDOW_LENGTH):
    """Configure the lexicalized-sequence feature extractor.

    params:
        slice_window_length: number of tokens in the lexical pattern window
            anchored on the head NP -- default comes from SLICE_WINDOW_LENGTH
    """
    self.slice_window_length = slice_window_length
    self.english_filter = EnglishTokenFilter()
    # POS tags kept verbatim inside lexical patterns; tokens with any other
    # tag contribute the word itself instead
    self.pos_list = ["DT", "JJ", "NN", "NNP", "NNS"]
    # NOTE(review): removed a dead trailing `pass` statement; no behavior change
class MaltParsedLexicalizedSequenceFeatureExtractor(BaseFeatureExtractor):
    """Lexicalized HNP-anchored patterns from a Malt-parsed sentence.

    Produces "hnp_begin" / "hnp_end" features: mixed POS-tag/word patterns
    of slice_window_length tokens starting or ending at the head NP, with
    the head NP itself rendered as "HNP".
    """

    # params:
    # k_param: the length n in n-grams to be considered in the context window
    # prime_feature_length: the length of context window pos tags of words
    # falling in that window will constitute the prime feature
    # add_prime_feature: boolean flag to indicate whether to generate prime
    # feature
    # NOTE(review): the comment block above describes a different class's
    # parameters; this __init__ only takes slice_window_length
    def __init__(self, slice_window_length = SLICE_WINDOW_LENGTH):
        self.slice_window_length = slice_window_length
        self.english_filter = EnglishTokenFilter()
        # tags kept verbatim in lexical patterns; other tags use the word itself
        self.pos_list = ["DT","JJ","NN","NNP","NNS"]
        pass

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in first half of the sentence
    # NOTE(review): this class never sets self.k_param, so calling this (or
    # the two methods below) raises AttributeError -- apparently copied from
    # the POS-context extractor and dead here
    def getIndicesFirstHalf(self,index,num_of_tokens):
        if num_of_tokens<self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index,end_index)
        while( (num_of_tokens)-index)< self.k_param:
            index = index -1
        start_index = index
        end_index = index + self.k_param
        return (start_index,end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in second half of the sentence
    # NOTE(review): dead -- see getIndicesFirstHalf note on self.k_param
    def getIndicesSecondHalf(self,index,num_of_tokens):
        if num_of_tokens<self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index,end_index)
        while(index)< self.k_param:
            index = index +1
        start_index = index-self.k_param+1
        end_index = index+1
        return (start_index,end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies at middle of the sentence
    # NOTE(review): dead -- see getIndicesFirstHalf note on self.k_param
    def getIndicesForMiddleWord(self,index,num_of_tokens):
        if num_of_tokens<self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index,end_index)
        if self.k_param % 2==0:
            start_index = index -(self.k_param/2)
            end_index = index + (self.k_param/2)
        else:
            start_index = index -(self.k_param/2)
            end_index = index + (self.k_param/2)+1
        return (start_index,end_index)

    # method that reads tokens and returns feature_dict
    # params: index - position of head NP in the sentence
    # result_tuple - tuple consisting of category, word, tokens, sentence and
    # old word
    def getLexicalFeaturesForIndexRange(self,result_tuple,index):
        """Build "hnp_begin"/"hnp_end" lexical patterns around position index.

        POS tags in self.pos_list are emitted as tags; other tokens are
        emitted as the word itself; the head NP is emitted as "HNP".
        """
        category,word,tokens,sentence,_old_word = result_tuple
        # (word, pos) pairs copied from the Malt parse in extract_features
        pos_tags = self.word_pos_tuple_collection
        feature_dict = {}
        # adding prime feature for beginning and ending of the sentence as well
        # for lexical pattern beginning with HNP
        _pattern = ""
        for k in range(index, min(index+self.slice_window_length,len(pos_tags))):
            wrd,p_tg = pos_tags[k]
            # if curr token is head NP use HNP as pos tag
            if k==index:
                p_tg = "HNP"
                _pattern = _pattern + p_tg+" "
            elif p_tg in self.pos_list:
                _pattern = _pattern + p_tg+" "
            else:
                # use word itself
                _pattern = _pattern + wrd +" "
        # if HNP is last token in sentence then don't add hnp_begin
        if _pattern.strip() != "HNP":
            feature_dict["hnp_begin"] = _pattern.strip()
        # for lexical pattern ending with HNP
        _pattern = " "
        for k in range(max(0,index-(self.slice_window_length-1)), min(index+1,len(pos_tags))):
            wrd,p_tg=pos_tags[k]
            if k==index:
                p_tg = "HNP"
                _pattern = _pattern + p_tg+" "
            elif p_tg in self.pos_list:
                _pattern = _pattern + p_tg+" "
            else:
                # use lexical feature
                _pattern = _pattern + wrd +" "
        # if HNP is first token in sentence then don't add hnp_end
        if _pattern.strip() != "HNP":
            feature_dict["hnp_end"] = _pattern.strip()
        return feature_dict

    # method that takes instance as input and returns feature vector and
    # optional category label
    # params: instance of format- <category> '<instance_name> | <instance>
    # returns: a tuple of (<feature_dict>, <category>, <word>, <sentence>) or
    # a list of such tuples
    def extract_features(self, instance):
        _sentence_feature_extractor = MaltParsedSentenceFeatureExtractor()
        result_tuple = _sentence_feature_extractor.extract_features(instance)
        # copying attributes
        self.token_collection = \
            _sentence_feature_extractor.get_token_collection()
        self.word_pos_tuple_collection = \
            _sentence_feature_extractor.get_word_pos_tuple_collection()
        # malformed instance
        # NOTE(review): 3-tuple here vs 4-tuples elsewhere -- callers must
        # cope with both arities
        if result_tuple is None:
            return (None,None,None)
        # call _get_seq_model after copying attributes from
        # MaltParsedFeatureExtactor
        # depending on if result_tuple is list or single tuple, take action
        if isinstance(result_tuple,list):
            result_list = []
            for _item in result_tuple:
                result_item = self._get_lexicalized_seq_model(_item)
                result_list.append(result_item)
            return result_list
        else:
            result = self._get_lexicalized_seq_model(result_tuple)
            return result

    # method to get the sequence model for malt parsed version of instance
    # returns a tuple: (feature_dict,category,word,sentence)
    def _get_lexicalized_seq_model(self, result_tuple):
        category,word,tokens,sentence,_old_word = result_tuple
        # if word is non english token then return None
        if not self.english_filter.filter(word):
            return (None,None,None, None)
        tokens_set = set(tokens)
        num_of_tokens = len(tokens)
        # if sentence contains the NP
        if word.lower() in sentence.lower():
            # if the word is not present as complete word in token list 'tokens'
            if tokens.count(word)==0:
                # iterate over tokens
                for token_index,token in enumerate(tokens):
                    # check if word is part of any token
                    if word in token:
                        index=token_index
                        # found the token containing this word
                        break
            else:
                # pick the first index of 'word' in token list 'tokens'
                index = tokens.index(word)
            # get sequence model for tokens in give index range
            feature_dict = \
                self.getLexicalFeaturesForIndexRange(result_tuple,index)
        else:
            # sentence doesn't contains the head NP
            # NOTE(review): feature_dict is never assigned on this path, so
            # the return below raises UnboundLocalError -- confirm callers
            # never reach here
            pass
        if not category is None:
            category = category.strip()
        return (feature_dict,category,word,sentence)
def __init__(self, slice_window_length=SLICE_WINDOW_LENGTH):
    """Configure the lexical-sequence feature extractor.

    params:
        slice_window_length: number of tokens in the lexical pattern window
            anchored on the head NP -- default comes from SLICE_WINDOW_LENGTH
    """
    self.slice_window_length = slice_window_length
    self.english_filter = EnglishTokenFilter()
    # POS tags kept verbatim inside lexical patterns; tokens with any other
    # tag contribute the word itself instead
    self.pos_list = ["DT", "JJ", "NN", "NNP", "NNS"]
    # NOTE(review): removed a dead trailing `pass` statement; no behavior change
class LexicalSequenceFeatureExtractor(BaseFeatureExtractor):
    """Lexicalized HNP-anchored patterns using nltk.pos_tag.

    Same "hnp_begin"/"hnp_end" features as the Malt-parsed variant, but POS
    tags come from nltk.pos_tag(tokens) and instances are tokenized by
    SentenceTokensFeatureExtractor.
    """

    # params:
    # k_param: the length n in n-grams to be considered in the context window
    # prime_feature_length: the length of context window pos tags of words
    # falling in that window will constitute the prime feature
    # add_prime_feature: boolean flag to indicate whether to generate prime
    # feature
    # NOTE(review): the comment block above describes a different class's
    # parameters; this __init__ only takes slice_window_length
    def __init__(self, slice_window_length=SLICE_WINDOW_LENGTH):
        self.slice_window_length = slice_window_length
        self.english_filter = EnglishTokenFilter()
        # tags kept verbatim in lexical patterns; other tags use the word itself
        self.pos_list = ["DT", "JJ", "NN", "NNP", "NNS"]
        pass

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in first half of the sentence
    # NOTE(review): this class never sets self.k_param, so calling this (or
    # the two methods below) raises AttributeError -- apparently copied from
    # the POS-context extractor and dead here
    def getIndicesFirstHalf(self, index, num_of_tokens):
        if num_of_tokens < self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index, end_index)
        while ((num_of_tokens) - index) < self.k_param:
            index = index - 1
        start_index = index
        end_index = index + self.k_param
        return (start_index, end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in second half of the sentence
    # NOTE(review): dead -- see getIndicesFirstHalf note on self.k_param
    def getIndicesSecondHalf(self, index, num_of_tokens):
        if num_of_tokens < self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index, end_index)
        while (index) < self.k_param:
            index = index + 1
        start_index = index - self.k_param + 1
        end_index = index + 1
        return (start_index, end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies at middle of the sentence
    # NOTE(review): dead -- see getIndicesFirstHalf note on self.k_param
    def getIndicesForMiddleWord(self, index, num_of_tokens):
        if num_of_tokens < self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index, end_index)
        if self.k_param % 2 == 0:
            start_index = index - (self.k_param / 2)
            end_index = index + (self.k_param / 2)
        else:
            start_index = index - (self.k_param / 2)
            end_index = index + (self.k_param / 2) + 1
        return (start_index, end_index)

    # method that reads tokens and returns feature_dict
    # params: index - position of head NP in the sentence
    # result_tuple - tuple consisting of category, word, tokens, sentence and
    # old word
    def getLexicalFeaturesForIndexRange(self, result_tuple, index):
        """Build "hnp_begin"/"hnp_end" lexical patterns around position index.

        POS tags in self.pos_list are emitted as tags; other tokens are
        emitted as the word itself; the head NP is emitted as "HNP".
        """
        category, word, tokens, sentence, _old_word = result_tuple
        # tags computed here (no Malt parse available in this variant)
        pos_tags = nltk.pos_tag(tokens)
        feature_dict = {}
        # adding prime feature for beginning and ending of the sentence as well
        # for lexical pattern beginning with HNP
        _pattern = ""
        for k in range(index, min(index + self.slice_window_length, len(pos_tags))):
            wrd, p_tg = pos_tags[k]
            # if curr token is head NP use HNP as pos tag
            if k == index:
                p_tg = "HNP"
                _pattern = _pattern + p_tg + " "
            elif p_tg in self.pos_list:
                _pattern = _pattern + p_tg + " "
            else:
                # use word itself
                _pattern = _pattern + wrd + " "
        # if HNP is last token in sentence then don't add hnp_begin
        if _pattern.strip() != "HNP":
            feature_dict["hnp_begin"] = _pattern.strip()
        # for lexical pattern ending with HNP
        _pattern = " "
        for k in range(max(0, index - (self.slice_window_length - 1)), min(index + 1, len(pos_tags))):
            wrd, p_tg = pos_tags[k]
            if k == index:
                p_tg = "HNP"
                _pattern = _pattern + p_tg + " "
            elif p_tg in self.pos_list:
                _pattern = _pattern + p_tg + " "
            else:
                # use lexical feature
                _pattern = _pattern + wrd + " "
        # if HNP is first token in sentence then don't add hnp_end
        if _pattern.strip() != "HNP":
            feature_dict["hnp_end"] = _pattern.strip()
        return feature_dict

    # method that takes instance as input and returns feature vector and
    # optional category label
    # params: instance of format- <category> '<instance_name> | <instance>
    # returns: a tuple of (<feature_dict>, <category>, <word>, <sentence>)
    # or a list of such tuples
    def extract_features(self, instance):
        _sentence_feature_extractor = SentenceTokensFeatureExtractor()
        result_tuple = _sentence_feature_extractor.extract_features(instance)
        # malformed instance
        if result_tuple is None:
            return (None, None, None, None)
        category, word, tokens, sentence, _old_word = result_tuple
        # if word is non english token then return None
        if not self.english_filter.filter(word):
            return (None, None, None, None)
        tokens_set = set(tokens)
        num_of_tokens = len(tokens)
        # if sentence contains the NP
        if word.lower() in sentence.lower():
            # NOTE(review): under Python 2 this print() call prints a tuple
            # including the file object -- presumably `print >> sys.stderr`
            # was intended (same for the two prints below)
            print(sys.stderr, "word:", word, " sentence:", sentence)
            if tokens.count(word) == 0:
                # iterate over tokens
                for token_index, token in enumerate(tokens):
                    # check if word is part of any token
                    if word in token:
                        index = token_index
                        print(sys.stderr, "containing word found at index :", str(index))
                        # found the token containing this word
                        break
            else:
                # pick the first index of 'word' in token list 'tokens'
                index = tokens.index(word)
                print(sys.stderr, "exact word found at index :", str(index))
            # get sequence model for tokens in give index range
            feature_dict = self.getLexicalFeaturesForIndexRange(
                result_tuple, index)
        else:
            # sentence doesn't contains the head NP
            # just ignore such setences
            # NOTE(review): feature_dict is never assigned on this path, so
            # the return below raises UnboundLocalError -- confirm callers
            # never reach here
            pass
        if not category is None:
            category = category.strip()
        tpl = (feature_dict, category, word, sentence)
        print >> sys.stderr, " lexical_fe returning :", tpl
        return tpl
class POSContextSequenceFeatureExtractor(BaseFeatureExtractor):
    """POS-context sequence features using nltk.pos_tag.

    Counterpart of MaltParsedPOSContextSequenceFeatureExtractor that tags
    tokens with nltk.pos_tag and tokenizes instances with
    SentenceTokensFeatureExtractor.

    NOTE(review): this module is Python 2 (print statements, integer
    division); index arithmetic below relies on Py2 `/` semantics.
    """

    # params:
    # k_param: the length n in n-grams to be considered in the context window
    # prime_feature_length: the length of context window pos tags of words
    # falling in that window will constitute the prime feature
    # add_prime_feature: boolean flag to indicate whether to generate prime
    # feature
    def __init__(self, k_param, prime_feature_length=PRIME_FEATURE_LENGTH, add_prime_feature=False):
        self.k_param = k_param
        self.prime_feature_length = prime_feature_length
        self.add_prime_feature = add_prime_feature
        # trace the flag value so runs can be told apart in the logs
        print>>sys.stderr," add_prime_feature:",self.add_prime_feature
        self.debugFlag = 0 # off by default
        self.english_filter = EnglishTokenFilter()
        pass

    def debug(self,s):
        # print s only when debugging is switched on (debugFlag == 1)
        if self.debugFlag==1:
            print s

    # method that generates sequence model considering all words in the sentence
    # params- result_tuple with category, word, tokens
    def getFullSentenceSequenceModel(self, result_tuple):
        """Return {"W<i>": <pos_tag_i>} for every token of the sentence."""
        category,word,tokens,sentence,_old_word = result_tuple
        pos_tags = nltk.pos_tag(tokens)
        feature_dict = {}
        for i,tpl in enumerate(pos_tags):
            key ="W"+str(i)
            wrd,p_tg = tpl
            val = p_tg
            feature_dict[key] = val
        return feature_dict

    # method that generates sequence model considering only k words from
    # beginning of the line params- result_tuple with category, word, tokens
    # returns a dictionary with features
    def getKSequenceModel(self, result_tuple):
        """Return {"W<i>": <pos_tag_i>} for the first k_param+1 tokens."""
        category,word,tokens,sentence,_old_word = result_tuple
        pos_tags = nltk.pos_tag(tokens)
        feature_dict = {}
        for i,tpl in enumerate(pos_tags):
            # read k features- then break
            # NOTE(review): `>` (not `>=`) means k_param + 1 tokens are kept
            if i>self.k_param:
                break
            key ="W"+str(i)
            wrd,p_tg = tpl
            val = p_tg
            feature_dict[key] = val
        return feature_dict

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in first half of the sentence
    def getIndicesFirstHalf(self,index,num_of_tokens):
        """Return (start_index, end_index) of a k_param-wide window at/after index."""
        self.debug("num_of_tokens:"+str(num_of_tokens))
        self.debug("index got in getIndicesFirstHalf:"+str(index))
        self.debug("k_param:"+str(self.k_param))
        if num_of_tokens<self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index,end_index)
        # slide the window left until k_param tokens fit to the right of index
        while( (num_of_tokens)-index)< self.k_param:
            index = index -1
        start_index = index
        end_index = index + self.k_param
        return (start_index,end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies in second half of the sentence
    def getIndicesSecondHalf(self,index,num_of_tokens):
        """Return (start_index, end_index) of a k_param-wide window ending at index."""
        if num_of_tokens<self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index,end_index)
        # slide the window right until k_param tokens fit to the left of index
        while(index)< self.k_param:
            index = index +1
        start_index = index-self.k_param+1
        end_index = index+1
        return (start_index,end_index)

    # method that gives start and end index of tokens to be considered for
    # sequence model if NP lies at middle of the sentence
    def getIndicesForMiddleWord(self,index,num_of_tokens):
        """Return (start_index, end_index) of a window centered on index.

        NOTE(review): Py2 integer division; both branches compute the same
        start_index -- only end_index differs for odd k_param.
        """
        if num_of_tokens<self.k_param:
            start_index = 0
            end_index = num_of_tokens - 1
            return (start_index,end_index)
        if self.k_param % 2==0:
            start_index = index -(self.k_param/2)
            end_index = index + (self.k_param/2)
        else:
            start_index = index -(self.k_param/2)
            end_index = index + (self.k_param/2)+1
        return (start_index,end_index)

    # method that reads tokens and returns feature_dict
    def getSequenceModelForIndexRange(self,result_tuple,index,start_index, end_index):
        """Build the prime-feature dict for the window [start_index, end_index).

        The head-NP position `index` is tagged "HNP" instead of its POS tag.
        Returns {} when add_prime_feature is False.
        """
        category,word,tokens,sentence,_old_word = result_tuple
        pos_tags = nltk.pos_tag(tokens)
        self.debug(pos_tags)
        feature_dict = {}
        # (commented-out per-token W-k / W0 / W+k context features removed;
        # see version control history)
        # adding the prime feature if the add_prime_featue flag is set to True
        if self.add_prime_feature:
            self.debug("got features, looking for prime_feature")
            key="prime_feature"
            val=""
            if start_index==index:
                # head NP opens the window: read forward from index
                for i in range(index,min(len(pos_tags),index+ self.prime_feature_length)):
                    wrd,p_tg=pos_tags[i]
                    # if curr token is head NP use HNP as pos tag
                    if i==index:
                        p_tg = "HNP"
                    val=val+p_tg+" "
            elif end_index-1==index:
                # head NP closes the window: read backward-anchored range up to index
                self.debug("STR:"+str(index-self.prime_feature_length)+ " END:"+str(index+1))
                for i in range(max(0,index-self.prime_feature_length),index+1):
                    wrd,p_tg=pos_tags[i]
                    # if curr token is head NP use HNP as pos tag
                    if i==index:
                        p_tg = "HNP"
                    val=val+p_tg+" "
            else:
                # head NP in the middle: shrink the window symmetrically
                # until it is narrower than prime_feature_length
                while(end_index-start_index>=self.prime_feature_length):
                    start_index=start_index+1
                    end_index=end_index-1
                self.debug("MIDDLE_start_index:"+str(start_index)+ " end_index:"+str(end_index))
                for i in range(start_index,end_index):
                    wrd,p_tg=pos_tags[i]
                    # if curr token is head NP use HNP as pos tag
                    if i==index:
                        p_tg = "HNP"
                    val=val+p_tg+" "
            feature_dict[key]=val.strip()
            # adding prime feature for beginning and ending of the sentence
            # as well
            # for beginning
            beg_pattern = " "
            for k in range(0, min(self.prime_feature_length,len(pos_tags))):
                wrd,p_tg=pos_tags[k]
                # if curr token is head NP use HNP as pos tag
                if k==index:
                    p_tg = "HNP"
                beg_pattern = beg_pattern + p_tg+" "
            feature_dict["beg_prime"] = beg_pattern.strip()
            # for ending
            end_pattern = " "
            for k in range(max(0,len(pos_tags)-self.prime_feature_length), len(pos_tags)):
                wrd,p_tg=pos_tags[k]
                if k==index:
                    p_tg = "HNP"
                end_pattern = end_pattern + p_tg+" "
            feature_dict["end_prime"] = end_pattern.strip()
        return feature_dict

    # method that takes instance as input and returns feature vector and
    # optional category label
    # params: instance of format- <category> '<instance_name> | <instance>
    # returns: a tuple of (<feature_dict>, <category>, <word>, <sentence>)
    # or a list of such tuples
    def extract_features(self, instance):
        _sentence_feature_extractor = SentenceTokensFeatureExtractor()
        result_tuple = _sentence_feature_extractor.extract_features(instance)
        # malformed instance
        if result_tuple is None:
            return (None,None,None,None)
        category,word,tokens,sentence,_old_word = result_tuple
        # if word is non english token then return None
        if not self.english_filter.filter(word):
            return (None,None,None,None)
        # if using test instance
        if category is None and word is None:
            print>>sys.stderr," category and word is None for :",result_tuple
            if self.k_param==KPARAM:
                feature_dict = self.getFullSentenceSequenceModel(result_tuple)
            else:
                feature_dict = self.getKSequenceModel(result_tuple)
            return (feature_dict,None, None,None)
        tokens_set = set(tokens)
        num_of_tokens = len(tokens)
        # if sentence contains the NP
        if word.lower() in sentence.lower():
            self.debug("word in sentence")
            # NOTE(review): under Python 2 this print() call prints a tuple
            # including the file object -- presumably `print >> sys.stderr`
            # was intended (same for the two prints below)
            print(sys.stderr,"word:",word," sentence:",sentence)
            if tokens.count(word)==0:
                # iterate over tokens
                for token_index,token in enumerate(tokens):
                    # check if word is part of any token
                    if word in token:
                        index=token_index
                        print(sys.stderr,"containing word found at index :",str(index))
                        # found the token containing this word
                        break
            else:
                # pick the first index of 'word' in token list 'tokens'
                index = tokens.index(word)
                print(sys.stderr,"exact word found at index :",str(index))
            # word lies in first half of the sentence
            # NOTE(review): Py2 integer division for the half-point tests below
            if index<num_of_tokens/2:
                self.debug("lies in lower half")
                start_index,end_index = self.getIndicesFirstHalf(index,num_of_tokens)
                self.debug("start_index:"+str(start_index)+" end_index:"+str(end_index))
            # word lies in second half of the sentence
            if index>num_of_tokens/2:
                start_index,end_index = self.getIndicesSecondHalf(index, num_of_tokens)
                self.debug("lies in second half")
                self.debug("start_index:"+str(start_index)+" end_index:"+str(end_index))
            # word lies in middle of the sentence
            if index == num_of_tokens/2:
                start_index,end_index = self.getIndicesForMiddleWord(index,num_of_tokens)
                self.debug("lies at the middle")
                self.debug("start_index:"+str(start_index)+" end_index:"+str(end_index))
            # get sequence model for tokens in give index range
            feature_dict = self.getSequenceModelForIndexRange(result_tuple,index,start_index, end_index)
            # update feature for multiword NP
            # feature_dict = self.update_feature_dict(feature_dict,word,_old_word,index)
        else:
            # sentence doesn't contains the head NP
            # just ignore such setences
            # NOTE(review): feature_dict is never assigned on this path, so
            # the return below raises UnboundLocalError -- confirm callers
            # never reach here, or initialize feature_dict = {} before the branch
            pass
        if not category is None:
            category = category.strip()
        tpl = (feature_dict,category,word,sentence)
        print>>sys.stderr," pos_context_seq_fe returning :",tpl
        return tpl

    # method to place feature value for W0 as NP if its a multiword NP
    # NOTE(review): despite the comment, the key is "W"+str(index), not "W0",
    # and the value written is "NN" -- confirm which was intended
    def update_feature_dict(self, feature_dict, word, old_word,index):
        if word!=old_word:
            _tag = "W"+str(index)
            feature_dict[_tag] = "NN"
        return feature_dict
# --- NOTE(review): whitespace-mangled chunk --------------------------------
# The class below is a collapsed region of the original Python 2 file; the
# original indentation (and therefore statement nesting) is not recoverable
# from this view, so the code is left byte-identical and only this comment is
# added.  Do NOT reformat without consulting the un-mangled original.
#
# MaltParsedPOSContextSequenceFeatureExtractor(BaseFeatureExtractor):
#   Extracts POS-context sequence features around a head NP from Malt-parsed
#   sentences.  Public entry point is `extract_features(instance)`, which:
#     1. runs MaltParsedSentenceFeatureExtractor over the instance,
#     2. copies its token_collection and word_pos_tuple_collection onto self,
#     3. dispatches each result tuple (or each item of a result list) to
#        `_get_seq_model`, returning (feature_dict, category, word, sentence).
#   Helper methods:
#     * __init__(k_param, prime_feature_length=PRIME_FEATURE_LENGTH,
#       add_prime_feature=False) -- stores config; debugFlag starts off (0).
#     * debug(s) -- debug print hook (see defect note below).
#     * getFullSentenceSequenceModel / getKSequenceModel -- map token index i
#       to feature "W<i>" holding the POS tag, over the whole sentence or the
#       first k_param(+1) tokens respectively.
#     * getIndicesFirstHalf / getIndicesSecondHalf / getIndicesForMiddleWord
#       -- compute the (start_index, end_index) token window around the NP;
#       all three clamp to (0, num_of_tokens-1) when the sentence is shorter
#       than k_param.
#     * getSequenceModelForIndexRange -- builds the "prime_feature" POS-tag
#       pattern (head NP tagged "HNP") plus "beg_prime"/"end_prime" patterns
#       over the sentence start/end; the per-position W-/W0/W+ window
#       features are commented out.
#
# Defects / risks flagged for follow-up (NOT fixed here):
#  * `debug()`: whether `print s` is nested under `if self.debugFlag==1:` is
#    not recoverable from this view; if it is outside, debugFlag has no
#    effect -- confirm against the original indentation.
#  * `extract_features` returns a 3-tuple (None,None,None) for a malformed
#    instance, while every `_get_seq_model` failure path returns a 4-tuple;
#    callers unpacking four values will raise ValueError on that path.
#  * `_get_seq_model`: when `word.lower() in sentence.lower()` is False the
#    else-branch only `pass`es, leaving `feature_dict` unbound, so the final
#    `return (feature_dict,category,word,sentence)` raises NameError -- TODO
#    confirm this path is unreachable in practice.
#  * `getKSequenceModel` breaks on `i>self.k_param`, so it emits k_param+1
#    features (indices 0..k_param) -- confirm the off-by-one is intended.
#  * Whether the "beg_prime"/"end_prime" construction is nested inside the
#    `if self.add_prime_feature:` branch of getSequenceModelForIndexRange is
#    not recoverable here; if it is outside, feature_dict can contain only
#    those two keys when the flag is False.
#  * `self.k_param/2` and `num_of_tokens/2` rely on Python 2 integer
#    division; a Python 3 port must use `//`.
#  * The get*Model / getSequenceModelForIndexRange methods read
#    `self.word_pos_tuple_collection`, which is only assigned inside
#    `extract_features` -- calling them directly first raises AttributeError.
#  * `self.lex_fe` is constructed in __init__ but its only use (merging
#    lexicalized features into feature_dict) is commented out.
#  * The trailing `# m = ...` lines are a commented-out manual smoke test.
class MaltParsedPOSContextSequenceFeatureExtractor(BaseFeatureExtractor): def __init__(self, k_param, prime_feature_length=PRIME_FEATURE_LENGTH, add_prime_feature=False): self.k_param = k_param self.prime_feature_length = prime_feature_length self.add_prime_feature = add_prime_feature self.debugFlag = 0 # off by default self.english_filter = EnglishTokenFilter() self.lex_fe = MaltParsedLexicalizedNgramsFeatureExtractor(n=2) pass def debug(self,s): if self.debugFlag==1: pass print s # method that generates sequence model considering all words in the sentence # params- result_tuple with category, word, tokens def getFullSentenceSequenceModel(self, result_tuple): category,word,tokens,sentence,_old_word = result_tuple pos_tags = self.word_pos_tuple_collection feature_dict = {} for i,tpl in enumerate(pos_tags): key ="W"+str(i) wrd,p_tg = tpl val = p_tg feature_dict[key] = val return feature_dict # method that generates sequence model considering only k words from beginning of the line # params- result_tuple with category, word, tokens # returns a dictionary with features def getKSequenceModel(self, result_tuple): category,word,tokens,sentence,_old_word = result_tuple pos_tags = self.word_pos_tuple_collection feature_dict = {} for i,tpl in enumerate(pos_tags): # read k features- then break if i>self.k_param: break key ="W"+str(i) wrd,p_tg = tpl val = p_tg feature_dict[key] = val return feature_dict # method that gives start and end index of tokens to be considered for sequence model if NP lies in first half of the sentence def getIndicesFirstHalf(self,index,num_of_tokens): self.debug("num_of_tokens:"+str(num_of_tokens)) self.debug("index got in getIndicesFirstHalf:"+str(index)) self.debug("k_param:"+str(self.k_param)) if num_of_tokens<self.k_param: start_index = 0 end_index = num_of_tokens - 1 return (start_index,end_index) while( (num_of_tokens)-index)< self.k_param: index = index -1 start_index = index end_index = index + self.k_param return (start_index,end_index) # 
method that gives start and end index of tokens to be considered for sequence model if NP lies in second half of the sentence def getIndicesSecondHalf(self,index,num_of_tokens): if num_of_tokens<self.k_param: start_index = 0 end_index = num_of_tokens - 1 return (start_index,end_index) while(index)< self.k_param: index = index +1 start_index = index-self.k_param+1 end_index = index+1 return (start_index,end_index) # method that gives start and end index of tokens to be considered for sequence model if NP lies at middle of the sentence def getIndicesForMiddleWord(self,index,num_of_tokens): if num_of_tokens<self.k_param: start_index = 0 end_index = num_of_tokens - 1 return (start_index,end_index) if self.k_param % 2==0: start_index = index -(self.k_param/2) end_index = index + (self.k_param/2) else: start_index = index -(self.k_param/2) end_index = index + (self.k_param/2)+1 return (start_index,end_index) # method that reads tokens and returns feature_dict def getSequenceModelForIndexRange(self,result_tuple,index,start_index,end_index): category,word,tokens,sentence,_old_word = result_tuple pos_tags = self.word_pos_tuple_collection self.debug(pos_tags) feature_dict = {} # # add feature for tokens from index to start_index # token_i = 1 # if start_index!=index: # for lower in range(index-1,start_index,-1): # key = "W-"+str(token_i) # wrd,p_tg = pos_tags[lower] # val = p_tg # feature_dict[key] = val # token_i = token_i + 1 # # # add feature for token at index # key = "W0" # wrd,p_tg = pos_tags[index] # val = p_tg # feature_dict[key] = val # # # add feature for tokens from index to end index # token_i = 1 # for upper in range(index+1, end_index): # key = "W+"+str(token_i) # wrd,p_tg = pos_tags[upper] # val = p_tg # feature_dict[key] = val # token_i = token_i + 1 # adding the prime feature if the add_prime_featue flag is set to True if self.add_prime_feature: print>>sys.stderr," pos_tags:",pos_tags self.debug("got features, looking for prime_feature") key="prime_feature" 
val="" if start_index==index: for i in range(index,min(len(pos_tags),index+self.prime_feature_length)): wrd,p_tg=pos_tags[i] # if curr token is head NP use HNP as pos tag if i==index: p_tg = "HNP" val=val+p_tg+" " elif end_index-1==index: self.debug("STR:"+str(index-self.prime_feature_length)+" END:"+str(index+1)) for i in range(max(0,index-self.prime_feature_length),index+1): wrd,p_tg=pos_tags[i] # if curr token is head NP use HNP as pos tag if i==index: p_tg = "HNP" val=val+p_tg+" " else: while(end_index-start_index>=self.prime_feature_length): start_index=start_index+1 end_index=end_index-1 self.debug("MIDDLE_start_index:"+str(start_index)+" end_index:"+str(end_index)) for i in range(start_index,end_index): wrd,p_tg=pos_tags[i] # if curr token is head NP use HNP as pos tag if i==index: p_tg = "HNP" val=val+p_tg+" " feature_dict[key]=val.strip() print>>sys.stderr," start_index:",start_index, " end_index:",end_index print>>sys.stderr," feature_dict so far: ",feature_dict # adding prime feature for beginning and ending of the sentence as well # for beginning beg_pattern = " " for k in range(0, min(self.prime_feature_length,len(pos_tags))): wrd,p_tg=pos_tags[k] # if curr token is head NP use HNP as pos tag if k==index: p_tg = "HNP" beg_pattern = beg_pattern + p_tg+" " feature_dict["beg_prime"] = beg_pattern.strip() # for ending end_pattern = " " for k in range(max(0,len(pos_tags)-self.prime_feature_length), len(pos_tags)): wrd,p_tg=pos_tags[k] # if curr token is head NP use HNP as pos tag if k==index: p_tg = "HNP" end_pattern = end_pattern + p_tg+" " feature_dict["end_prime"] = end_pattern.strip() return feature_dict # method that takes instance as input and returns feature vector and optional category label # params: instance of format- <category> '<instance_name> | <instance> # returns: a tuple of (<feature_dict>, <category>, <word>, <sentence>) or a list of such tuples def extract_features(self, instance): _sentence_feature_extractor = 
MaltParsedSentenceFeatureExtractor() result_tuple = _sentence_feature_extractor.extract_features(instance) # copying attributes self.token_collection = _sentence_feature_extractor.get_token_collection() self.word_pos_tuple_collection = _sentence_feature_extractor.get_word_pos_tuple_collection() # malformed instance if result_tuple is None: return (None,None,None) # call _get_seq_model after copying attributes from MaltParsedFeatureExtactor # depending on if result_tuple is list or single tuple, take action if isinstance(result_tuple,list): result_list = [] for _item in result_tuple: result_item = self._get_seq_model(_item) result_list.append(result_item) return result_list else: result = self._get_seq_model(result_tuple) return result # method to get the sequence model for malt parsed version of instance # returns a tuple: (feature_dict,category,word,sentence) def _get_seq_model(self, result_tuple): category,word,tokens,sentence,_old_word = result_tuple # if word is non english token then return None if not self.english_filter.filter(word): return (None,None,None, None) # if using test instance if category is None and word is None: if self.k_param==KPARAM: feature_dict = self.getFullSentenceSequenceModel(result_tuple) else: feature_dict = self.getKSequenceModel(result_tuple) return (feature_dict,None, None, None) tokens_set = set(tokens) num_of_tokens = len(tokens) # print>>sys.stderr," checking word:",word.lower()," in sentence:",sentence.lower() # if sentence contains the NP if word.lower() in sentence.lower(): # print>>sys.stderr,"word in sentence" # if the word is not present as complete word in token list 'tokens' if tokens.count(word)==0: # iterate over tokens for token_index,token in enumerate(tokens): # check if word is part of any token if word in token: index=token_index # print(sys.stderr,"containing word found at index :",str(index)) break # found the token containing this word else: # pick the first index of 'word' in token list 'tokens' index = 
tokens.index(word) # tokens.index(word) # print(sys.stderr,"exact word found at index :",str(index)) # word lies in first half of the sentence if index<num_of_tokens/2: self.debug("lies in lower half") # print(sys.stderr,"word lies in lower half:going for getIndicesFirstHalf") start_index,end_index = self.getIndicesFirstHalf(index,num_of_tokens) self.debug("start_index:"+str(start_index)+" end_index:"+str(end_index)) # word lies in second half of the sentence if index>num_of_tokens/2: start_index,end_index = self.getIndicesSecondHalf(index, num_of_tokens) self.debug("lies in second half") # print(sys.stderr,"word lies in second half:going for getIndicesSecondHalf") self.debug("start_index:"+str(start_index)+" end_index:"+str(end_index)) # word lies in middle of the sentence if index == num_of_tokens/2: start_index,end_index = self.getIndicesForMiddleWord(index,num_of_tokens) self.debug("lies at the middle") # print(sys.stderr,"word lies in middle :going for getIndicesForMiddleWord") self.debug("start_index:"+str(start_index)+" end_index:"+str(end_index)) # get sequence model for tokens in give index range feature_dict = self.getSequenceModelForIndexRange(result_tuple,index,start_index, end_index) # update feature for multi-word NP # feature_dict = self.update_feature_dict(feature_dict,word,_old_word,index) else: # sentence doesn't contains the head NP # print>>sys.stderr," word:",word.lower()," not present in sentence:",sentence.lower() pass if not category is None: category = category.strip() # get the lexicalized features # lex_features, lex_category, lex_word = self.lex_fe._get_lexicalize_features(result_tuple) # update the feature dict # feature_dict.update(lex_features) return (feature_dict,category,word,sentence) # # method to place feature value for W0 as NP if its a multiword NP # def update_feature_dict(self, feature_dict, word, old_word`,index): # if word!=old_word: # _tag = "W"+str(index) # feature_dict[_tag] = "NN" # return feature_dict # # m = 
MaltParsedPOSContextSequenceFeatureExtractor(k_param=4) # m.extract_features("Guests/NNS invited/VBD to/TO the/DT presidential/JJ ball/NN will/MD have/VB to/TO wear/VB tuxedoes/NNS ./. 1")