def extract_features(self, instance):
    """Extract lexical features around the head NP of *instance*.

    Returns a 4-tuple ``(feature_dict, category, word, sentence)``.
    Every element is None when the instance is malformed, the head word
    is non-English, or the word cannot be located in the sentence.
    """
    _sentence_feature_extractor = SentenceTokensFeatureExtractor()
    result_tuple = _sentence_feature_extractor.extract_features(instance)
    # Malformed instance.
    if result_tuple is None:
        return (None, None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    # If word is a non-English token then there is nothing to extract.
    if not self.english_filter.filter(word):
        return (None, None, None, None)
    # The sentence must contain the head NP; otherwise ignore the
    # instance.  (The original fell through this case with `feature_dict`
    # unbound and raised NameError at the return.)
    if word.lower() not in sentence.lower():
        return (None, None, None, None)
    # Debug output: the original `print(sys.stderr, ...)` printed the
    # file-object repr to stdout; write to stderr explicitly instead.
    sys.stderr.write("word: %s  sentence: %s\n" % (word, sentence))
    index = None
    if tokens.count(word) == 0:
        # No exact token match: take the first token that contains the word.
        for token_index, token in enumerate(tokens):
            if word in token:
                index = token_index
                sys.stderr.write("containing word found at index : %d\n" % index)
                break
        if index is None:
            # Word not locatable in the token list (original raised
            # NameError here on the unbound `index`).
            return (None, None, None, None)
    else:
        # Pick the first index of 'word' in the token list 'tokens'.
        index = tokens.index(word)
        sys.stderr.write("exact word found at index : %d\n" % index)
    # Build the lexical feature dict around the located index.
    feature_dict = self.getLexicalFeaturesForIndexRange(result_tuple, index)
    if category is not None:
        category = category.strip()
    tpl = (feature_dict, category, word, sentence)
    sys.stderr.write(" lexical_fe returning : %s\n" % (tpl,))
    return tpl
def extract_features(self, instance):
    """Extract lexical features for *instance*.

    Returns ``(feature_dict, category, word, sentence)``, or a 4-tuple of
    Nones for malformed, non-English, or unlocatable instances.
    """
    extractor = SentenceTokensFeatureExtractor()
    result_tuple = extractor.extract_features(instance)
    if result_tuple is None:
        # Malformed instance.
        return (None, None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    if not self.english_filter.filter(word):
        # Non-English head word: nothing to extract.
        return (None, None, None, None)
    if word.lower() not in sentence.lower():
        # Sentence does not contain the head NP — just ignore such
        # sentences.  (Previously this path left `feature_dict` unbound
        # and crashed with NameError below.)
        return (None, None, None, None)
    # `print(sys.stderr, ...)` printed the stream object itself; use an
    # explicit stderr write for the debug trace.
    sys.stderr.write("word: %s  sentence: %s\n" % (word, sentence))
    word_index = None
    if word in tokens:
        # Exact token match: first occurrence wins.
        word_index = tokens.index(word)
        sys.stderr.write("exact word found at index : %d\n" % word_index)
    else:
        # Fall back to the first token that *contains* the word.
        for i, token in enumerate(tokens):
            if word in token:
                word_index = i
                sys.stderr.write("containing word found at index : %d\n" % i)
                break
        if word_index is None:
            # No token contains the word (original raised NameError here).
            return (None, None, None, None)
    # Sequence model for tokens in the given index range.
    feature_dict = self.getLexicalFeaturesForIndexRange(result_tuple, word_index)
    if category is not None:
        category = category.strip()
    tpl = (feature_dict, category, word, sentence)
    sys.stderr.write(" lexical_fe returning : %s\n" % (tpl,))
    return tpl
def extract_features(self, instance):
    """Build an n-gram context feature around the head NP of *instance*.

    The joined n-gram is stored under key ``"<n>-gram"`` in
    ``self.feature_dict``.  Returns ``(self.feature_dict, None,
    head_token)``, or ``(None, None, None)`` when the instance is
    malformed or unusable.
    """
    _sentence_feature_extractor = SentenceTokensFeatureExtractor()
    result_tuple = _sentence_feature_extractor.extract_features(instance)
    # Malformed instance.
    if result_tuple is None:
        return (None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    num_of_tokens = len(tokens)
    # The sentence must contain the NP and be long enough for an n-gram.
    if word.lower() in sentence.lower() and self.n_value <= num_of_tokens:
        # The containment check above is case-insensitive, but the index
        # lookup is exact — guard it instead of raising ValueError.
        if word not in tokens:
            return (None, None, None)
        # Capture the index of the head NP in the instance.
        index = tokens.index(word)
        if index == 0:
            # Head NP is the first token: take the n tokens after it.
            n_grams = tokens[index + 1:index + self.n_value + 1]
        else:
            # Otherwise take up to n tokens preceding the head NP.
            # Clamping at 0 fixes two defects in the original: the
            # middle-word loop stopped at `curr_index != 0` and never
            # included tokens[0], and the last-word branch could compute
            # a negative slice start that wrapped around the list.
            n_grams = tokens[max(0, index - self.n_value):index]
        result = ' '.join(n_grams)
        key = str(self.n_value) + "-gram"
        self.feature_dict[key] = result
        return (self.feature_dict, None, tokens[index])
    return (None, None, None)
def extract_features(self, instance):
    """Extract an n-gram window feature around the head NP.

    Stores the space-joined window under ``"<n>-gram"`` in
    ``self.feature_dict`` and returns ``(self.feature_dict, None,
    head_token)``; returns ``(None, None, None)`` for malformed or
    unusable instances.
    """
    extractor = SentenceTokensFeatureExtractor()
    result_tuple = extractor.extract_features(instance)
    if result_tuple is None:
        # Malformed instance.
        return (None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    token_count = len(tokens)
    # Require the NP in the sentence and enough tokens for the window.
    if word.lower() not in sentence.lower() or self.n_value > token_count:
        return (None, None, None)
    # Case-insensitive containment does not guarantee an exact token
    # match, so `tokens.index(word)` could raise ValueError — guard it.
    if word not in tokens:
        return (None, None, None)
    head = tokens.index(word)
    if head == 0:
        # Head NP is the first word: use the n following tokens.
        window = tokens[1:self.n_value + 1]
    else:
        # Use up to n preceding tokens, clamped at the sentence start.
        # This fixes the original off-by-one (`curr_index != 0` skipped
        # tokens[0]) and the negative-start slice in the last-word case.
        window = tokens[max(0, head - self.n_value):head]
    self.feature_dict[str(self.n_value) + "-gram"] = ' '.join(window)
    return (self.feature_dict, None, tokens[head])
def extract_features(self, instance):
    """Return a sentence-length feature for *instance*.

    Returns ``(feat_dict, None, word, sentence)`` with
    ``feat_dict["instance_length"]`` holding the token count, or
    ``(None, None, None, None)`` on any failure.
    """
    try:
        # Check the input *before* using it (the original tested
        # `instance is None` only after running the extractor).
        if instance is None:
            return (None, None, None, None)
        _sentence_feature_extractor = SentenceTokensFeatureExtractor()
        result_tuple = _sentence_feature_extractor.extract_features(instance)
        # Malformed instance: previously this produced a TypeError that
        # the broad except silently swallowed.
        if result_tuple is None:
            return (None, None, None, None)
        category, word, tokens, sentence, _old_word = result_tuple
        # Annotated sentences look like "meta|text"; keep the text part.
        if "|" in sentence:
            norm_sentence = sentence.split("|")[1]
        else:
            norm_sentence = sentence
        tokens = nltk.word_tokenize(norm_sentence)
        feat_dict = {"instance_length": len(tokens)}
        return (feat_dict, None, word, sentence)
    except Exception as ex:
        # NOTE(review): broad catch kept to preserve the best-effort
        # contract.  `ex.message` is gone in Python 3 (and deprecated in
        # 2.6+) — use str(ex) instead.
        sys.stderr.write("%s\n" % str(ex))
        return (None, None, None, None)
def extract_features(self, instance):
    """Compute the instance-length feature (token count of the sentence).

    Returns ``(feat_dict, None, word, sentence)``, or a 4-tuple of Nones
    when the instance is missing, malformed, or extraction fails.
    """
    try:
        # Validate input up front instead of after using the extractor
        # output, as the original did.
        if instance is None:
            return (None, None, None, None)
        extractor = SentenceTokensFeatureExtractor()
        result_tuple = extractor.extract_features(instance)
        if result_tuple is None:
            # Malformed instance — handle explicitly rather than relying
            # on the broad except to swallow the unpacking TypeError.
            return (None, None, None, None)
        category, word, tokens, sentence, _old_word = result_tuple
        # Strip the "meta|" prefix when the sentence is annotated.
        norm_sentence = sentence.split("|")[1] if "|" in sentence else sentence
        length = len(nltk.word_tokenize(norm_sentence))
        feat_dict = {}
        feat_dict["instance_length"] = length
        return (feat_dict, None, word, sentence)
    except Exception as ex:
        # `ex.message` does not exist in Python 3; str(ex) is portable.
        sys.stderr.write("%s\n" % str(ex))
        return (None, None, None, None)
def extract_features(self, instance):
    """Extract a POS context-sequence feature dict around the head NP.

    Returns ``(feature_dict, category, word, sentence)``; a 4-tuple of
    Nones for malformed, non-English, or unlocatable instances.  Test
    instances (category and word both None) get a whole-sentence or
    k-window sequence model with no category/word.
    """
    _sentence_feature_extractor = SentenceTokensFeatureExtractor()
    result_tuple = _sentence_feature_extractor.extract_features(instance)
    # Malformed instance.
    if result_tuple is None:
        return (None, None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    # If word is a non-English token then return None.
    if not self.english_filter.filter(word):
        return (None, None, None, None)
    # Test instance: no gold category/word available.
    if category is None and word is None:
        sys.stderr.write(" category and word is None for : %s\n" % (result_tuple,))
        if self.k_param == KPARAM:
            feature_dict = self.getFullSentenceSequenceModel(result_tuple)
        else:
            feature_dict = self.getKSequenceModel(result_tuple)
        return (feature_dict, None, None, None)
    num_of_tokens = len(tokens)
    # Ignore sentences that do not contain the head NP.  (The original
    # fell through with `feature_dict` unbound and raised NameError.)
    if word.lower() not in sentence.lower():
        return (None, None, None, None)
    self.debug("word in sentence")
    # `print(sys.stderr, ...)` printed the stream repr; write explicitly.
    sys.stderr.write("word: %s  sentence: %s\n" % (word, sentence))
    index = None
    if tokens.count(word) == 0:
        # No exact token match: first token *containing* the word wins.
        for token_index, token in enumerate(tokens):
            if word in token:
                index = token_index
                sys.stderr.write("containing word found at index : %d\n" % index)
                break
        if index is None:
            # Word not locatable (original raised NameError here).
            return (None, None, None, None)
    else:
        # Pick the first index of 'word' in the token list 'tokens'.
        index = tokens.index(word)
        sys.stderr.write("exact word found at index : %d\n" % index)
    # Explicit floor division: same result as Py2 int `/`, Py3-safe.
    mid = num_of_tokens // 2
    if index < mid:
        # Word lies in the first half of the sentence.
        self.debug("lies in lower half")
        start_index, end_index = self.getIndicesFirstHalf(index, num_of_tokens)
    elif index > mid:
        # Word lies in the second half of the sentence.
        self.debug("lies in second half")
        start_index, end_index = self.getIndicesSecondHalf(index, num_of_tokens)
    else:
        # Word lies in the middle of the sentence.
        self.debug("lies at the middle")
        start_index, end_index = self.getIndicesForMiddleWord(index, num_of_tokens)
    self.debug("start_index:" + str(start_index) + " end_index:" + str(end_index))
    # Sequence model for tokens in the chosen index range.
    feature_dict = self.getSequenceModelForIndexRange(
        result_tuple, index, start_index, end_index)
    if category is not None:
        category = category.strip()
    tpl = (feature_dict, category, word, sentence)
    sys.stderr.write(" pos_context_seq_fe returning : %s\n" % (tpl,))
    return tpl