def extract_features(self, instance):
    """Extract lexical features around the head NP of *instance*.

    Returns a 4-tuple ``(feature_dict, category, word, sentence)``.
    Every element is None when the instance is malformed, the head word
    is non-English, or the word cannot be located in the sentence.
    """
    _sentence_feature_extractor = SentenceTokensFeatureExtractor()
    result_tuple = _sentence_feature_extractor.extract_features(instance)
    # Malformed instance.
    if result_tuple is None:
        return (None, None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    # If word is a non-English token then there is nothing to extract.
    if not self.english_filter.filter(word):
        return (None, None, None, None)
    # The sentence must contain the head NP; otherwise ignore the
    # instance.  (The original fell through this case with `feature_dict`
    # unbound and raised NameError at the return.)
    if word.lower() not in sentence.lower():
        return (None, None, None, None)
    # Debug output: the original `print(sys.stderr, ...)` printed the
    # file-object repr to stdout; write to stderr explicitly instead.
    sys.stderr.write("word: %s  sentence: %s\n" % (word, sentence))
    index = None
    if tokens.count(word) == 0:
        # No exact token match: take the first token that contains the word.
        for token_index, token in enumerate(tokens):
            if word in token:
                index = token_index
                sys.stderr.write("containing word found at index : %d\n" % index)
                break
        if index is None:
            # Word not locatable in the token list (original raised
            # NameError here on the unbound `index`).
            return (None, None, None, None)
    else:
        # Pick the first index of 'word' in the token list 'tokens'.
        index = tokens.index(word)
        sys.stderr.write("exact word found at index : %d\n" % index)
    # Build the lexical feature dict around the located index.
    feature_dict = self.getLexicalFeaturesForIndexRange(result_tuple, index)
    if category is not None:
        category = category.strip()
    tpl = (feature_dict, category, word, sentence)
    sys.stderr.write(" lexical_fe returning : %s\n" % (tpl,))
    return tpl
def extract_features(self, instance):
    """Extract lexical features for *instance*.

    Returns ``(feature_dict, category, word, sentence)``, or a 4-tuple of
    Nones for malformed, non-English, or unlocatable instances.
    """
    extractor = SentenceTokensFeatureExtractor()
    result_tuple = extractor.extract_features(instance)
    if result_tuple is None:
        # Malformed instance.
        return (None, None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    if not self.english_filter.filter(word):
        # Non-English head word: nothing to extract.
        return (None, None, None, None)
    if word.lower() not in sentence.lower():
        # Sentence does not contain the head NP — just ignore such
        # sentences.  (Previously this path left `feature_dict` unbound
        # and crashed with NameError below.)
        return (None, None, None, None)
    # `print(sys.stderr, ...)` printed the stream object itself; use an
    # explicit stderr write for the debug trace.
    sys.stderr.write("word: %s  sentence: %s\n" % (word, sentence))
    word_index = None
    if word in tokens:
        # Exact token match: first occurrence wins.
        word_index = tokens.index(word)
        sys.stderr.write("exact word found at index : %d\n" % word_index)
    else:
        # Fall back to the first token that *contains* the word.
        for i, token in enumerate(tokens):
            if word in token:
                word_index = i
                sys.stderr.write("containing word found at index : %d\n" % i)
                break
        if word_index is None:
            # No token contains the word (original raised NameError here).
            return (None, None, None, None)
    # Sequence model for tokens in the given index range.
    feature_dict = self.getLexicalFeaturesForIndexRange(result_tuple, word_index)
    if category is not None:
        category = category.strip()
    tpl = (feature_dict, category, word, sentence)
    sys.stderr.write(" lexical_fe returning : %s\n" % (tpl,))
    return tpl
def extract_features(self, instance):
    """Build an n-gram context feature around the head NP of *instance*.

    The joined n-gram is stored under key ``"<n>-gram"`` in
    ``self.feature_dict``.  Returns ``(self.feature_dict, None,
    head_token)``, or ``(None, None, None)`` when the instance is
    malformed or unusable.
    """
    _sentence_feature_extractor = SentenceTokensFeatureExtractor()
    result_tuple = _sentence_feature_extractor.extract_features(instance)
    # Malformed instance.
    if result_tuple is None:
        return (None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    num_of_tokens = len(tokens)
    # The sentence must contain the NP and be long enough for an n-gram.
    if word.lower() in sentence.lower() and self.n_value <= num_of_tokens:
        # The containment check above is case-insensitive, but the index
        # lookup is exact — guard it instead of raising ValueError.
        if word not in tokens:
            return (None, None, None)
        # Capture the index of the head NP in the instance.
        index = tokens.index(word)
        if index == 0:
            # Head NP is the first token: take the n tokens after it.
            n_grams = tokens[index + 1:index + self.n_value + 1]
        else:
            # Otherwise take up to n tokens preceding the head NP.
            # Clamping at 0 fixes two defects in the original: the
            # middle-word loop stopped at `curr_index != 0` and never
            # included tokens[0], and the last-word branch could compute
            # a negative slice start that wrapped around the list.
            n_grams = tokens[max(0, index - self.n_value):index]
        result = ' '.join(n_grams)
        key = str(self.n_value) + "-gram"
        self.feature_dict[key] = result
        return (self.feature_dict, None, tokens[index])
    return (None, None, None)
def extract_features(self, instance):
    """Extract an n-gram window feature around the head NP.

    Stores the space-joined window under ``"<n>-gram"`` in
    ``self.feature_dict`` and returns ``(self.feature_dict, None,
    head_token)``; returns ``(None, None, None)`` for malformed or
    unusable instances.
    """
    extractor = SentenceTokensFeatureExtractor()
    result_tuple = extractor.extract_features(instance)
    if result_tuple is None:
        # Malformed instance.
        return (None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    token_count = len(tokens)
    # Require the NP in the sentence and enough tokens for the window.
    if word.lower() not in sentence.lower() or self.n_value > token_count:
        return (None, None, None)
    # Case-insensitive containment does not guarantee an exact token
    # match, so `tokens.index(word)` could raise ValueError — guard it.
    if word not in tokens:
        return (None, None, None)
    head = tokens.index(word)
    if head == 0:
        # Head NP is the first word: use the n following tokens.
        window = tokens[1:self.n_value + 1]
    else:
        # Use up to n preceding tokens, clamped at the sentence start.
        # This fixes the original off-by-one (`curr_index != 0` skipped
        # tokens[0]) and the negative-start slice in the last-word case.
        window = tokens[max(0, head - self.n_value):head]
    self.feature_dict[str(self.n_value) + "-gram"] = ' '.join(window)
    return (self.feature_dict, None, tokens[head])
def extract_features(self, instance):
    """Return a sentence-length feature for *instance*.

    Returns ``(feat_dict, None, word, sentence)`` with
    ``feat_dict["instance_length"]`` holding the token count, or
    ``(None, None, None, None)`` on any failure.
    """
    try:
        # Check the input *before* using it (the original tested
        # `instance is None` only after running the extractor).
        if instance is None:
            return (None, None, None, None)
        _sentence_feature_extractor = SentenceTokensFeatureExtractor()
        result_tuple = _sentence_feature_extractor.extract_features(instance)
        # Malformed instance: previously this produced a TypeError that
        # the broad except silently swallowed.
        if result_tuple is None:
            return (None, None, None, None)
        category, word, tokens, sentence, _old_word = result_tuple
        # Annotated sentences look like "meta|text"; keep the text part.
        if "|" in sentence:
            norm_sentence = sentence.split("|")[1]
        else:
            norm_sentence = sentence
        tokens = nltk.word_tokenize(norm_sentence)
        feat_dict = {"instance_length": len(tokens)}
        return (feat_dict, None, word, sentence)
    except Exception as ex:
        # NOTE(review): broad catch kept to preserve the best-effort
        # contract.  `ex.message` is gone in Python 3 (and deprecated in
        # 2.6+) — use str(ex) instead.
        sys.stderr.write("%s\n" % str(ex))
        return (None, None, None, None)
def extract_features(self, instance):
    """Compute the instance-length feature (token count of the sentence).

    Returns ``(feat_dict, None, word, sentence)``, or a 4-tuple of Nones
    when the instance is missing, malformed, or extraction fails.
    """
    try:
        # Validate input up front instead of after using the extractor
        # output, as the original did.
        if instance is None:
            return (None, None, None, None)
        extractor = SentenceTokensFeatureExtractor()
        result_tuple = extractor.extract_features(instance)
        if result_tuple is None:
            # Malformed instance — handle explicitly rather than relying
            # on the broad except to swallow the unpacking TypeError.
            return (None, None, None, None)
        category, word, tokens, sentence, _old_word = result_tuple
        # Strip the "meta|" prefix when the sentence is annotated.
        norm_sentence = sentence.split("|")[1] if "|" in sentence else sentence
        length = len(nltk.word_tokenize(norm_sentence))
        feat_dict = {}
        feat_dict["instance_length"] = length
        return (feat_dict, None, word, sentence)
    except Exception as ex:
        # `ex.message` does not exist in Python 3; str(ex) is portable.
        sys.stderr.write("%s\n" % str(ex))
        return (None, None, None, None)
def extract_features(self, instance):
    """Extract a POS context-sequence feature dict around the head NP.

    Returns ``(feature_dict, category, word, sentence)``; a 4-tuple of
    Nones for malformed, non-English, or unlocatable instances.  Test
    instances (category and word both None) get a whole-sentence or
    k-window sequence model with no category/word.
    """
    _sentence_feature_extractor = SentenceTokensFeatureExtractor()
    result_tuple = _sentence_feature_extractor.extract_features(instance)
    # Malformed instance.
    if result_tuple is None:
        return (None, None, None, None)
    category, word, tokens, sentence, _old_word = result_tuple
    # If word is a non-English token then return None.
    if not self.english_filter.filter(word):
        return (None, None, None, None)
    # Test instance: no gold category/word available.
    if category is None and word is None:
        sys.stderr.write(" category and word is None for : %s\n" % (result_tuple,))
        if self.k_param == KPARAM:
            feature_dict = self.getFullSentenceSequenceModel(result_tuple)
        else:
            feature_dict = self.getKSequenceModel(result_tuple)
        return (feature_dict, None, None, None)
    num_of_tokens = len(tokens)
    # Ignore sentences that do not contain the head NP.  (The original
    # fell through with `feature_dict` unbound and raised NameError.)
    if word.lower() not in sentence.lower():
        return (None, None, None, None)
    self.debug("word in sentence")
    # `print(sys.stderr, ...)` printed the stream repr; write explicitly.
    sys.stderr.write("word: %s  sentence: %s\n" % (word, sentence))
    index = None
    if tokens.count(word) == 0:
        # No exact token match: first token *containing* the word wins.
        for token_index, token in enumerate(tokens):
            if word in token:
                index = token_index
                sys.stderr.write("containing word found at index : %d\n" % index)
                break
        if index is None:
            # Word not locatable (original raised NameError here).
            return (None, None, None, None)
    else:
        # Pick the first index of 'word' in the token list 'tokens'.
        index = tokens.index(word)
        sys.stderr.write("exact word found at index : %d\n" % index)
    # Explicit floor division: same result as Py2 int `/`, Py3-safe.
    mid = num_of_tokens // 2
    if index < mid:
        # Word lies in the first half of the sentence.
        self.debug("lies in lower half")
        start_index, end_index = self.getIndicesFirstHalf(index, num_of_tokens)
    elif index > mid:
        # Word lies in the second half of the sentence.
        self.debug("lies in second half")
        start_index, end_index = self.getIndicesSecondHalf(index, num_of_tokens)
    else:
        # Word lies in the middle of the sentence.
        self.debug("lies at the middle")
        start_index, end_index = self.getIndicesForMiddleWord(index, num_of_tokens)
    self.debug("start_index:" + str(start_index) + " end_index:" + str(end_index))
    # Sequence model for tokens in the chosen index range.
    feature_dict = self.getSequenceModelForIndexRange(
        result_tuple, index, start_index, end_index)
    if category is not None:
        category = category.strip()
    tpl = (feature_dict, category, word, sentence)
    sys.stderr.write(" pos_context_seq_fe returning : %s\n" % (tpl,))
    return tpl