from copy import copy

def get_prepared_KB(file_prepared_KB, stemmed=True):
    phrase2candidates = {}
    with open(file_prepared_KB, 'r', encoding='utf-8') as f:
        p = ''
        candi_list = []
        for line in f:
            if line.startswith('=='):
                # store the candidates collected for the previous phrase
                if p != '':
                    phrase2candidates[p] = copy(candi_list)
                # start the next phrase
                strs = line.split('\t')
                p = strs[1].strip()
                candi_list.clear()  # clear the list
            else:
                strs = line.split('\t')
                if len(strs) > 1:
                    explain = strs[1].replace("==SS==", "")
                    explain = explain.replace("==DB==", "")
                    if stemmed:
                        explain = stem_words(explain)
                    candi_list.append(explain)
    # flush the final phrase
    if p != '':
        phrase2candidates[p] = copy(candi_list)
    # print(phrase2candidates)
    return phrase2candidates
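# Usage sketch (hypothetical file name and contents, for illustration only). The
# layout below is inferred from the parser above: a tab-separated file in which a
# line starting with '==' introduces a phrase in its second field, and every other
# line carries one candidate explanation in its second field, with the '==SS==' and
# '==DB==' markers stripped out.
if __name__ == '__main__':
    sample = ("==\tbank\n"
              "1\tfinancial ==SS== institution\n"
              "2\triver ==DB== bank\n")
    with open('sample_prepared_KB.txt', 'w', encoding='utf-8') as f:
        f.write(sample)
    kb = get_prepared_KB('sample_prepared_KB.txt', stemmed=False)
    # note: candidates keep their trailing newline unless stem_words normalises it
    print(kb)  # {'bank': ['financial  institution\n', 'river  bank\n']}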
def get_meals(tokenized_string, enum=False):
    """
    Returns a list of (index, meal) tuples (when enum=True) or a list of
    meals found in a tokenized string.

    >>> raw_input_string = "I want cats for breakfast and dogs for dinner."
    >>> tokenizer = nltk.WordPunctTokenizer()
    >>> tokenized_string = tokenizer.tokenize(raw_input_string)
    >>> for i, w in get_meals(tokenized_string, enum=True): print(i, w)
    4 breakfast
    8 dinner
    """
    stemmed_string = utils.stem_words(tokenized_string)
    stemmed_meals = utils.stem_words(wordlists.meal_types)
    results = _extract_words_from_list(stemmed_meals, stemmed_string, True)
    if enum:
        return [(i, tokenized_string[i]) for i, w in results]
    else:
        return [tokenized_string[i] for i, w in results]
def get_cuisines(tokenized_string, enum=False):
    """
    Returns a list of (index, cuisine) tuples (when enum=True) or a list of
    cuisines found in a tokenized string.

    >>> raw_input_string = "I want a chinese or mexican dish."
    >>> tokenizer = nltk.WordPunctTokenizer()
    >>> tokenized_string = tokenizer.tokenize(raw_input_string)
    >>> for i, w in get_cuisines(tokenized_string, enum=True): print(i, w)
    3 chinese
    5 mexican
    """
    stemmed_string = utils.stem_words(tokenized_string)
    # cuisine words minus meal types, plus adjectival forms (e.g. nationalities)
    cuisines = set.difference(wordlists.cuisines, wordlists.meal_types)
    cuisines = cuisines.union(wordlists.list_of_adjectivals)
    stemmed_cuisines = utils.stem_words(cuisines)
    results = _extract_words_from_list(stemmed_cuisines, stemmed_string, True)
    if enum:
        return [(i, tokenized_string[i]) for i, w in results]
    else:
        return [tokenized_string[i] for i, w in results]
def filter_text(self):
    # tokenise the tweet, drop duplicates and stopwords, then stem the keywords
    text = self.get_tweet_text()
    text = set(utils.tokenise(text))
    filtered_text = list(self.remove_stopwords(text))
    keywords = utils.stem_words(filtered_text)
    self.set_tweet_tokens(keywords)
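# Standalone sketch of the same tokenise -> de-duplicate -> remove-stopwords ->
# stem pipeline used by filter_text above, written against plain NLTK so it runs
# without the surrounding class or the utils helpers (the name extract_keywords
# is an assumption for illustration, not part of the repo). Requires
# nltk.download('stopwords') on first use.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def extract_keywords(text):
    # tokenise and de-duplicate, mirroring set(utils.tokenise(text)) above
    tokens = set(nltk.WordPunctTokenizer().tokenize(text.lower()))
    # drop stopwords and punctuation-only tokens
    stops = set(stopwords.words('english'))
    filtered = [t for t in tokens if t.isalpha() and t not in stops]
    # stem what is left, mirroring utils.stem_words(filtered_text)
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in filtered]

# e.g. extract_keywords("I want cats for breakfast and dogs for dinner.")
# -> something like ['cat', 'breakfast', 'want', 'dog', 'dinner'] (set order varies)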