def collect_feedback_words(self): self.words_to_add = [] self.NEList = [] id_topic = self.id_topic if id_topic in self.feedbacks: for feedback in self.feedbacks[id_topic]: if(feedback[1] not in self.rel_docs): self.rel_docs.append(feedback[1]) words_topic_name = feedback[2] passage_text = feedback[3] try: tokens = nltk.word_tokenize(' '.join(diversification.NErecognition(passage_text))) tokens_topic_name = nltk.word_tokenize(words_topic_name) self.words_to_add += tokens_topic_name NEList = list(set(tokens)) print 'Nelist',NEList print 'Topics title',tokens_topic_name # if(self.domain_name != 'local politics'): if(self.domain_name): NEList = [pxml.stem_and_lemmatize(word) for word in NEList if pxml.lemmatize(word.lower()) not in stopwords.words('english')] tokens_topic_name = [pxml.stem_and_lemmatize(word) for word in tokens_topic_name if pxml.lemmatize(word.lower()) not in stopwords.words('english')] except UnicodeError: NEList = [] self.NEList += NEList return list(set(self.NEList)), list(set(self.words_to_add)) return [], []
def process_words_feedback(self, words_to_add):
    """Tokenize and de-duplicate feedback words, stemming them when a
    domain is active.

    Args:
        words_to_add: iterable of raw feedback strings.

    Returns:
        List of unique lower-cased tokens; stemmed/lemmatized when
        ``self.domain_name`` is truthy (de-duplication happens before
        stemming, so stems may repeat).
    """
    unique_tokens = set()
    for phrase in words_to_add:
        unique_tokens.update(nltk.word_tokenize(phrase.lower()))
    processed = list(unique_tokens)
    if self.domain_name:
        processed = [pxml.stem_and_lemmatize(tok) for tok in processed]
    return processed
def process_query(self, query):
    """Normalize a query string and record it on the instance.

    Looks up the query's domain, strips non-word characters, tokenizes,
    optionally stems, and stores the rejoined text in both
    ``self.raw_query`` and ``self.query``.

    Side effect: sets ``self.domain_name``.
    """
    # The domain lookup must use the untouched query string -- it is the
    # key into dict_query_domain.
    self.domain_name = self.map_name_domaine[self.dict_query_domain[query]]
    cleaned = re.sub(r'[^\w]', ' ', query)
    words = nltk.word_tokenize(cleaned)
    if self.domain_name:
        words = [pxml.stem_and_lemmatize(w) for w in words]
    normalized = ' '.join(words)
    self.raw_query = normalized
    self.query = normalized
def format_query(self, query):
    """Return the query as a list of lower-cased, stemmed tokens.

    Unlike process_query, the case is folded and stemming is applied
    unconditionally (no ``self.domain_name`` check).
    """
    without_punct = re.sub(r'[^\w]', ' ', query)
    return [pxml.stem_and_lemmatize(tok)
            for tok in nltk.word_tokenize(without_punct.lower())]