def query(self, question=None, tbl=None):
    # Rank indexed documents against a free-text question using the LSI model.
    # print('query: ', question)
    self.question = question
    # clean_text_by_word returns a dict of cleaned word -> SyntacticUnit;
    # doc2bow iterates its keys.
    self.vec_bow = self.dict.doc2bow(clean_text_by_word(question))
    # vec_bow = self.dict.doc2bow(question.lower().split())
    # print('question bow: ', vec_bow)
    vec_bow_reduced = self.model[self.vec_bow]
    if not vec_bow_reduced:
        print('vector is empty')
        return
    # print('reduced dimension vector:', vec_bow_reduced)
    n_returned = 10
    sims = sorted(enumerate(self.sim_index[vec_bow_reduced]), key=lambda x: -x[1])[:n_returned]
    # sims_kw = sorted(enumerate(self.sim_raw[self.vec_bow]), key=lambda x: -x[1])[:n_returned]
    print('LSI similarities: ', sims)
    # print('KW similarities:', sims_kw)
    sims_id = [i for i in sims]
    # sims_id_kw = [i for i in sims_kw]
    fn = question.replace(' ', '_')  # file name derived from the query text
    self.get_results(sims_id, save=True, name=fn)
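# query() assumes self.dict, self.model, and self.sim_index were built elsewhere
# in the class. The sketch below is an assumption about that setup using the
# standard gensim (pre-4.0) LSI APIs; the variable names, example documents,
# and num_topics value are illustrative, not taken from the original code.
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
from gensim.summarization.textcleaner import clean_text_by_word

docs = ["first example document about queries", "second example document about indexes"]
token_lists = [list(clean_text_by_word(d)) for d in docs]  # dict keys = cleaned words
dictionary = Dictionary(token_lists)
corpus = [dictionary.doc2bow(t) for t in token_lists]
lsi = LsiModel(corpus, id2word=dictionary, num_topics=100)
index = MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
# self.dict, self.model, and self.sim_index would correspond to
# dictionary, lsi, and index respectively.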
def intradocsim(self, text=None):
    # Build a per-sentence similarity index over a single document and
    # rank its sentences against the current question.
    sentences = nltk.sent_tokenize(text)
    doc_tokens = []
    for sentence in sentences:
        sentence_tokens = clean_text_by_word(sentence)
        doc_tokens.append(sentence_tokens)
    doc_dict = Dictionary(doc_tokens)
    doc_corpus = [doc_dict.doc2bow(i) for i in doc_tokens]
    # doc_corpus_tfidf = TfidfModel(doc_corpus, id2word=doc_dict)
    doc_index = MatrixSimilarity(doc_corpus)
    qvect = doc_dict.doc2bow(clean_text_by_word(self.question))
    sims = sorted(enumerate(doc_index[qvect]), key=lambda x: -x[1])[:5]
    self.f.write('sentence similarity ')
    for i, j in sims:
        self.f.write('\n({:.2f}) {}\n'.format(j, sentences[i]))
    return
def get_stems(doc_dict: Dict[str, str], keywords_set: Set[str]) -> Dict[str, Set[str]]:
    """
    Generates a dictionary of stems with their associated words.

    Parameters
    ----------
    doc_dict : Dict[str, str]
        Dictionary of text documents
    keywords_set : Set[str]
        Set of keywords to generate the stems dict for

    Returns
    -------
    result : Dict[str, Set[str]]
        Dictionary mapping each stem to the set of words that share it
    """
    if len(doc_dict) < 1:
        raise ValueError("No documents to scan")
    output: Dict[str, Set[str]] = {}
    for v in doc_dict.values():
        tokens: Dict = clean_text_by_word(v)
        # Generate the set of stems from the keywords present in this document
        filtered_tokens: List[str] = [k for k in tokens.keys() if k in keywords_set]
        stems: Set[str] = set()
        for word in filtered_tokens:
            stems.add(tokens[word].token)
        # Build the set of words per stem (including non-keywords)
        for stem in stems:
            word_list = set()
            for t in tokens.keys():
                if tokens[t].token == stem:
                    word_list.add(tokens[t].text)
            if stem not in output.keys():
                output[stem] = word_list
            else:
                output[stem].update(word_list)
    return output
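# A minimal usage sketch for get_stems(); the documents and keywords below are
# made up for illustration. The exact stem strings depend on gensim's stemmer,
# so only the shape of the result is indicated.
example_docs = {
    "doc1": "Running runners ran the race.",
    "doc2": "The runner runs daily.",
}
example_keywords = {"running", "runner", "runs"}
stems_to_words = get_stems(example_docs, example_keywords)
# stems_to_words maps each stem found among the keywords to the set of surface
# forms in the documents that reduce to that stem, e.g. {"run": {"running", "runs"}, ...}
print(stems_to_words)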
def iterrecords(self):
    # Generates document tokens for the dictionary, streaming rows from the database.
    self.index = []
    cursor.execute(self.sql)
    ct = 0
    for doc in cursor:
        print(ct)
        self.index.append(str(doc[0]).strip())
        doc = doc[1]
        # print(to_beautiful(doc[1]))
        if self.first_sentences:
            doc = get_first_n_sentences_from_document(doc, self.n_sentences)
        tokens = clean_text_by_word(doc)
        ct += 1
        yield tokens  # or whatever tokenization suits you
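# iterrecords() streams one token mapping per database row; the comment above
# says these feed the dictionary. A hedged sketch of that consumption with
# gensim (the `indexer` instance and its sql/cursor setup are assumptions,
# not shown in the original):
from gensim.corpora import Dictionary

# Dictionary iterates each yielded mapping, so the dict keys (the cleaned
# words from clean_text_by_word) become the vocabulary.
dictionary = Dictionary(indexer.iterrecords())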
from gensim.summarization.textcleaner import clean_text_by_word

print(clean_text_by_word("Hola como estas."))
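# For reference, clean_text_by_word (gensim's pre-4.0 summarization API) returns
# a dict mapping each cleaned word to a SyntacticUnit, whose .text and .token
# attributes are what get_stems() above relies on. A small inspection sketch
# with an illustrative sentence:
units = clean_text_by_word("Running runners ran the race.")
for word, unit in units.items():
    # unit.text is the surface form, unit.token its stem
    print(word, unit.text, unit.token)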