def _getSources():
    """Read every document in docs_list, record each word's per-document
    occurrence counts in the DB, and return the per-document unique word lists."""
    all_word_lists = []
    for f in docs_list:
        with open(f, 'r') as doc_file:
            text = doc_file.read().lower()
        text = removeNonAscii(text)
        sentences = nltk.sent_tokenize(text)
        # Unique words only: a word repeated within one document counts once.
        current_doc_uniq_word_list = list(set(_generate_word_list(sentences)))
        all_word_lists.append(current_doc_uniq_word_list)
        con = DB.get_con()
        for w in current_doc_uniq_word_list:
            # Skip purely numeric tokens and single-character tokens.
            if not isNumeric(w) and len(w) > 1:
                DB.incr_occurrence(w, con)
        DB.incr_total_doc_number(con)
        con.close()
    return all_word_lists
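
# _getSources() relies on removeNonAscii, isNumeric and _generate_word_list,
# which are defined elsewhere in this project. Below is a minimal, hypothetical
# sketch of what they are assumed to do (not the original implementations).
import nltk

def removeNonAscii(text):
    # Drop any character outside the ASCII range.
    return ''.join(ch for ch in text if ord(ch) < 128)

def isNumeric(word):
    # Treat a token as numeric if it parses as a float.
    try:
        float(word)
        return True
    except ValueError:
        return False

def _generate_word_list(sentences):
    # Tokenize each sentence and return one flat list of word tokens.
    words = []
    for sentence in sentences:
        words.extend(nltk.word_tokenize(sentence))
    return words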