def process_texts(self, path=None, top_n=10):
    """Collect the top TF-IDF words of every text file in a directory.

    Each entry of ``path`` is read, decoded as UTF-8 (undecodable bytes are
    dropped) and wrapped in a ``TextBlob``. A ``TfIdf`` model is built over
    all documents. For every document each word is scored with
    ``tfidf.compute_tfidf``; the score is then stored under a single
    representative of the word's similarity group, namely the greatest
    element (in sort order) of ``self.get_similar(candidates, word)``. The
    ``top_n`` best-scoring representatives of each document are added to
    the result.

    Args:
        path: Directory holding the text files. Defaults to ``data/wiki``
            relative to the current working directory.
        top_n: Number of highest-scoring words kept per document
            (non-negative). Defaults to 10.

    Returns:
        A ``set`` with the selected words of all documents.

    Raises:
        OSError/IOError: If ``path`` cannot be listed or an entry cannot be
            opened for reading (e.g. it is a sub-directory).
        IndexError: If ``self.get_similar`` returns an empty collection.
    """
    if path is None:
        path = os.path.join('data', 'wiki')

    documents = []
    for file_name in os.listdir(path):
        file_path = os.path.join(path, file_name)
        # Read raw bytes and decode explicitly: this avoids platform
        # newline/EOF translation and works on both Python 2 and 3. The
        # context manager closes the file even if reading/decoding fails.
        with open(file_path, 'rb') as f:
            text = f.read().decode('UTF-8', 'ignore')
        documents.append((file_name, TextBlob(text)))

    tfidf = TfIdf(documents)

    relevant_words = set()
    for file_name, document in documents:
        print(file_name)
        scores = {word: tfidf.compute_tfidf(word, document)
                  for word in document.words}

        # The candidate vocabulary is the same for every word of this
        # document, so build it once instead of once per inner iteration.
        candidates = list(scores)

        selected_scores = {}
        for word in scores:
            similars = sorted(self.get_similar(candidates, word))
            # Several words may share the same representative; the one
            # visited last overwrites the earlier score.
            # NOTE(review): assumes get_similar never returns an empty
            # collection (presumably it includes ``word`` itself) - confirm.
            selected_scores[similars[-1]] = scores[word]

        sorted_words = sorted(selected_scores.items(),
                              key=lambda item: item[1], reverse=True)
        relevant_words.update(word for word, _ in sorted_words[:top_n])

    return relevant_words