def test_shouldCheckForConsistencyOfLSIModel(self): import os filepath = os.path.join(os.path.dirname(__file__), "test_data/reuters_rupee_decline/doc1") text = FileReader.read(filepath) processor = TextProcessor() sentences = processor.nltk_sentences(text) tokenised_sentence_map = dict( [(index, processor.stopped_tokenize(sentence)) for index, sentence in enumerate(sentences)]) for i in range(5): print "\n\n************* ITERATION ", i, " *************" lsi_transformation = LSITransformation(tokenised_sentence_map) lsi_transformation.print_transformation()
def best_community(self, community_levels, tokenised_sentences_dict):
    """Return the community subgraph with the highest mean information index.

    Picks the best community level via ``self.dissimilar_sentences``, then
    scores each community subgraph by averaging
    ``TextProcessor.information_index`` over its member sentences and
    returns the highest-scoring subgraph (first one wins on ties, since
    the comparison is strict).

    :param community_levels: candidate community structures, forwarded to
        ``find_best_community_level``.
    :param tokenised_sentences_dict: maps a vertex name to its tokenised
        sentence (the vertex ``name`` attributes index into this dict).
    :return: the igraph subgraph of the best-scoring community.
    """
    best_communities = self.dissimilar_sentences.find_best_community_level(community_levels)
    communities_subgraphs = best_communities.subgraphs()
    best_community_id = 0
    best_community_index = 0.0
    text_processor = TextProcessor()
    # Renamed loop variable from `id` to avoid shadowing the builtin id().
    for community_id, community in enumerate(communities_subgraphs):
        vertices = community.vs["name"]
        sigma_info_index = 0.0
        for vertex in vertices:
            sentence = tokenised_sentences_dict[vertex]
            sigma_info_index += text_processor.information_index(sentence)
        # Mean information index over the community's sentences.
        # NOTE(review): assumes each community has at least one vertex --
        # igraph community subgraphs are non-empty, but confirm upstream.
        sigma_info_index /= float(len(vertices))
        if best_community_index < sigma_info_index:
            best_community_index = sigma_info_index
            best_community_id = community_id
    return communities_subgraphs[best_community_id]
def __init__(self, list_of_tokens):
    """Build the synonym table for every token seen across all inputs.

    :param list_of_tokens: an iterable of token collections; they are
        unioned into a single vocabulary before the synonym lookup.
    """
    # Flatten the per-item token collections into one vocabulary set,
    # then resolve synonyms for the whole vocabulary in a single call.
    vocabulary = Sets.union_all(list_of_tokens)
    self.synonyms = TextProcessor.synonyms_for(vocabulary)