def summarize(self, input_path):
    dataset_preprocessor = Preprocessor()
    dataset_FeatureReducer = FeatureReducer()
    dataset_WeightsHandler = WeightsHandler()

    # Collect every file in the input directory (input_path is expected to end in '/').
    files = [f for f in os.listdir(input_path) if os.path.isfile(input_path + f)]

    # Preprocess the documents and extract their sentences.
    preprocessed_list = dataset_preprocessor.preprocess(files, input_path)
    sentencelist = dataset_preprocessor.extract_sentences(files, input_path)

    # Build the term-frequency, inverse-document-frequency and total-weight
    # dictionaries, then the sentence-term matrix (STM).
    dataset_WeightsHandler.set_preprocessed_list(preprocessed_list)
    dataset_WeightsHandler.set_sentence_list(sentencelist)
    dataset_WeightsHandler.update_totfreq_dict()
    dataset_WeightsHandler.replace_totfreq_dict(
        dataset_FeatureReducer.reduceFeatures(dataset_WeightsHandler.tot_freq_dict()))
    dataset_WeightsHandler.generate_inv_doc_freq_dict(preprocessed_list)
    dataset_WeightsHandler.generate_tot_weight_dict()
    dataset_WeightsHandler.generate_STM()

    # vector_dict[sentence] = weight vector for that sentence
    vector_dict = dataset_WeightsHandler.sentence_weight_dict()
    dataset_FeatureReducer.remove_features_with_zero_weight(vector_dict)

    sentencelist_without_stopwords = dataset_preprocessor.remove_stop_words_from_sentencelist(sentencelist)

    # Pairwise sine similarity between sentence vectors.
    VectorSineRelationExtractor = SineRelationExtractor()
    sine_matrix = VectorSineRelationExtractor.extract_sine_similarity(vector_dict)

    # Synonym-based dissimilarity between sentences.
    synonym_assigner = SynonymAssigner()
    synonym_dict = synonym_assigner.assign_synonyms(sentencelist_without_stopwords)
    SentenceDissimilarityScorer = DissimilarityScorer()
    dissimilarity_matrix = SentenceDissimilarityScorer.assign_dissimilarity_score(
        synonym_dict, sentencelist_without_stopwords)

    # Combine similarity and dissimilarity, then score and rank the sentences.
    final_score_matrix = SentenceDissimilarityScorer.multiply_sine(dissimilarity_matrix, sine_matrix)
    SentenceRanker = NodeRanker()
    scorelist_of_sentences = SentenceRanker.calculate_score_of_each_sentence(final_score_matrix)
    ranked_indices = SentenceRanker.rank_nodes(scorelist_of_sentences)

    # Print the sentences of the summary in ranked order.
    for each_index in ranked_indices:
        print(sentencelist[each_index])
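
# Minimal usage sketch (illustration only): the enclosing class name "Summarizer"
# is an assumption, since the original source does not show it. summarize() takes
# a directory path ending in '/' (files are located with input_path + f) and
# prints the summary sentences in ranked order.
#
#   summarizer = Summarizer()
#   summarizer.summarize('/home/animesh/T-Sum/Data sets/Inception/')
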
from utils.preprocessor import Preprocessor
import os
from sets.size import Size
from sets.intersections import Intersections
from sets.scorer import Scorer
from graphs.node_ranker import NodeRanker
from sets.distributed_ranks import RankDistributor

# Directory containing the documents to summarise (expected to end in '/').
input_path = '/home/animesh/T-Sum/Data sets/Inception/'
files = [f for f in os.listdir(input_path) if os.path.isfile(input_path + f)]

# Extract the sentences and preprocess the words of each sentence.
prep = Preprocessor()
sentence_list = prep.extract_sentences(files, input_path)
preprocessed_words_in_each_sentence = []
for s in sentence_list:
    preprocessed_words_in_each_sentence.append(prep.preprocess_sentence(s))

size = Size()
intersections = Intersections()
scorer = Scorer()
ranker = NodeRanker()
rank_counter_in_0_to_1 = RankDistributor()

# Score each sentence by the overlap of its word set with the other sentences,
# normalise the scores, and distribute the resulting ranks over [0, 1].
size_of_sets = size.calculate_size_of_set(preprocessed_words_in_each_sentence)
number_of_intersections_of_each_sentence = intersections.count_itersections_of_each_set(preprocessed_words_in_each_sentence)
scores = scorer.score_sentences(number_of_intersections_of_each_sentence, size_of_sets)
normalised_scores = scorer.normalise_score(scores)
distributed_ranks = rank_counter_in_0_to_1.distribute_ranks(normalised_scores)
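
# Illustration only (not part of the original script): one way to turn the
# normalised scores into a short extract is to pick the highest-scoring
# sentences and print them in document order. This assumes normalised_scores
# is a list of floats aligned index-for-index with sentence_list; the output
# format of RankDistributor.distribute_ranks is not shown here, so it is left
# untouched above. The cut-off top_n is an arbitrary example value.
top_n = 5
top_indices = sorted(range(len(normalised_scores)),
                     key=lambda i: normalised_scores[i],
                     reverse=True)[:top_n]
for i in sorted(top_indices):
    print(sentence_list[i])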