def load_file_sentences(filepath, filename):
    """
    Load the sentences of a file, lower-cased, and assign each a segment.

    The file at `filepath` is read as a whole, all newline characters are
    removed, and the text is split into sentences with `sent_tokenize`.
    Each sentence is lower-cased and vectorized over a feature space built
    from the sentences of this file; a similarity matrix is then computed
    via `compute_similarity_matrix` using `filename + ".similarities"` as
    its target (presumably a file written to the working directory --
    confirm against that helper).

    filepath -- path of the file to read.
    filename -- label attached to every returned sentence; also used as the
                prefix of the similarity matrix name.

    Returns a list of `(sentence, filename, segment)` tuples. When the file
    has fewer than two sentences every segment is 0; otherwise the segments
    are whatever `cluster_sentences` returns (assumed to be one entry per
    sentence -- confirm).
    """
    # The context manager closes the file even if reading fails.
    with open(filepath, 'r') as f:
        text = f.read()

    # NOTE: newlines are removed, not replaced by spaces, so words separated
    # only by a line break end up joined. Kept as in the original behaviour.
    # str.replace always yields a string, whereas filter() on a string
    # returns a lazy iterator on Python 3.
    text = text.replace('\n', '')

    # Materialise as a list: the sentences are iterated several times and
    # their length is needed below, which a lazy map() cannot provide.
    sentences = [s.lower() for s in sent_tokenize(text)]

    # Create segments by clustering, with cosine similarity as the metric.
    fs = create_feature_space(sentences)
    vectors = [vectorize(fs, sent) for sent in sentences]
    similarity_file = filename + ".similarities"
    compute_similarity_matrix(vectors, cosine_similarity, similarity_file)

    if len(vectors) < 2:
        # Not enough sentences to cluster, so all of them share one segment.
        segments = [0] * len(vectors)
    else:
        # NOTE(review): the third argument is presumably the number of
        # clusters; an earlier comment mentioned 3 segments but 2 is
        # passed -- confirm which is intended.
        segments = cluster_sentences(similarity_file, __cluto_bin, 2)

    # Stitch it all together; list() keeps the result a concrete list on
    # Python 3 as it was on Python 2.
    return list(zip(sentences, [filename] * len(sentences), segments))
def ordering_preprocessing(collection_name):
    """
    Build and return the themes of the collection called `collection_name`.

    The collection's sentences are ranked with `rank_by_tweight`, passed
    through `extract_dates`, vectorized, clustered on a cosine similarity
    matrix, and finally grouped by `make_themes_from_clusters`. The
    resulting themes are what the caller uses to order themes/sentences.
    """
    print("Starting Preprocessing")
    source_dir = __collection_root + collection_name
    topic_words_path = __project_root + 'topicwords/' + collection_name + '.ts'

    print("Loading and ranking sentences")
    ranked = extract_dates(rank_by_tweight(source_dir, topic_words_path))

    # Clustering works on vectors, so pull out the bare sentence text first.
    print("Vectorizing Sentences")
    texts = [
        text
        for ((text, _source, _date, _segment), _weight) in ranked
    ]
    feature_space = create_feature_space(texts)
    sentence_vectors = vectorize_sentence_list(feature_space, texts)

    # Cluster the vectors into themes via the similarity matrix.
    print("Creating Themes")
    matrix_path = './similarity_matrix'
    compute_similarity_matrix(sentence_vectors, cosine_similarity, matrix_path)
    cluster_assignment = cluster_sentences(matrix_path, __cluto_bin, 5)
    return make_themes_from_clusters(ranked, cluster_assignment)