def load_file_sentences(filepath, filename):
    """Load a file's sentences, lower-cased and tagged with segment ids.

    Args:
        filepath: Path of the text file to read.
        filename: Tag attached to every sentence; also used as the prefix
            of the intermediate ".similarities" matrix file written to disk.

    Returns:
        A list of (sentence, filename, segment_id) triples, one per sentence.
    """
    # Read the whole file as one string; the context manager guarantees the
    # handle is closed even if read() raises.
    with open(filepath, 'r') as f:
        text = f.read()
    # Strip the newlines. str.replace is equivalent to the previous
    # filter(lambda x: x != '\n', text) on Python 2, but also works on
    # Python 3, where filter() returns a lazy iterator rather than a string.
    text = text.replace('\n', '')
    # Now use nltk's method to split the text into sentences.
    sentences = sent_tokenize(text)
    # Convert everything to lower case. A real list (not a lazy map object)
    # is required because `sentences` is iterated several times below.
    sentences = [s.lower() for s in sentences]
    # Create segments by clustering; similarity metric shall be cosine.
    fs = create_feature_space(sentences)
    vectors = [vectorize(fs, sent) for sent in sentences]
    compute_similarity_matrix(vectors, cosine_similarity,
                              filename + ".similarities")
    if len(vectors) < 2:
        # There are not enough sentences to cluster, so we'll just use the
        # same segment for all of them. Only happens once in the given
        # project data anyway.
        segments = [0] * len(vectors)
    else:
        segments = cluster_sentences(filename + ".similarities",
                                     __cluto_bin, 2)
    # Stitch it all together; list() so the result is a reusable sequence
    # on Python 3 as well (zip() is lazy there).
    return list(zip(sentences, [filename] * len(sentences), segments))
def ordering_preprocessing(collection_name):
    """Build and return the themes for the named collection.

    The returned themes enable the subsequent ordering of themes/sentences.
    """
    print("Starting Preprocessing")
    coll_path = __collection_root + collection_name
    topics = __project_root + 'topicwords/' + collection_name + '.ts'
    # Rank the collection's sentences by topic weight, then attach dates.
    print("Loading and ranking sentences")
    ranked = rank_by_tweight(coll_path, topics)
    ranked = extract_dates(ranked)
    # Clustering needs a vector representation of every sentence.
    print("Vectorizing Sentences")
    # Each entry is ((sentence, file, date, segment), weight); keep the text.
    plain_sents = [entry[0][0] for entry in ranked]
    feature_space = create_feature_space(plain_sents)
    sent_vectors = vectorize_sentence_list(feature_space, plain_sents)
    # Group the vectorized sentences into themes via clustering.
    print("Creating Themes")
    compute_similarity_matrix(sent_vectors, cosine_similarity, './similarity_matrix')
    cluster_ids = cluster_sentences('./similarity_matrix', __cluto_bin, 5)
    return make_themes_from_clusters(ranked, cluster_ids)