# NOTE(review): this block is a fragment of a larger script that was
# concatenated into this file -- the indented body below has no visible
# `if __name__ == "__main__":` (or def) header, and the loop at the bottom
# trails off into a comment, so the fragment is incomplete as shown.
# Python 2 syntax (bare print statement).
print "Feature extraction + data processing to get top articles"

    # Start of the NYT article date window
    start_date = datetime.date(2013, 4, 8)

    # End of the NYT article date window
    end_date = datetime.date(2015, 7, 5)

    # Number of articles clustered per batch
    step_size = 400

    # Run accumulators; neither is updated in the visible portion --
    # presumably maintained further down in the original script (verify).
    articles_lost = 0
    all_clusters = []

    # Load keyword-matching NYT articles for the window; indexed/sliced
    # below, so the return is assumed to be an ordered sequence.
    data = utils.load_nyt(start_date=start_date.isoformat(),
                          end_date=end_date.isoformat(),
                          keywords="china")

    # Timestamped dump file, opened for append. NOTE(review): the handle is
    # never closed in the visible portion -- confirm it is closed (or wrap
    # in `with`) in the missing remainder of the script.
    output_file = "top_articles_new/dump_to_file_" + utils.get_time()
    f = open(output_file, "a")

    # Cluster the corpus in fixed-size batches of step_size articles.
    for i in range(0, len(data), step_size):
        # with utils.stdout_redirect(f):

        # cluster = analyse.BisectingKmeans(data[i:i+step_size])
        # BigClam community detection over this batch (130 communities).
        cluster = analyse.BigClamArticle(data[i:i + step_size], coms=130)

        cluster.compute()
        cluster.find_computed_cluster_metrics()

        # for x in cluster.computed_clusters:
__author__ = 'Mark Lee'

import utils

if __name__ == '__main__':
    # Parenthesized print is valid under both Python 2 and Python 3
    # (the original bare print statement is a SyntaxError on Python 3).
    print("Data cleaning")
    # utils.log_to_file(__file__)

    # Enumerate the raw data files shipped alongside the repository.
    file_list = utils.FileList("../data")
    # Optional inspection / CSV-conversion steps, toggled by hand:
    # file_list.display_all()
    # file_list.display_csv()
    # file_list.csv_combine_nyt_full()
    # file_list.csv_small_nyt()
    # file_list.csv_combine_guardian()

    # Do not call this function unless updating DB -- it rebuilds the
    # full NYT SQLite database from the raw files.
    file_list.sqlite_build_nyt_full()

    # Test function for the DB
    # sqlite_test()  # Test function for sql

    # Smoke-check the rebuilt DB by loading a sample query.
    # NOTE(review): arguments are positional here ("World" section first)
    # while other scripts in this file call load_nyt with keyword args --
    # confirm the positional order matches load_nyt's signature.
    data = utils.load_nyt("World", "2014-01", "2015-07",
                          "Vladimir Putin Russia Ebola")
__author__ = "intern"

from snap import *
import utils
import snap_cluster_lib
from timeit import default_timer as timer
import analyse

# Clustering hyper-parameters.
step_size = 20  # articles per word-clustering batch
steps = 5       # number of batches taken from the front of the corpus
opt_com = 10    # target community count passed to BigClam

if __name__ == "__main__":
    start_timer = timer()
    # Load keyword-matching NYT articles for the window; sliced below, so
    # the return is assumed to be an ordered sequence.
    data = utils.load_nyt(start_date="2013-07-01", end_date="2016-01-01",
                          keywords="Israel")

    # Form the word clusters, one BigClam run per full batch of step_size
    # articles. A trailing partial batch (fewer than step_size articles)
    # is deliberately never processed, matching the accumulator logic.
    tmp = []
    clusters = []
    for article in data[:step_size * steps]:
        tmp.append(article)
        if len(tmp) == step_size:
            cluster = snap_cluster_lib.BigClamWordCluster(tmp)
            cluster.find_community(opt_com=opt_com)
            cluster.print_community()
            # Collect this batch's computed communities.
            clusters.extend(cluster.computed)
            tmp = []

    # Cluster the word clusters themselves. Parenthesized print is valid
    # under both Python 2 and Python 3 (the original bare print statement
    # is a SyntaxError on Python 3; output is identical on Python 2).
    print([str(x) for x in clusters])
    line_cluster = snap_cluster_lib.BigClamLineCluster(clusters)