print "Feature extraction + data processing to get top articles" # Start date start_date = datetime.date(2013, 4, 8) # End date end_date = datetime.date(2015, 7, 5) # Step size step_size = 400 articles_lost = 0 all_clusters = [] data = utils.load_nyt(start_date=start_date.isoformat(), end_date=end_date.isoformat(), keywords="china") output_file = "top_articles_new/dump_to_file_" + utils.get_time() f = open(output_file, "a") for i in range(0, len(data), step_size): # with utils.stdout_redirect(f): # cluster = analyse.BisectingKmeans(data[i:i+step_size]) cluster = analyse.BigClamArticle(data[i:i + step_size], coms=130) cluster.compute() cluster.find_computed_cluster_metrics() # for x in cluster.computed_clusters:
__author__ = 'Mark Lee'

import utils

if __name__ == '__main__':
    print "Data cleaning"
    # utils.log_to_file(__file__)

    file_list = utils.FileList("../data")
    # file_list.display_all()
    # file_list.display_csv()
    # file_list.csv_combine_nyt_full()
    # file_list.csv_small_nyt()
    # file_list.csv_combine_guardian()

    # Do not call this function unless updating DB
    file_list.sqlite_build_nyt_full()

    # Test function for the DB
    # sqlite_test()
    # Test function for sql

    # Loading from the DB
    data = utils.load_nyt("World", "2014-01", "2015-07", "Vladimir Putin Russia Ebola")
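The `utils` helpers are defined elsewhere in the repository; as a rough illustration of what the final load step does, here is a sketch using the standard-library `sqlite3` module. The database path, table, and column names are assumptions for illustration, not the project's actual schema:

import sqlite3

def load_nyt_sketch(db_path, section, start_month, end_month, keywords):
    # Hypothetical schema: the real one is built by utils.sqlite_build_nyt_full()
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            "SELECT headline, body FROM articles "
            "WHERE section = ? AND pub_date BETWEEN ? AND ?",
            (section, start_month, end_month))
        rows = cur.fetchall()
    finally:
        conn.close()
    # Crude keyword match over the article body
    terms = keywords.lower().split()
    return [row for row in rows if any(t in row[1].lower() for t in terms)]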
__author__ = "intern" from snap import * import utils import snap_cluster_lib from timeit import default_timer as timer import analyse step_size = 20 steps = 5 opt_com = 10 if __name__ == "__main__": start_timer = timer() data = utils.load_nyt(start_date="2013-07-01", end_date="2016-01-01", keywords="Israel") # Form the word clusters tmp = [] clusters = [] for article in data[:step_size * steps]: tmp.append(article) if len(tmp) == step_size: cluster = snap_cluster_lib.BigClamWordCluster(tmp) cluster.find_community(opt_com=opt_com) cluster.print_community() for c in cluster.computed: clusters.append(c) tmp = [] # Cluster the word clusters and words print [str(x) for x in clusters] line_cluster = snap_cluster_lib.BigClamLineCluster(clusters)