# Example 1
0
def start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True, leading_weight=1, stoplist_file="stop_words", num_threads=1):
    """Split the articles into chunks and build one TermCounter per chunk.

    The chunk size is ``ceil(len(article_list) / num_threads)``, so (assuming
    ``chunks.chunks(seq, size)`` returns consecutive pieces of ``size`` items
    -- confirm against the chunks module) there are never more chunks than
    ``num_threads`` and the chunk size is never zero.

    The counters are only constructed here; this function calls no method on
    them.

    Args:
        article_list: Sized collection of articles to distribute.
        store, title_weight, print_steps, leading_weight, stoplist_file:
            Passed positionally, in this order, to every
            ``termcounter.TermCounter`` after its chunk of articles.
        num_threads: Upper bound on the number of chunks; must be >= 1.

    Returns:
        A list with one ``TermCounter`` per chunk; an empty list when
        ``article_list`` is empty.

    Raises:
        ValueError: If ``num_threads`` is less than 1.
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1, got %r" % (num_threads,))

    num_articles = len(article_list)
    if num_articles == 0:
        # Nothing to distribute; avoids asking the chunker for zero-sized chunks.
        return []

    # Ceiling division: a floored size would be 0 when there are fewer
    # articles than threads, and could produce more chunks than num_threads.
    chunk_sizes = int((num_articles + num_threads - 1) // num_threads)
    article_lists = chunks.chunks(article_list, chunk_sizes)

    # Single parenthesised argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("Counting " + str(len(article_lists)) + " article lists of size " + str(chunk_sizes) + " using " + str(num_threads) + " threads")

    return [
        termcounter.TermCounter(a_list, store, title_weight, print_steps, leading_weight, stoplist_file)
        for a_list in article_lists
    ]
# Example 2
0
def get_all_clusters(start_date=None, end_date=None, sql_conds=None, inv_index=None, num_threads=1):
    """Load every cluster matching the given conditions.

    Rows are selected from the ``clusterswitharticles`` table, split into
    chunks, and each chunk is handed to a ``clusterloader.ClusterLoader``.
    All loaders are started, progress is printed every 5 seconds while
    other threads are still alive, and the loaders' results are merged.

    Args:
        start_date: If truthy, only rows with ``earliest >= start_date``.
        end_date: If truthy, only rows with ``latest <= end_date``.
        sql_conds: Optional raw SQL condition appended to the WHERE clause.
        inv_index: Passed unchanged to every ``ClusterLoader``.
        num_threads: Upper bound on the number of chunks/loaders; must be
            >= 1 (assuming ``chunks.chunks(seq, size)`` returns consecutive
            pieces of ``size`` items -- confirm against the chunks module).

    Returns:
        A dict built by merging each loader's ``get_clusters()`` result
        (presumably keyed by cluster id -- confirm against ClusterLoader);
        an empty dict when the query returns no rows.

    Raises:
        ValueError: If ``num_threads`` is less than 1.
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1, got %r" % (num_threads,))

    # NOTE(security): the dates and sql_conds are interpolated straight into
    # the SQL text, so callers must only pass trusted values.  Driver-side
    # parameters are not used because sql_conds is raw SQL that may itself
    # contain '%' characters.
    query = "SELECT * FROM clusterswitharticles"
    linker = "WHERE"
    if start_date:
        query += " %s `earliest` >= '%s'" % (linker, str(start_date))
        linker = "AND"
    if end_date:
        # The upper bound must use end_date (start_date was used here before).
        query += " %s `latest` <= '%s'" % (linker, str(end_date))
        linker = "AND"
    if sql_conds:
        query += " %s %s" % (linker, sql_conds)

    # Close the cursor and the connection even if the query fails.
    db = database.connect_to_database()
    try:
        cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        try:
            cur.execute(query)
            results = cur.fetchall()
        finally:
            cur.close()
    finally:
        db.close()

    num_rows = len(results)
    if num_rows == 0:
        # No rows: nothing to load, and a zero chunk size must not reach the chunker.
        return dict()

    # Ceiling division: a floored size would be 0 when there are fewer rows
    # than threads, and could start more loaders than num_threads.
    chunk_sizes = int((num_rows + num_threads - 1) // num_threads)
    cluster_lists = chunks.chunks(results, chunk_sizes)
    print("Loading " + str(len(cluster_lists)) + " cluster lists of size " + str(chunk_sizes) + " using " + str(num_threads) + " threads")

    cluster_loaders = [clusterloader.ClusterLoader(c_list, inv_index) for c_list in cluster_lists]
    for loader in cluster_loaders:
        loader.start()

    # NOTE(review): this waits for every non-main thread in the process, not
    # only the loaders started above; an unrelated long-lived thread would
    # keep this loop running.
    while threading.activeCount() > 1:
        header = "Waiting for " + str(threading.activeCount() - 1) + " threads to finish:\n\t"
        progress = [
            loader.getName() + ": cluster " + loader.get_current_cluster()
            for loader in cluster_loaders
        ]
        # One status line per poll: the loader entries separated by spaces,
        # terminated by " .".
        print(header + " ".join(progress + ["."]))
        time.sleep(5)

    clusters = dict()
    for loader in cluster_loaders:
        clusters.update(loader.get_clusters())
    return clusters