def start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True,
                                leading_weight=1, stoplist_file="stop_words", num_threads=1):
    """
    Split article_list into chunks and build one TermCounter per chunk.

    article_list  -- sized collection of articles, handed to chunks.chunks
    store, title_weight, print_steps, leading_weight, stoplist_file
                  -- passed through unchanged to every termcounter.TermCounter
    num_threads   -- used only to derive the chunk size
                     (len(article_list) / num_threads, rounded down, minimum 1)

    Returns a list with one termcounter.TermCounter per chunk, in chunk order.
    The counters are only constructed here; no method is called on them.
    """
    # Clamp to 1: with fewer articles than threads the rounded-down size would
    # be 0, which is presumably not a usable chunk size for chunks.chunks
    # (TODO confirm against that helper).
    chunk_sizes = max(1, int(len(article_list) // num_threads))
    article_lists = chunks.chunks(article_list, chunk_sizes)

    # Single-argument print(...) behaves the same under the Python 2 print
    # statement used by this code base.
    print("Counting " + str(len(article_lists)) + " article lists of size "
          + str(chunk_sizes) + " using " + str(num_threads) + " threads")

    return [
        termcounter.TermCounter(a_list, store, title_weight, print_steps,
                                leading_weight, stoplist_file)
        for a_list in article_lists
    ]
def get_all_clusters(start_date=None, end_date=None, sql_conds=None, inv_index=None, num_threads=1):
    """
    Gets all the clusters given some conditions

    Rows are selected from `clusterswitharticles`, split into chunks, and each
    chunk is handed to a clusterloader.ClusterLoader which is then started.
    The call blocks (polling every 5 seconds and printing progress) until only
    one thread is active, then merges every loader's get_clusters() result.

    start_date  -- if truthy, keep only rows with `earliest` >= this value
    end_date    -- if truthy, keep only rows with `latest` <= this value
    sql_conds   -- if truthy, raw SQL condition appended verbatim
    inv_index   -- passed through to every ClusterLoader
    num_threads -- used only to derive the chunk size
                   (number of rows / num_threads, rounded down, minimum 1)

    Returns a dict built by update()-ing with each loader's get_clusters().
    """
    # NOTE(review): the values below are interpolated straight into the SQL
    # text, so callers must only pass trusted dates / conditions.
    conditions = []
    if start_date:
        conditions.append("`earliest` >= '%s'" % str(start_date))
    if end_date:
        # The upper bound must be built from end_date, not start_date.
        conditions.append("`latest` <= '%s'" % str(end_date))
    if sql_conds:
        conditions.append("%s" % sql_conds)

    query = "SELECT * FROM clusterswitharticles"
    if conditions:
        query += " WHERE " + " AND ".join(conditions)

    # Release the cursor and the connection even if the query fails.
    db = database.connect_to_database()
    try:
        cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        try:
            cur.execute(query)
            results = cur.fetchall()
        finally:
            cur.close()
    finally:
        db.close()

    # Clamp to 1: with fewer rows than threads (or no rows) the rounded-down
    # size would be 0, which is presumably not a usable chunk size for
    # chunks.chunks (TODO confirm against that helper).
    chunk_sizes = max(1, int(len(results) // num_threads))
    cluster_lists = chunks.chunks(results, chunk_sizes)

    # Single-argument print(...) behaves the same under the Python 2 print
    # statement used by this code base.
    print("Loading " + str(len(cluster_lists)) + " cluster lists of size "
          + str(chunk_sizes) + " using " + str(num_threads) + " threads")

    cluster_loaders = [clusterloader.ClusterLoader(c_list, inv_index)
                       for c_list in cluster_lists]
    for loader in cluster_loaders:
        loader.start()

    # NOTE(review): this waits until the calling thread is the only active
    # thread in the whole process, not just until these loaders are done;
    # any unrelated long-lived thread would keep this loop spinning.
    while threading.activeCount() > 1:
        statuses = [loader.getName() + ": cluster " + loader.get_current_cluster()
                    for loader in cluster_loaders]
        # One line per poll: header, then tab-indented loader statuses
        # separated by single spaces and terminated by " .".
        print("Waiting for " + str(threading.activeCount() - 1)
              + " threads to finish:\n\t" + " ".join(statuses + ["."]))
        time.sleep(5)

    clusters = dict()
    for loader in cluster_loaders:
        clusters.update(loader.get_clusters())
    return clusters