Example #1
0
def db_load_models(articles):
    models = dict()
    inv_index = InvertedIndex()
    num_articles = len(articles)
    db = database.connect_to_database()
    list_of_ids = ",".join([str(article.id) for article in articles])
    query = "SELECT articleid,term,tf,count FROM articleswithterms WHERE articleid IN (%s)" % (list_of_ids)
    cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
    num_results = cur.execute(query)
    rows = cur.fetchall()
    model_values = dict()
    for row in rows:
        if model_values.has_key(row['articleid']):
            model_values[row['articleid']].append(row)
        else:
            model_values[row['articleid']] = [row]
    for article, index in zip(articles, xrange(num_articles)):
        if article.has_been_counted:
            print "Loading article " + str(index + 1) + "/" + str(num_articles)
            new_model = ArticleModel(article)
            new_model.from_db_values(model_values[article.id])
            all_terms = new_model.terms.all_terms()
            inv_index.add_term_occurences(all_terms, article.id)
            models[article.id] = new_model
    cur.close()
    db.close()
    return models, inv_index
Example #2
0
def get_all_training_clusters():
    db = connect_to_database()
    c = db.cursor()
    query = "SELECT * FROM training_clusters"
    c.execute(query)
    result = c.fetchall()
    return [TrainingCluster(tc[0],tc[1]) for tc in result]
Example #3
0
 def db_save(self,thresh,t_weight,l_weight,cl_type,cl_method,timeelapsed):
     db = database.connect_to_database()
     cur = db.cursor()
     query = "INSERT INTO clusterresults (`threshold`,`titleweight`, \
                                          `clustermethod`,`leadingweight`, \
                                          `clustertype`,`truepos`, \
                                          `trueneg`,`falsepos`, \
                                          `falseneg`,`fmeasure`, \
                                          `purity`,`nmi`,`timeelapsed`,\
                                          `rand`) \
                                          VALUES(%.3f,%d,\
                                          '%s',%d,\
                                          '%s',%d,\
                                          %d,%d,\
                                          %d,%.4f,\
                                          %.4f,%.4f,%.4f,\
                                          %.4f)" % \
                                          (thresh,t_weight,cl_method,l_weight,cl_type,\
                                           self.totaltruepos,self.totaltrueneg,\
                                           self.totalfalsepos,self.totalfalseneg,\
                                           self.fmeasure,self.purity_measure,self.nmi_val, \
                                           timeelapsed,self.rand)
     print query
     cur.execute(query)
     db.commit()
     cur.close()
Example #4
0
	def db_load_article(self,id):
		db = connect_to_database()
		cur = db.cursor(cursorclass = MySQLdb.cursors.DictCursor)
		query = "SELECT * FROM articles WHERE `id`='" + str(id) + "'"
		cur.execute(query)
		result = cur.fetchone()
		cur.close()
		return result
Example #5
0
 def proportion_of_topics_covered(self,source,start_date=None,end_date=None,db=None):
     if not db:
         db = database.connect_to_database()
     total_topics = len(self.all_clusters)
     articles_from_source = source.db_load_articles(start_date, end_date)
     clusters_of_source = set([article.cluster_id for article in source.article_list])
     num_clusters_by_source = len(clusters_of_source)
     return float(num_clusters_by_source) / float(total_topics)
Example #6
0
def get_all_sources():
    db = database.connect_to_database()
    c = db.cursor()
    source_query = "SELECT id FROM sources"
    c.execute(source_query)
    results = c.fetchall()
    sources = [NewsSource(result[0],db) for result in results]
    return sources
Example #7
0
	def db_set_classification(self,cluster_id):
		self.classification = cluster_id
		db = connect_to_database()
		c = db.cursor()
		query = "UPDATE articles SET trainingcluster = " + str(cluster_id) + " WHERE articles.id = " + str(self.id)
		print "executing " + query
		c.execute(query)
		if db.affected_rows() != 0:
			return True
		else:
			return False
Example #8
0
 def db_create_row(self, db=None):
     if not db:
         db = database.connect_to_database()
     description = str(database.db_escape(self.top_article[0].article.title))
     if not description or description == "NULL":
         description = "'None'"
     query = "INSERT INTO clusters (`description`,`toparticleid`) VALUES(%s,%d)" % (description,self.top_article[0].article.id)
     cur = db.cursor()
     cur.execute(query)
     self.id = cur.lastrowid
     db.commit()
     cur.close()
Example #9
0
 def db_load_counts(self, article_id=None, db=None):
     if not self.article and article_id:
         self.article = Article(id=article_id)
         self.terms = TermList()
     if self.article.has_been_counted:
         if not db:
             db = database.connect_to_database()
         cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
         query = "SELECT term,tf,count FROM articleswithterms WHERE articleid = %d" % (self.article.id)
         cur.execute(query)
         rows = cur.fetchall()
         self.terms.set_terms([(row['term'], (float(row['tf']), int(row['count']), None)) for row in rows])
         return self.terms.all_terms()
Example #10
0
def get_all_clusters(start_date=None, end_date=None, sql_conds=None, inv_index=None, num_threads=1):
    """ Gets all the clusters given some conditions """
    db = database.connect_to_database()
    query = "SELECT * FROM clusterswitharticles"
    cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    linker = "WHERE"
    if start_date:
        query += " %s `earliest` >= '%s'" % (linker, str(start_date))
        linker = "AND"
    if end_date:
        query += " %s `latest` <= '%s'" % (linker, str(start_date))
        linker = "AND"
    if sql_conds:
        query += " %s %s" % (linker, sql_conds)

    cur.execute(query)
    results = cur.fetchall()
    cur.close()
    db.close()

    chunk_sizes = int(float(len(results)) / float(num_threads))
    cluster_lists = chunks.chunks(results, chunk_sizes)
    print "Loading " + str(len(cluster_lists)) + " cluster lists of size " + str(chunk_sizes) + " using " + str(
        num_threads
    ) + " threads"
    cluster_loaders = list()
    for c_list in cluster_lists:
        cluster_loaders.append(clusterloader.ClusterLoader(c_list, inv_index))
    for loader in cluster_loaders:
        loader.start()
    # print "Getting as far as starting the threads"
    while threading.activeCount() > 1:
        print "Waiting for " + str(threading.activeCount() - 1) + " threads to finish:\n\t",
        for loader in cluster_loaders:
            print loader.getName() + ": cluster " + loader.get_current_cluster(),
        print "."
        time.sleep(5)
    clusters = dict()
    for loader in cluster_loaders:
        clusters.update(loader.get_clusters())
        loader = None
    cluster_loaders = None
    """
    num_clusters = len(results)
    for row, index in zip(results, xrange(num_clusters)):
        print "Loading cluster " + str(index) + "/" + str(num_clusters)
        new_cl = ClusterModel()
        new_cl.from_db_values(row,inv_index=inv_index)
        clusters[new_cl.id] = new_cl
    """
    return clusters
Example #11
0
 def __init__(self,id=None,description=None):
     if id:
         if description:
             self.id = id
             self.description = description
         else:
             self.db = connect_to_database()
             # must be a cluster already on the database
             c = self.db.cursor()
             query = "SELECT * FROM training_clusters WHERE `id` = '" + str(id) + "'"
             c.execute(query)
             result = c.fetchone()
             if result:
                 self.id = id
                 self.description = result[1]
     elif description:
         self.db = connect_to_database()
         # obviously doesn't exist yet, let's make it
         self.description = description
         c = self.db.cursor()
         query = "INSERT INTO training_clusters(`description`) VALUES ('" + str(description) + "')"
         c.execute(query)
         self.id = c.lastrowid
Example #12
0
def db_get_all_articles(sql_conditions=None):
	# we need a DictCursor for this one
	db = connect_to_database()
	cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
	query = "SELECT * FROM articles"

	if sql_conditions:
		# so basically sql_conditions needs to be in SQL format - none of that easy-to-use stuff
		# it just complicates things.
		query += " WHERE " + sql_conditions
	cur.execute(query)
	results = cur.fetchall()
	article_list = [Article(db_values = a) for a in results]
        cur.close()
	return article_list
Example #13
0
 def db_load(self, id, db=None):
     close_db = False
     if not db:
         close_db = True
         db = database.connect_to_database()
     query = "SELECT * FROM clusters WHERE `id` = %d" % (id)
     print query
     cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
     cur.execute(query)
     result = cur.fetchone()
     print result
     self.from_db_values(result)
     cur.close()
     self.db_load_articles(db)
     if close_db:
         db.close()
Example #14
0
    def db_save(self, db=None):
        if not db:
            db = database.connect_to_database()
        cur = db.cursor()
        orig_term_inserts = list()
        article_term_inserts = list()
        for term in self.terms:
            tf = self.terms[term][0]
            count = int(self.terms[term][1])
            orig_terms = self.terms[term][2]
            # Check if the stem exists already
            query = "SELECT id FROM unigram_stems WHERE `term` = '" + term + "'"
            cur.execute(query)
            # if the term already exists we update, otherwise insert it
            if db.affected_rows() == 0:
                query = "INSERT INTO unigram_stems (`term`,`totalcount`) VALUES('%s',%d)" % (term, count)
                cur.execute(query)
                term_id = int(cur.lastrowid)
            else:
                row = cur.fetchone()
                term_id = int(row[0])
                query = "UPDATE unigram_stems SET totalcount = totalcount + %d WHERE `id` = %d" % (count, term_id)
                cur.execute(query)
            # Now check if the original terms exist
            for orig_term in orig_terms:
                query = "SELECT * FROM orig_terms WHERE `term` = '%s'" % (orig_term)
                cur.execute(query)
                if db.affected_rows() == 0:
                    orig_term_inserts.append("(%d,'%s')" % (term_id, orig_term))
                    # query = "INSERT INTO orig_terms (`stemid`,`term`) VALUES(%d,'%s')" % (term_id, orig_term)
                    cur.execute(query)
            # Now make the link table row
            article_term_inserts.append("(%d,%d,%d,%.3f)" % (term_id, self.article.id, count, tf))
            # query = "INSERT INTO article_terms (`stemid`,`articleid`,`count`,`tf`) VALUES(%d,%d,%d,%.3f) ON DUPLICATE KEY UPDATE `count` = VALUES(count), `tf` = VALUES(tf)" % (term_id, self.article.id, count, tf)
            # cur.execute(query)

        if len(orig_term_inserts) > 0:
            orig_term_query = "INSERT IGNORE INTO orig_terms (`stemid`,`term`) VALUES %s" % (",".join(orig_term_inserts))
            cur.execute(orig_term_query)
        if len(article_term_inserts) > 0:
            article_term_query = "INSERT INTO article_terms (`stemid`,`articleid`,`count`,`tf`) VALUES %s ON DUPLICATE KEY UPDATE `count` = VALUES(count), `tf` = VALUES(tf)" % (",".join(article_term_inserts))
            cur.execute(article_term_query)
        query = "UPDATE articles SET `counted` = 1 WHERE `id` = %d" % (self.article.id)
        cur.execute(query)
        cur.close()
        db.commit()
Example #15
0
    def db_load_top_article(self,toparticleid,inv_index=None):
        """ Loads the top article of the cluster from the DB """
        # print str(threading.currentThread().getName()) + ": Loading top article..."

        if not self.inv_index:
            self.inv_index = inv_index

        db = database.connect_to_database()
        cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        article_query = "SELECT * FROM articleswithterms WHERE `clusterid` = %d AND `articleid` = %d" % (int(self.id), int(toparticleid))
        # print "Query is " + article_query
        cur.execute(article_query)
        db_values = cur.fetchall()
        article_model = articleunigram.ArticleModel()
        article_model.from_db_values(db_values)
        # print "TOP ARTICLE: " + str(article_model.article)
        comp = cosinecomparer.CosineComparer(self.inv_index)
        self.top_article = (article_model, self.similarity(comp, article_model, self.GRP_AVG))
Example #16
0
    def db_load_articles(self, top_article_id, db=None):
        """ Loads all the ArticleModel instances relating to this cluster """
        article_list = article.db_get_all_articles("`clusterid` = %d" % self.id)
        models,inv_index = articleunigram.db_load_models(article_list)
        self.articles = models.values()

        close_db = False
        if not db:
            close_db = True
            db = database.connect_to_database()
        cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        terms_query = "SELECT `term`,`tf`,`count` FROM clusterswithterms WHERE `clusterid` = %d" % (self.id)
        # print str(threading.currentThread().getName()) + ": Going to load cluster model term from DB with query " + terms_query
        num_rows = cur.execute(terms_query)
        db_rows = cur.fetchall()
        # print str(threading.currentThread().getName()) + ": Loading terms for cluster " + str(self.id) + ": " + str(num_rows) + " rows..."
        self.model = articleunigram.ArticleModel()
        self.model.from_db_values(db_rows)
        cur.close()
        if close_db:
            db.close()
Example #17
0
def count_terms_and_store(articles, store=True, title_weight=19, print_steps=False, leading_weight=1, stoplist_file="../stop_words"):
    if store:
        db = database.connect_to_database()
    models = dict()
    inv_index = InvertedIndex()
    num_articles = len(articles)
    for art, index in zip(articles, xrange(len(articles))):
        if print_steps:
            print "Counting terms of article " + str(index + 1) + "/" + str(num_articles)
        model = ArticleModel(art, title_weight, leading_weight, stoplist_file=stoplist_file)
        for term in model.count_terms():
            inv_index.add_term_ocurrence(term, model.article.id)
        if store:
            model.db_save(db)
        models[art.id] = model
    """ This isn't needed anymore
    total_counts = sum([model.total_term_counts for model in models.values()])
    if store:
        cur = db.cursor()
        query = "UPDATE terms_global SET totaltermcounts = totaltermcounts + %d WHERE id = 1" % (total_counts)
        cur.execute(query)
    """
    return models, inv_index
Example #18
0
    def db_save(self, db=None, suppress_save = False):
        """
        Makes a number of queries.
        
        1. First of all, sets the description and top article ID of the cluster.
        2. Updates all the articles to have this cluster ID.
        3. Update the cluster terms by looking for the terms in the articles
           and making an average of the counts and frequencies.
        Then,
        """
        # First make sure that saving is required. If no articles have been added to this cluster,
        # then it does not need to be saved.
        if not self.edited:
            return True,"Not Edited"
        try:
            if not db:
                db = database.connect_to_database()
            cur = db.cursor()

            if self.id:
                cluster_up_query = "UPDATE clusters SET `description` = %s,`toparticleid` = %d WHERE `id` = %d" % (database.db_escape(self.top_article[0].article.title),self.top_article[0].article.id, self.id)
            else:
                self.db_create_row(db)

            # now we update the articles with their clusters
            # queries = [str(a_mod.get_cluster_update_query()) for a_mod in self.articles]
            # article_up_query = ";".join(queries)
            article_up_query = "UPDATE articles SET clusterid = %d WHERE id IN (%s)" % (self.id,",".join([str(a_mod.article.id) for a_mod in self.articles]))
            print "Running query " + article_up_query + " to update the articles in cluster " + str(self.id)

            success = True
            for term in self.model.terms:
                message = "Success!"
                # first get the stem id of the term
                query = "SELECT `id` FROM unigram_stems WHERE `term` = %s" % (database.db_escape(term))
                cur.execute(query)
                result = cur.fetchone()
                term_id = result[0]
                tf = self.model.terms[term][0]
                count = int(self.model.terms[term][1])
                # first check for the existence of the link
                #query = "SELECT `clusterid` FROM cluster_terms WHERE `stemid` = %d AND clusterid = %d" % (term_id,self.id)
                #print query
                #cur.execute(query)
                #result = cur.fetchone()
                #if not result:
                    # Make the link table row
                query = "INSERT INTO cluster_terms (`stemid`,`clusterid`,`count`,`tf`) VALUES(%d,%d,%d,%.3f) ON DUPLICATE KEY UPDATE `count` = VALUES(`count`),`tf` = VALUES(`tf`)" % (term_id, self.id, count, tf)
                # print query
                #else:
                #    query = "UPDATE cluster_terms SET `count` = %d, `tf` = %.3f WHERE `stemid` = %d AND %clusterid = %d" % (count,tf,term_id,self.id)
                cur.execute(query)
            cur.execute(article_up_query)
            cur.close()
            cur = db.cursor()

            if cluster_up_query:
                cur.execute(cluster_up_query)
                cur.close()
        except MySQLdb.Error, e:
            message = "Error %d: %s" % (e.args[0], e.args[1])
            success = False
__author__="Dan"
__date__ ="$16-Mar-2010 21:40:24$"

if __name__ == "__main__":
    todays_date = datetime.datetime.today()
    two_week_difference = datetime.timedelta(days=-14)
    week_difference = datetime.timedelta(days=-7)
    two_day_difference = datetime.timedelta(days=-3)
    week_ago_date_mysql = (todays_date + week_difference).strftime("%Y-%m-%d")
    days_ago_date_mysql = (todays_date + two_day_difference).strftime("%Y-%m-%d")
    
    article_list = article.db_get_all_articles("NOT isnull(clusterid) AND EXISTS (SELECT articleid FROM articleswithterms WHERE articleid = articles.id) AND `datepublished` >= '%(weekago)s'" % {"weekago" : week_ago_date_mysql})
    models,inv_index = articleunigram.db_load_models(article_list)

    mergers_query = "SELECT * FROM proposedmergers WHERE `approved` = 1"
    db = database.connect_to_database()
    cur = db.cursor(cursorclass = MySQLdb.cursors.DictCursor)
    clusterer = ClusterMaker(inv_index = inv_index)
    cur.execute(mergers_query)
    result = cur.fetchall()
    proposed_mergers = dict()
    for row in result:
        clusters_query = "SELECT `clusterid` FROM proposedmergers_clusters WHERE `mergerid` = %d" % (row['id'])
        cur.execute(clusters_query)
        clusters = cur.fetchall()
        proposed_mergers[row['id']] = [clustermodel.ClusterModel(inv_index = inv_index,id = cl['clusterid'],load_full_article = True) for cl in clusters]

    for p in proposed_mergers.items():
        print str(p[0]) + ": merging " + ",".join([str(cl) for cl in p[1]]) + "..."
        new_cluster = p[1][0]
        clusters_to_merge = p[1][1:]
Example #20
0
 def db_save_cluster(self):
     db = database.connect_to_database()
     cur = db.cursor()
     cur.execute(self.get_cluster_update_query())