Exemple #1
0
 def web_update_article_text(self):
     '''
     Re-download the text of every article in self.article_list and write
     it back to the `originaltext` column of the `articles` table.

     To be used when the article text needs to be re-downloaded for one
     reason or another.

     Articles whose escaped text comes back as None, "None" or "NULL" are
     reported as broken and are not updated. UPDATE statements are executed
     in batches of five; whatever is left over after the last article is
     executed as a final, smaller batch. The transaction is committed once,
     after the cursor has been closed. If anything raises, the cursor is
     still closed, nothing is committed and the exception propagates.
     '''
     cursor = self.db.cursor()
     try:
         if self.article_list:
             # Only measure the list once we know it is usable; calling
             # len() on a missing (None) list would raise before the
             # "not compiled" message below could be printed.
             num_articles = len(self.article_list)
             queries = []
             for current_article, article in enumerate(self.article_list, 1):
                 article.download_article_text()
                 article_text = database.db_escape(article.article_text)
                 if article_text is None or article_text in ("None", "NULL"):
                     print("Article broken.")
                 else:
                     queries.append("UPDATE `articles` SET `originaltext` = \"" + article_text + "\" WHERE `id` = " + str(article.id))
                 print("Source " + str(self.website_name) + " processed " + str(current_article) + "/" + str(num_articles))
                 if len(queries) >= 5:
                     for query in queries:
                         cursor.execute(query)
                     queries = []
             # Flush the final partial batch (fewer than five queries);
             # without this the last few articles would never be updated.
             for query in queries:
                 cursor.execute(query)
         else:
             print("Article list has not been compiled.")
     finally:
         cursor.close()
     self.db.commit()
Exemple #2
0
 def db_create_row(self, db=None):
     """
     Insert a new row for this cluster into the `clusters` table.

     The row's description is the escaped title of the first entry in
     self.top_article and its `toparticleid` is that article's id. When the
     escaped title is empty or "NULL", the literal 'None' is stored instead.
     On success self.id is set to the id of the inserted row and the
     transaction is committed.

     db -- open database connection; when falsy, a new one is obtained
           from database.connect_to_database().
     """
     if not db:
         db = database.connect_to_database()
     top = self.top_article[0].article
     # NOTE(review): the value is interpolated without surrounding quotes, so
     # db_escape presumably returns an already-quoted SQL literal (the
     # fallback below carries its own quotes) -- confirm against db_escape.
     description = str(database.db_escape(top.title))
     if not description or description == "NULL":
         description = "'None'"
     query = "INSERT INTO clusters (`description`,`toparticleid`) VALUES(%s,%d)" % (description, top.id)
     cur = db.cursor()
     try:
         cur.execute(query)
         self.id = cur.lastrowid
         db.commit()
     finally:
         # Release the cursor even when the insert or the commit fails.
         cur.close()
Exemple #3
0
	def get_article_values(self):
		"""
		Build the comma-separated list of SQL values describing this article.

		The values appear in this order: guid, original address, source id,
		title, author, article text, publication date (passed through
		format_date) and retrieval date. Every value except the publication
		date goes through db_escape.

		Returns None when the escaped article text is None, "None" or
		"NULL", i.e. when there is no usable text to store.
		"""
		escaped_text = db_escape(self.article_text)
		# An article without text is not worth building a row for.
		if escaped_text in (None, "None", "NULL"):
			return None
		fields = (
			str(db_escape(self.guid)),
			str(db_escape(self.original_address)),
			str(db_escape(self.source.id)),
			str(db_escape(self.title)),
			str(db_escape(self.author)),
			str(escaped_text),
			str(format_date(self.date_published)),
			str(db_escape(self.date_retrieved)),
		)
		return ",".join(fields)
Exemple #4
0
    def db_save(self, db=None, suppress_save = False):
        """
        Write this cluster, its term statistics and its article links to
        the database.

        Returns (True, "Not Edited") immediately when self.edited is false.
        Otherwise the following happens, in order:

        1. If the cluster already has an id, an UPDATE of its description
           and top article ID is prepared; if not, the row is created via
           self.db_create_row(db).
        2. For every term in self.model.terms the stem id is looked up in
           `unigram_stems` and a `cluster_terms` row is inserted, or its
           count and tf refreshed if it already exists. Terms without a
           stem row are skipped.
        3. All articles in self.articles are updated to point to this
           cluster ID.
        4. The UPDATE prepared in step 1, if any, is executed.

        A MySQLdb.Error raised along the way is recorded in the locals
        `success` (set to False) and `message`.

        db -- open database connection; when falsy, a new one is obtained
              from database.connect_to_database().
        suppress_save -- not referenced by the code in this block.

        NOTE(review): no commit is issued in this block -- confirm whether
        the caller (or code following this block) is expected to commit.
        """
        # First make sure that saving is required. If no articles have been added to this cluster,
        # then it does not need to be saved.
        if not self.edited:
            return True,"Not Edited"

        # Bound up front so they exist on every path: a brand-new cluster has
        # no UPDATE to run, and a cluster without terms never enters the loop.
        cluster_up_query = None
        success = True
        message = "Success!"
        try:
            if not db:
                db = database.connect_to_database()

            if self.id:
                top = self.top_article[0].article
                cluster_up_query = "UPDATE clusters SET `description` = %s,`toparticleid` = %d WHERE `id` = %d" % (database.db_escape(top.title), top.id, self.id)
            else:
                # db_create_row assigns self.id, which the queries below need.
                self.db_create_row(db)

            # now we update the articles with their clusters
            article_ids = ",".join([str(a_mod.article.id) for a_mod in self.articles])
            article_up_query = "UPDATE articles SET clusterid = %d WHERE id IN (%s)" % (self.id, article_ids)
            print("Running query " + article_up_query + " to update the articles in cluster " + str(self.id))

            cur = db.cursor()
            try:
                for term in self.model.terms:
                    # first get the stem id of the term
                    query = "SELECT `id` FROM unigram_stems WHERE `term` = %s" % (database.db_escape(term))
                    cur.execute(query)
                    result = cur.fetchone()
                    if result is None:
                        # Without a stem row there is nothing to link to;
                        # indexing None would raise an unhelpful TypeError.
                        print("No stem found for term %r; skipping." % (term,))
                        continue
                    term_id = result[0]
                    stats = self.model.terms[term]
                    tf = stats[0]
                    count = int(stats[1])
                    # Insert the link row, or refresh count/tf if it already exists.
                    query = "INSERT INTO cluster_terms (`stemid`,`clusterid`,`count`,`tf`) VALUES(%d,%d,%d,%.3f) ON DUPLICATE KEY UPDATE `count` = VALUES(`count`),`tf` = VALUES(`tf`)" % (term_id, self.id, count, tf)
                    cur.execute(query)
                cur.execute(article_up_query)
                if cluster_up_query:
                    cur.execute(cluster_up_query)
            finally:
                # Always release the cursor, including when a query fails.
                cur.close()
        except MySQLdb.Error as e:
            message = "Error %d: %s" % (e.args[0], e.args[1])
            success = False