def web_update_article_text(self):
    """Re-download every article's text and write it back to `articles`.

    To be used when the article text needs to be re-downloaded for one
    reason or another.

    For each entry of ``self.article_list`` the text is downloaded again,
    passed through ``database.db_escape`` and queued as an UPDATE on
    ``articles.originaltext``.  Queued statements are executed in batches
    of five, and any final partial batch is executed after the loop so
    that no update is dropped.  When the article list is empty or has not
    been built (``None``), a notice is printed and nothing is updated.

    The cursor is always closed; the connection is committed afterwards
    if no error occurred.  Returns ``None``.
    """
    batch_size = 5
    articles = self.article_list
    cursor = self.db.cursor()
    try:
        if articles:
            num_articles = len(articles)
            pending = []
            for current_article, article in enumerate(articles, 1):
                article.download_article_text()
                article_text = database.db_escape(article.article_text)
                if article_text is None or article_text in ("None", "NULL"):
                    # Nothing usable came back for this article; skip it.
                    print("Article broken.")
                else:
                    # NOTE(review): the statement is still assembled by
                    # string concatenation and relies entirely on
                    # db_escape for safety -- consider a parameterized
                    # query once db_escape's output format is confirmed.
                    pending.append(
                        "UPDATE `articles` SET `originaltext` = \""
                        + article_text
                        + "\" WHERE `id` = "
                        + str(article.id)
                    )
                print("Source " + str(self.website_name) + " processed "
                      + str(current_article) + "/" + str(num_articles))
                if len(pending) >= batch_size:
                    for query in pending:
                        cursor.execute(query)
                    pending = []
            # Flush the last, partially filled batch; without this the
            # final (num_updates % batch_size) updates were never run.
            for query in pending:
                cursor.execute(query)
        else:
            print("Article list has not been compiled.")
    finally:
        cursor.close()
    self.db.commit()
def db_create_row(self, db=None):
    """Insert a new row into `clusters` for this cluster and store its id.

    The row's description is the escaped title of the first entry in
    ``self.top_article`` and its ``toparticleid`` is that article's id.
    After the insert, ``self.id`` is set to the cursor's ``lastrowid`` and
    the connection is committed.

    db -- an open database connection; when omitted (or falsy) a new one
          is obtained from ``database.connect_to_database()``.
          NOTE(review): a connection opened here is not closed by this
          method -- confirm whether the helper returns a shared handle.
    """
    if not db:
        db = database.connect_to_database()

    top = self.top_article[0].article
    # db_escape's result is interpolated without surrounding quotes, so it
    # presumably already yields a complete SQL literal -- TODO confirm.
    description = str(database.db_escape(top.title))
    if not description or description == "NULL":
        # Fall back to the literal string 'None' rather than inserting NULL.
        description = "'None'"
    query = "INSERT INTO clusters (`description`,`toparticleid`) VALUES(%s,%d)" % (description, top.id)

    cur = db.cursor()
    try:
        cur.execute(query)
        self.id = cur.lastrowid
        db.commit()
    finally:
        # Release the cursor even if the insert or the commit fails.
        cur.close()
def get_article_values(self):
    """Return this article's fields as one comma-separated string.

    The fields appear in the order guid, original address, source id,
    title, author, article text, publication date, retrieval date.  Every
    field goes through ``db_escape`` except the publication date, which
    goes through ``format_date``.  Returns ``None`` when the escaped
    article text is ``None``, "None" or "NULL", i.e. the article is
    treated as empty and not worth storing.
    """
    # Gotta filter them values
    escaped_text = db_escape(self.article_text)
    if escaped_text in (None, "None", "NULL"):
        # if it's an empty article just don't bother
        return None

    fields = [
        str(db_escape(self.guid)),
        str(db_escape(self.original_address)),
        str(db_escape(self.source.id)),
        str(db_escape(self.title)),
        str(db_escape(self.author)),
        str(escaped_text),
        str(format_date(self.date_published)),
        str(db_escape(self.date_retrieved)),
    ]
    return ",".join(fields)
def db_save(self, db=None, suppress_save = False): """ Makes a number of queries. 1. First of all, sets the description and top article ID of the cluster. 2. Updates all the articles to have this cluster ID. 3. Update the cluster terms by looking for the terms in the articles and making an average of the counts and frequencies. Then, """ # First make sure that saving is required. If no articles have been added to this cluster, # then it does not need to be saved. if not self.edited: return True,"Not Edited" try: if not db: db = database.connect_to_database() cur = db.cursor() if self.id: cluster_up_query = "UPDATE clusters SET `description` = %s,`toparticleid` = %d WHERE `id` = %d" % (database.db_escape(self.top_article[0].article.title),self.top_article[0].article.id, self.id) else: self.db_create_row(db) # now we update the articles with their clusters # queries = [str(a_mod.get_cluster_update_query()) for a_mod in self.articles] # article_up_query = ";".join(queries) article_up_query = "UPDATE articles SET clusterid = %d WHERE id IN (%s)" % (self.id,",".join([str(a_mod.article.id) for a_mod in self.articles])) print "Running query " + article_up_query + " to update the articles in cluster " + str(self.id) success = True for term in self.model.terms: message = "Success!" 
# first get the stem id of the term query = "SELECT `id` FROM unigram_stems WHERE `term` = %s" % (database.db_escape(term)) cur.execute(query) result = cur.fetchone() term_id = result[0] tf = self.model.terms[term][0] count = int(self.model.terms[term][1]) # first check for the existence of the link #query = "SELECT `clusterid` FROM cluster_terms WHERE `stemid` = %d AND clusterid = %d" % (term_id,self.id) #print query #cur.execute(query) #result = cur.fetchone() #if not result: # Make the link table row query = "INSERT INTO cluster_terms (`stemid`,`clusterid`,`count`,`tf`) VALUES(%d,%d,%d,%.3f) ON DUPLICATE KEY UPDATE `count` = VALUES(`count`),`tf` = VALUES(`tf`)" % (term_id, self.id, count, tf) # print query #else: # query = "UPDATE cluster_terms SET `count` = %d, `tf` = %.3f WHERE `stemid` = %d AND %clusterid = %d" % (count,tf,term_id,self.id) cur.execute(query) cur.execute(article_up_query) cur.close() cur = db.cursor() if cluster_up_query: cur.execute(cluster_up_query) cur.close() except MySQLdb.Error, e: message = "Error %d: %s" % (e.args[0], e.args[1]) success = False