def db_load_counts(self, article_id=None, db=None):
    ''' Reload this model's term counts from the database.

    article_id -- used to create the Article when the model has none yet.
    db         -- open database connection; a new one is requested from
                  database.connect_to_database() when omitted.

    The model's terms are always reset to a fresh TermList. Rows are only
    fetched when the article reports has_been_counted; each row is stored
    as term -> (tf, count, None). Returns self.terms.all_terms().
    '''
    if not self.article and article_id:
        self.article = Article(id=article_id)
    self.terms = TermList()

    # Without an article there is nothing to look up; fall through and
    # return the empty term list instead of failing on None.
    if self.article is not None and self.article.has_been_counted:
        if not db:
            db = database.connect_to_database()
        cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
        try:
            # Let the driver quote the id rather than formatting it in.
            cur.execute(
                "SELECT term,tf,count FROM articleswithterms "
                "WHERE articleid = %s",
                (self.article.id,))
            rows = cur.fetchall()
        finally:
            # Always release the cursor, also when the query fails.
            cur.close()
        self.terms.set_terms(
            [(row['term'], (float(row['tf']), int(row['count']), None))
             for row in rows])
    return self.terms.all_terms()
def __init__(self, article=None, title_weight=19, leading_weight=1,
             normalizing_freq=True, stoplist_file=None):
    ''' An articlemodel has a TermList which counts terms.

    article          -- the article to model; any falsy value leaves the
                        model without an article.
    title_weight     -- weight stored for terms taken from the title.
    leading_weight   -- weight stored for terms at the start of the text.
    normalizing_freq -- stored flag that selects frequency normalization.
    stoplist_file    -- handed to Stoplist; when omitted, self.stoplist
                        is None.
    '''
    self.article = article if article else None
    # Define the attribute unconditionally so later code can test
    # `self.stoplist is None` rather than hit an AttributeError when no
    # stoplist file was supplied.
    self.stoplist = Stoplist(stoplist_file) if stoplist_file else None
    self.title_weight = title_weight
    self.leading_weight = leading_weight
    self.terms = TermList()
    self.total_term_counts = 0
    self.cluster_id = 0
    self.normalizing_freq = normalizing_freq
class ArticleModel(object):
    ''' Term model of one article, represented by a vector.

    The vector lives in self.terms, which maps every term to a
    (tf, count, original_terms) tuple.
    '''

    def __init__(self, article=None, title_weight=19, leading_weight=1,
                 normalizing_freq=True, stoplist_file=None):
        ''' An articlemodel has a TermList which counts terms.

        article          -- the article to model; any falsy value leaves
                            the model without an article.
        title_weight     -- weight passed along for every title term.
        leading_weight   -- weight passed along for the leading terms of
                            the article text.
        normalizing_freq -- when true count_terms() uses the weighted term
                            total as denominator, otherwise 1.
        stoplist_file    -- handed to Stoplist; when omitted no stoplist
                            is applied.
        '''
        self.article = article if article else None
        # Always define the attribute so count_terms() can check for it
        # instead of failing with AttributeError.
        self.stoplist = Stoplist(stoplist_file) if stoplist_file else None
        self.title_weight = title_weight
        self.leading_weight = leading_weight
        self.terms = TermList()
        self.total_term_counts = 0
        self.cluster_id = 0
        self.normalizing_freq = normalizing_freq

    def from_db_values(self, db_values):
        ''' Feed every row of db_values to from_db_row(). '''
        for row in db_values:
            self.from_db_row(row)

    def from_db_row(self, db_row, load_article=True):
        ''' Merge one database row (a mapping) into the model.

        When the model has no article yet and the row has an "articleid",
        the Article is created from it. When the row has a "term", it is
        stored as term -> (tf, count, None). Both steps are skipped when
        load_article is false.
        '''
        # `in` replaces dict.has_key(), which no longer exists on Python 3.
        if not self.article and "articleid" in db_row and load_article:
            self.article = Article(id=db_row['articleid'])
        # NOTE(review): the term is also gated by load_article; kept as-is,
        # confirm this is intended.
        if "term" in db_row and load_article:
            self.terms[db_row['term']] = (
                float(db_row['tf']), int(db_row['count']), None)

    def db_load_counts(self, article_id=None, db=None):
        ''' Reload the term counts of the article from the database.

        article_id -- used to create the Article when the model has none.
        db         -- open connection; one is requested from
                      database.connect_to_database() when omitted.

        self.terms is always replaced by a fresh TermList; rows are only
        fetched when the article reports has_been_counted. Returns
        self.terms.all_terms().
        '''
        if not self.article and article_id:
            self.article = Article(id=article_id)
        self.terms = TermList()

        # No article means nothing to look up: return the empty list
        # rather than failing on None.
        if self.article is not None and self.article.has_been_counted:
            if not db:
                db = database.connect_to_database()
            cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
            try:
                cur.execute(
                    "SELECT term,tf,count FROM articleswithterms "
                    "WHERE articleid = %s",
                    (self.article.id,))
                rows = cur.fetchall()
            finally:
                cur.close()
            self.terms.set_terms(
                [(row['term'], (float(row['tf']), int(row['count']), None))
                 for row in rows])
        return self.terms.all_terms()

    def set_cluster(self, cluster_id, save=False):
        ''' Remember the cluster this article belongs to.

        `save` is accepted for callers that pass it but is not acted on
        here; use db_save_cluster() to persist the value.
        '''
        self.cluster_id = cluster_id

    def get_cluster_update_query(self):
        ''' Return the SQL statement that stores cluster_id for the article. '''
        return "UPDATE articles SET `clusterid` = %d WHERE `id` = %d" % (
            self.cluster_id, self.article.id)

    def db_save_cluster(self):
        ''' Write the cluster id of the article to the database. '''
        db = database.connect_to_database()
        cur = db.cursor()
        try:
            cur.execute(self.get_cluster_update_query())
        finally:
            cur.close()
        # Commit like db_save() does; otherwise the update is lost whenever
        # the connection is not in autocommit mode.
        db.commit()

    def get_terms(self):
        ''' Return the term container of this model. '''
        return self.terms

    def set_terms(self, term_list):
        ''' Replace the term container of this model (no copy is made). '''
        self.terms = term_list

    def count_terms(self, normalizing=True):
        ''' Count the terms of the article text and title (generator).

        Yields the result of self.terms.count_term() for every body term
        and then for every title term; nothing is counted until the
        generator is consumed. The first 2% of the body terms (always
        including the very first one) are counted with leading_weight,
        the remaining body terms with weight 1 and title terms with
        title_weight.

        `normalizing` is not used; self.normalizing_freq decides whether
        the weighted term total or 1 is passed as denominator.
        '''
        word_pattern = re.compile(r"[a-z]+'?[a-z]+", re.IGNORECASE)
        body_text = termproc.replace_html_chars(self.article.article_text)
        terms = word_pattern.findall(body_text)
        title_terms = word_pattern.findall(self.article.title)

        # A model built without a stoplist file keeps all terms.
        stoplist = getattr(self, "stoplist", None)
        if stoplist is not None:
            terms = stoplist.apply(terms)
            title_terms = stoplist.apply(title_terms)

        weighted_total = len(terms) + (len(title_terms) * self.title_weight)
        self.total_term_counts = weighted_total
        self.denominator = weighted_total if self.normalizing_freq else 1

        # Index of the last term that still counts as "leading".
        leading_cutoff = int(len(terms) * 0.02)
        for position, term in enumerate(terms):
            weight = self.leading_weight if position <= leading_cutoff else 1
            yield self.terms.count_term(term, self.denominator, weight)
        for term in title_terms:
            yield self.terms.count_term(
                term, self.denominator, self.title_weight)

    def copy(self):
        ''' Create a new model with the same terms but without the article.

        The new model references the very same term container as this
        one; the container itself is not duplicated.
        '''
        exact_copy = ArticleModel()
        exact_copy.set_terms(self.terms)
        return exact_copy

    @staticmethod
    def _values_clause(rows):
        ''' Build "(%s,..),(%s,..)" placeholders plus the flat parameter
        tuple for a multi-row INSERT; rows must be non-empty and of equal
        width. '''
        group = "(" + ",".join(["%s"] * len(rows[0])) + ")"
        placeholders = ",".join([group] * len(rows))
        params = tuple(value for row in rows for value in row)
        return placeholders, params

    def db_save(self, db=None):
        ''' Store the terms of this model and mark the article as counted.

        db -- open connection; one is requested from
              database.connect_to_database() when omitted.

        For every term the stem row in unigram_stems is inserted or its
        totalcount increased. Unknown original terms and the article/term
        links are collected and written in one statement each. Finally the
        article is flagged as counted and the transaction is committed.
        All values are passed as query parameters so that terms containing
        quotes (the word pattern allows apostrophes) cannot break the SQL.
        '''
        if not db:
            db = database.connect_to_database()
        cur = db.cursor()
        orig_term_rows = []
        article_term_rows = []
        try:
            for term in self.terms:
                entry = self.terms[term]
                tf = entry[0]
                count = int(entry[1])
                # Terms loaded from the database carry None here.
                orig_terms = entry[2] or ()

                # Insert the stem, or bump its total when it exists already.
                cur.execute(
                    "SELECT id FROM unigram_stems WHERE `term` = %s",
                    (term,))
                row = cur.fetchone()
                if row is None:
                    cur.execute(
                        "INSERT INTO unigram_stems (`term`,`totalcount`) "
                        "VALUES(%s,%s)",
                        (term, count))
                    term_id = int(cur.lastrowid)
                else:
                    term_id = int(row[0])
                    cur.execute(
                        "UPDATE unigram_stems SET totalcount = totalcount "
                        "+ %s WHERE `id` = %s",
                        (count, term_id))

                # Queue the original terms that are not stored yet.
                for orig_term in orig_terms:
                    cur.execute(
                        "SELECT * FROM orig_terms WHERE `term` = %s",
                        (orig_term,))
                    if cur.fetchone() is None:
                        orig_term_rows.append((term_id, orig_term))

                # Link table row; tf keeps the three decimals that were
                # written before.
                article_term_rows.append(
                    (term_id, self.article.id, count, round(float(tf), 3)))

            if orig_term_rows:
                placeholders, params = self._values_clause(orig_term_rows)
                cur.execute(
                    "INSERT IGNORE INTO orig_terms (`stemid`,`term`) VALUES "
                    + placeholders,
                    params)
            if article_term_rows:
                placeholders, params = self._values_clause(article_term_rows)
                cur.execute(
                    "INSERT INTO article_terms "
                    "(`stemid`,`articleid`,`count`,`tf`) VALUES "
                    + placeholders
                    + " ON DUPLICATE KEY UPDATE `count` = VALUES(count),"
                    " `tf` = VALUES(tf)",
                    params)

            cur.execute(
                "UPDATE articles SET `counted` = 1 WHERE `id` = %s",
                (self.article.id,))
        finally:
            cur.close()
        db.commit()

    def print_terms(self):
        ''' Let the term container print its terms. '''
        self.terms.print_terms()

    def print_info(self):
        ''' Print the title, unique term count and total term count. '''
        # Single-argument print(...) gives the same output on Python 2
        # and Python 3.
        print("Article title: " + self.article.title)
        print(" UNIQUE TERMS: " + str(len(self.terms)))
        print(" TOTAL TERM COUNT: " + str(self.total_term_counts))