def update_classifier(topic_id):
    """Retrain the binary classifier for *topic_id* from its most recent
    training samples, then invalidate cached strengths for non-training docs.

    Returns a status message: the captured training output, or an
    explanation of why training was skipped (need at least one positive
    and one negative sample).
    """
    from classifier import BinaryClassifier, doc2text
    db = get_db()
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    # Parameterized query (%s placeholder) instead of str.format: never
    # interpolate values into SQL -- prevents injection and quoting bugs.
    query = '''
        SELECT D.*, M.strength
        FROM docs D, docs2topics M
        WHERE M.doc_id = D.doc_id AND M.topic_id = %s AND M.is_training = 1
        ORDER BY D.found_date DESC LIMIT 100
    '''
    app.logger.debug(query)
    cur.execute(query, (topic_id,))
    rows = cur.fetchall()
    docs = [doc2text(row) for row in rows]
    classes = [row['strength'] for row in rows]
    msg = ''
    if 0 in classes and 1 in classes:
        # Capture the classifier's console output so we can return it.
        with Capturing() as output:
            clf = BinaryClassifier(topic_id)
            clf.train(docs, classes)
            clf.save()
        msg += '\n'.join(output)
        # We could reclassify all documents now, but we postpone this step
        # until the documents are actually displayed (which may be never
        # for sufficiently old ones). So we simply undefine the topic
        # strengths to mark that no classification has yet been made.
        query = "UPDATE docs2topics SET strength = NULL WHERE topic_id = %s AND is_training < 1"
        app.logger.debug(query)
        cur.execute(query, (topic_id,))
        db.commit()
    else:
        msg = "classifier not yet ready because only positive or negative training samples"
    return msg
def classify(rows, topic, topic_id):
    """Run the classifier for *topic_id* over *rows* and upsert each
    document's strength into docs2topics.

    *topic* is unused here but kept for interface compatibility with
    callers.  Returns the list of positive-class probabilities (p_ham),
    one per row, in row order.
    """
    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(row) for row in rows]
    # Capture the classifier's console output and forward it to the log.
    with Capturing() as output:
        clf = BinaryClassifier(topic_id)
        clf.load()
        probs = clf.classify(docs)
    app.logger.debug('\n'.join(output))
    db = get_db()
    cur = db.cursor()
    # Parameterized upsert instead of str.format: values are never
    # interpolated into the SQL string (prevents injection/quoting bugs).
    # VALUES(strength) re-uses the inserted value on duplicate key, so the
    # probability is passed only once.
    query = '''
        INSERT INTO docs2topics (doc_id, topic_id, strength)
        VALUES (%s, %s, %s)
        ON DUPLICATE KEY UPDATE strength = VALUES(strength)
    '''
    for i, (p_spam, p_ham) in enumerate(probs):
        app.logger.debug("doc {} classified for topic {}: {}".format(
            rows[i]['doc_id'], topic_id, p_ham))
        cur.execute(query, (rows[i]['doc_id'], topic_id, p_ham))
    db.commit()
    return [p[1] for p in probs]
except Exception, e: app.logger.error('cannot parse {}: {}'.format(post['url'], e)) continue if default_author: # overwrite whatever blogpostparser identified as the # author -- should probably make an exception for guest # posts: post['authors'] = default_author posts.append(post) if not posts: app.logger.warn('no posts to save') return 'OK' from classifier import BinaryClassifier, doc2text docs = [doc2text(post) for post in posts] clf = BinaryClassifier(0) # classifier 0 is for blogspam; note that 1=>blogspam, 0=>blogham clf.load() probs = clf.classify(docs) for i, (p_no, p_yes) in enumerate(probs): post = posts[i] app.logger.debug(u"post {} has blogspam probability {}".format(post['title'], p_yes)) if p_yes > app.config['MAX_SPAM'] * 3/2: app.logger.debug("> max {}".format(app.config['MAX_SPAM'] * 3/2)) continue post['status'] = 1 if p_yes < app.config['MAX_SPAM'] * 3/4 else 0 post['spamminess'] = p_yes post['meta_confidence'] = 0.75 query = "INSERT INTO docs ({}, found_date) VALUES ({} NOW())".format( ', '.join(post.keys()), '%s, '*len(post.keys())) app.logger.debug(query + ', '.join(map(unicode, post.values())))