def classify(rows, topic, topic_id):
    """Run the binary classifier for *topic_id* over *rows* and persist results.

    Each row is converted to text, scored by the topic's BinaryClassifier,
    and the resulting "ham" probability is upserted into docs2topics as the
    doc<->topic strength.

    Args:
        rows: sequence of dict-like records; each must provide 'doc_id' and
            whatever fields doc2text() reads.
        topic: unused here (kept for interface compatibility with callers).
        topic_id: integer id of the topic whose classifier to load.

    Returns:
        List of "ham" probabilities (probs[i][1]), one per input row.
    """
    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(row) for row in rows]
    # Capturing() swallows stdout produced by the classifier so it can be
    # routed through the app logger instead of leaking to the console.
    with Capturing() as output:
        clf = BinaryClassifier(topic_id)
        clf.load()
        probs = clf.classify(docs)
    app.logger.debug('\n'.join(output))
    db = get_db()
    cur = db.cursor()
    # Parameterized upsert (was string-formatted SQL: injection-prone and
    # rebuilt on every iteration). MySQL paramstyle is %s; the strength value
    # appears twice because ON DUPLICATE KEY UPDATE needs it again.
    query = '''
        INSERT INTO docs2topics (doc_id, topic_id, strength)
        VALUES (%s, %s, %s)
        ON DUPLICATE KEY UPDATE strength = %s
    '''
    for i, (p_spam, p_ham) in enumerate(probs):
        app.logger.debug("doc {} classified for topic {}: {}".format(
            rows[i]['doc_id'], topic_id, p_ham))
        cur.execute(query, (rows[i]['doc_id'], topic_id, p_ham, p_ham))
    db.commit()
    return [p[1] for p in probs]
continue if default_author: # overwrite whatever blogpostparser identified as the # author -- should probably make an exception for guest # posts: post['authors'] = default_author posts.append(post) if not posts: app.logger.warn('no posts to save') return 'OK' from classifier import BinaryClassifier, doc2text docs = [doc2text(post) for post in posts] clf = BinaryClassifier(0) # classifier 0 is for blogspam; note that 1=>blogspam, 0=>blogham clf.load() probs = clf.classify(docs) for i, (p_no, p_yes) in enumerate(probs): post = posts[i] app.logger.debug(u"post {} has blogspam probability {}".format(post['title'], p_yes)) if p_yes > app.config['MAX_SPAM'] * 3/2: app.logger.debug("> max {}".format(app.config['MAX_SPAM'] * 3/2)) continue post['status'] = 1 if p_yes < app.config['MAX_SPAM'] * 3/4 else 0 post['spamminess'] = p_yes post['meta_confidence'] = 0.75 query = "INSERT INTO docs ({}, found_date) VALUES ({} NOW())".format( ', '.join(post.keys()), '%s, '*len(post.keys())) app.logger.debug(query + ', '.join(map(unicode, post.values()))) try: cur.execute(query, post.values())