Example #1
def update_classifier(topic_id):
    from classifier import BinaryClassifier, doc2text
    db = get_db()
    cur = db.cursor(MySQLdb.cursors.DictCursor)
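    # Fetch the 100 most recent training documents (is_training = 1) for this
    # topic, together with their labels (strength).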
    query = '''
         SELECT D.*, M.strength
         FROM docs D, docs2topics M
         WHERE M.doc_id = D.doc_id AND M.topic_id = {0} AND M.is_training = 1
         ORDER BY D.found_date DESC
         LIMIT 100
    '''
    query = query.format(topic_id)
    app.logger.debug(query)
    cur.execute(query)
    rows = cur.fetchall()
    docs = [doc2text(row) for row in rows]
    classes = [row['strength'] for row in rows]
    msg = ''
    if (0 in classes and 1 in classes):
        with Capturing() as output:
            clf = BinaryClassifier(topic_id)        
            clf.train(docs, classes)
            clf.save()
        msg += '\n'.join(output)
        # We could reclassify all documents now, but we postpone this step
        # until the documents are actually displayed (which may be never
        # for sufficiently old ones). So we simply undefine the topic
        # strengths to mark that no classification has yet been made.
        query = "UPDATE docs2topics SET strength = NULL WHERE topic_id = {0} AND is_training < 1".format(topic_id)
        app.logger.debug(query)
        cur.execute(query)
        db.commit()
    else:
        msg = "classifier not trained: the training set contains only positive or only negative samples"
    return msg
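
The Capturing context manager used above is imported from elsewhere in the project and is not shown in this excerpt. A minimal sketch of such a helper, assuming it simply collects everything printed to stdout inside the with-block as a list of lines:

import sys
from io import StringIO  # on Python 2, use StringIO.StringIO

class Capturing(list):
    """Collect lines printed to stdout while the with-block runs."""
    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        return self

    def __exit__(self, *exc):
        self.extend(self._stringio.getvalue().splitlines())
        del self._stringio
        sys.stdout = self._stdout
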
Example #2
def classify(rows, topic, topic_id):
    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(row) for row in rows]
    with Capturing() as output:
        clf = BinaryClassifier(topic_id)
        clf.load()
        probs = clf.classify(docs)
    app.logger.debug('\n'.join(output))
    db = get_db()
    cur = db.cursor()
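    # clf.classify yields one probability pair per document; the second value
    # (p_ham) is what gets stored as the document's strength for this topic.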
    for i, (p_spam, p_ham) in enumerate(probs):
        app.logger.debug("doc {} classified for topic {}: {}".format(
            rows[i]['doc_id'], topic_id, p_ham))
        query = '''
            INSERT INTO docs2topics (doc_id, topic_id, strength)
            VALUES ({0},{1},{2})
            ON DUPLICATE KEY UPDATE strength={2}
        '''
        query = query.format(rows[i]['doc_id'], topic_id, p_ham)
        app.logger.debug(query)
        cur.execute(query)
        db.commit()
    return [p[1] for p in probs]
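
Both functions above interpolate values into the SQL with str.format. MySQLdb's cursor.execute also accepts a parameter tuple, so the upsert in classify() could equally be written with %s placeholders, which sidesteps quoting and injection issues. A sketch, using a hypothetical helper name:

def save_strength(cur, doc_id, topic_id, strength):
    # Parameterized version of the ON DUPLICATE KEY upsert used in classify().
    query = '''
        INSERT INTO docs2topics (doc_id, topic_id, strength)
        VALUES (%s, %s, %s)
        ON DUPLICATE KEY UPDATE strength = %s
    '''
    cur.execute(query, (doc_id, topic_id, strength, strength))
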
Example #3
            app.logger.error('cannot parse {}: {}'.format(post['url'], e))
            continue
        if default_author:
            # overwrite whatever blogpostparser identified as the
            # author -- should probably make an exception for guest
            # posts:
            post['authors'] = default_author
        posts.append(post)
        
    if not posts:
        app.logger.warn('no posts to save')
        return 'OK'

    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(post) for post in posts]
    clf = BinaryClassifier(0) # classifier 0 is for blogspam; note that 1=>blogspam, 0=>blogham
    clf.load()
    probs = clf.classify(docs)
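    # Each entry in probs is a (p_no, p_yes) pair; p_yes is the estimated
    # probability that the post is blog spam (classifier 0, see above).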
    for i, (p_no, p_yes) in enumerate(probs):
        post = posts[i]
        app.logger.debug(u"post {} has blogspam probability {}".format(post['title'], p_yes))
        if p_yes > app.config['MAX_SPAM'] * 3/2:
            app.logger.debug("> max {}".format(app.config['MAX_SPAM'] * 3/2))
            continue
        post['status'] = 1 if p_yes < app.config['MAX_SPAM'] * 3/4 else 0
        post['spamminess'] = p_yes
        post['meta_confidence'] = 0.75
        query = "INSERT INTO docs ({}, found_date) VALUES ({} NOW())".format(
            ', '.join(post.keys()), '%s, '*len(post.keys()))
        app.logger.debug(query + ', '.join(map(unicode, post.values())))
        try:
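
The INSERT built just above relies on '%s, ' being repeated once per key, so the final comma lands in front of NOW(). For a post dict with the keys title, url and status (illustrative only), the formatted query would read:

INSERT INTO docs (title, url, status, found_date) VALUES (%s, %s, %s, NOW())
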
Example #4
def main():
    x = tf.placeholder(tf.float32, [None, AUDIO_FEATURE_SIZE])
    y = tf.placeholder(tf.float32, [None, 2])

    n_units = [100, 100, 50]
    n_batches = 10000
    batch_size = 10
    sound_event = 137  # Testing with sound event class of music.

    classifier = BinaryClassifier(x, y, n_units)

    path = "./trainingFeatures/bal_train/"
    filenames = [path + f for f in listdir(path)]
    """filenames = [path + "ZZ.tfrecord",
                 path + "Zy.tfrecord",
                 path + "ZY.tfrecord",
                 path + "zz.tfrecord",
                 path + "zZ.tfrecord",
                 path + "uT.tfrecord",
                 path + "Ut.tfrecord",
                 path + "UT.tfrecord",
                 path + "uu.tfrecord",
                 path + "uU.tfrecord"
                 ]"""
    eval_path = "./trainingFeatures/eval/"
    eval_filenames = [eval_path + f for f in listdir(eval_path)]

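    # TF1 queue-based input pipeline: extract_example (defined elsewhere) yields
    # label and audio-feature tensors read from the tfrecord files, and
    # tf.train.batch groups them into padded minibatches (dynamic_pad=True).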
    batch = tf.train.batch(extract_example(filenames),
                           batch_size,
                           dynamic_pad=True)
    eval_batch = tf.train.batch(extract_example(eval_filenames),
                                EVAL_SET_SIZE,
                                dynamic_pad=True)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        coordinator = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess, coordinator)

        # Train the model.
        for i in range(n_batches):
            labels, audio_features = sess.run(batch)
            targets = []

            for j in range(batch_size):
                if sound_event in labels[j]:
                    targets.append(POSITIVE)
                else:
                    targets.append(NEGATIVE)

            sess.run(classifier.train,
                     feed_dict={
                         x: audio_features,
                         y: targets
                     })

        # Evaluate the model.
        labels, audio_features = sess.run(eval_batch)
        targets = []

        for i in range(EVAL_SET_SIZE):
            if sound_event in labels[i]:
                targets.append(POSITIVE)
            else:
                targets.append(NEGATIVE)

        print(
            sess.run(classifier.f1_score,
                     feed_dict={
                         x: audio_features,
                         y: targets
                     }))

        coordinator.request_stop()
        coordinator.join(threads)
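
POSITIVE, NEGATIVE, AUDIO_FEATURE_SIZE and EVAL_SET_SIZE are module-level constants that this excerpt does not show. Since y is a [None, 2] placeholder, POSITIVE and NEGATIVE are presumably one-hot rows; plausible definitions (the values and ordering are assumptions, not taken from the source):

AUDIO_FEATURE_SIZE = 128   # assumed: 128-dimensional audio feature vectors
EVAL_SET_SIZE = 1000       # assumed size of the evaluation batch
POSITIVE = [1.0, 0.0]      # assumed one-hot target: sound event present
NEGATIVE = [0.0, 1.0]      # assumed one-hot target: sound event absent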