def restoreVars():
    global n_words
    global vocab_processor
    # Pickled vars must be read in binary mode.
    with open(VARS_FILE, 'rb') as f:
        n_words = pickle.load(f)
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
        VOCAB_PROCESSOR_SAVE_FILE)
    logger.debug(vocab_processor)
    logger.info("Vars updated.")
    def __init__(self, cloud_amqp_url, queue_name):
        self.cloud_amqp_url = cloud_amqp_url
        self.queue_name = queue_name
        self.params = pika.URLParameters(cloud_amqp_url)
        self.params.socket_timeout = 3
        self.connection = pika.BlockingConnection(
            self.params)  # Connect to CloudAMQP
        self.channel = self.connection.channel()  # Start a channel
        self.channel.queue_declare(queue=queue_name)  # Declare a queue
        logger.info("CloudAMQPClient : init queue %s" % self.queue_name)
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove the old model directory if it exists, then recreate it.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data: the first 400 rows train the
    # model, the remaining rows are held out for testing.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary: map each title to a fixed-length sequence of word ids.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    logger.debug('News topic trainer : Total words: %d' % n_words)

    # Save n_words and vocab_processor for the serving side.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    logger.info('News topic trainer accuracy: {0:f}'.format(score))
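By convention this trainer would be launched through tf.app.run, which parses flags and hands leftover argv to main (a minimal sketch, assuming the module imports tensorflow as tf):

if __name__ == '__main__':
    tf.app.run(main=main)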
def loadModel():
    global classifier
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)
    # Prepare training and testing
    df = pd.read_csv('../training_data/labeled_news.csv', header=None)

    # TODO: remove this workaround once
    # https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the
    # restored Estimator work.
    train_df = df[0:400]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    logger.info("Model updated.")
Example 5
def predict_news_click(user_id, news_description):
    # Predict, for each news item in news_description, the probability
    # that the given user will click it.
    x_test = news_description
    x_train, y_train = get_training_data(user_id)

    x_train.extend(x_test)
    x_vector = TfidfVectorizer().fit_transform(x_train).todense()

    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(x_vector[0:len(y_train)], y_train)

    y_predict = neigh.predict_proba(x_vector[len(y_train):])
    # Keep the 'unclicked' probability (column 0 of predict_proba).
    click_predict = [predict[0] for predict in y_predict]
    logger.info("Predict news click for %s : [%s]" % (user_id, ' '.join(map(str, click_predict))))

    return click_predict
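A hypothetical call (the user id and descriptions are illustrative, not from the source):

click_probabilities = predict_news_click(
    'user_42',
    ['Stocks rally as tech earnings beat estimates',
     'Local team wins title after overtime thriller'])
# Lower values mean a click is judged more likely, since the returned
# numbers are 'unclicked' probabilities.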
Example 6

SLEEP_TIME_IN_SECONDS = int(
    config['cloudAMQP']
    ['scrape_news_task_queue_sleep_time_in_seconds_at_monitor'])

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0

    for news in news_list:
        # Deduplicate by an MD5 digest of the title.
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news += 1
            news['digest'] = news_digest
            news['digest'] = news_digest

            # If there's no published time, set it to the current UTC time.
            if news.get('publishedAt') is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')

            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)

            scrape_news_queue_client.sendMessage(news)

    logger.info("News monitor : fetched %d news." % num_of_new_news)

    scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
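The config lookups above and in the later examples imply a parsed config of roughly this shape (keys from the source, values illustrative assumptions):

config = {
    'cloudAMQP': {
        'scrape_news_task_queue_sleep_time_in_seconds_at_monitor': 10,
    },
    'customized_news_list': {
        'host': 'localhost',
        'port': 5050,
    },
}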
Example 7
class RequestHandler(pyjsonrpc.HttpRequestHandler):
    @pyjsonrpc.rpcmethod
    def add(self, a, b):
        """Test method."""
        print "Service.py : add is called with %d and %d" % (a, b)
        return a + b

    @pyjsonrpc.rpcmethod
    def getNewsSummariesForUser(self, user_id, page_num):
        """Get news summaries for a user."""
        return operations.getNewsSummariesForUser(user_id, page_num)

    @pyjsonrpc.rpcmethod
    def logNewsClickForUser(self, user_id, news_id):
        """Log a user's news click."""
        return operations.logNewsClickForUser(user_id, news_id)


# Threading HTTP Server
http_server = pyjsonrpc.ThreadingHttpServer(server_address=(SERVER_HOST,
                                                            SERVER_PORT),
                                            RequestHandlerClass=RequestHandler)

logger.info("Starting Backend HTTP server on %s:%d" %
            (SERVER_HOST, SERVER_PORT))

http_server.serve_forever()
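For reference, a client can invoke these methods with pyjsonrpc's HttpClient (the URL construction is illustrative):

http_client = pyjsonrpc.HttpClient(
    url="http://%s:%d" % (SERVER_HOST, SERVER_PORT))
print http_client.call("add", 1, 2)  # prints 3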
Example 8
import os
import pyjsonrpc
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'configuration'))

# news_click_predictor is assumed to live on one of the appended paths.
import news_click_predictor
from config_parser import config
from sys_log_client import logger

SERVER_HOST = str(config['customized_news_list']['host'])
SERVER_PORT = int(config['customized_news_list']['port'])


class RequestHandler(pyjsonrpc.HttpRequestHandler):
    """ Predict News Click """
    @pyjsonrpc.rpcmethod
    def predict_news_click(self, user_id, news_description):
        return news_click_predictor.predict_news_click(user_id,
                                                       news_description)


# Threading HTTP Server
http_server = pyjsonrpc.ThreadingHttpServer(server_address=(SERVER_HOST,
                                                            SERVER_PORT),
                                            RequestHandlerClass=RequestHandler)

logger.info("Starting customized news list server on %s:%d" %
            (SERVER_HOST, SERVER_PORT))

http_server.serve_forever()
Example 9
    def getPreferenceForUser(self, user_id):
        logger.debug("news_recommendation_service - getPreferenceForUser")
        db = mongodb_client.get_db()
        model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': user_id})
        if model is None:
            return []

        sorted_tuples = sorted(model['preference'].items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        sorted_list = [x[0] for x in sorted_tuples]
        sorted_value_list = [x[1] for x in sorted_tuples]

        # If the highest preference score equals the lowest, the
        # preferences carry no signal, so return nothing.
        if isclose(float(sorted_value_list[0]), float(sorted_value_list[-1])):
            return []

        return sorted_list
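
isclose is not a builtin in Python 2, so a helper is presumably defined elsewhere in this module; the PEP 485 reference implementation would serve:

def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    # Approximate float equality per PEP 485.
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)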


# Threading HTTP Server
http_server = pyjsonrpc.ThreadingHttpServer(server_address=(SERVER_HOST,
                                                            SERVER_PORT),
                                            RequestHandlerClass=RequestHandler)

logger.info("Starting news recommendation service on %s:%d" %
            (SERVER_HOST, SERVER_PORT))

http_server.serve_forever()
Example 10
    def on_any_event(self, event):
        # Reload model
        logger.info("Model update detected. Loading new model.")
        time.sleep(MODEL_UPDATE_LAG_IN_SECONDS)
        restoreVars()
        loadModel()
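This handler (named ReloadModelHandler in Example 11) is presumably wired to a watchdog Observer; a minimal sketch, where watching MODEL_OUTPUT_DIR is an assumption:

from watchdog.observers import Observer

observer = Observer()
observer.schedule(ReloadModelHandler(), path=MODEL_OUTPUT_DIR, recursive=False)
observer.start()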
Example 11
    # TODO: remove this workaround once
    # https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the
    # restored Estimator work.
    train_df = df[0:400]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    logger.info("Model updated.")


restoreVars()
loadModel()

logger.info("Model loaded")


class ReloadModelHandler(FileSystemEventHandler):
    def on_any_event(self, event):
        # Reload model
        logger.info("Model update detected. Loading new model.")
        time.sleep(MODEL_UPDATE_LAG_IN_SECONDS)
        restoreVars()
        loadModel()


class RequestHandler(pyjsonrpc.HttpRequestHandler):
    @pyjsonrpc.rpcmethod
    def classify(self, text):
        text_series = pd.Series([text])
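        # The original snippet is truncated here. A plausible completion,
        # mirroring the vectorize-and-predict flow in the trainer above
        # (a sketch, not the confirmed original):
        x_predict = np.array(list(vocab_processor.transform(text_series)))
        y_predicted = [
            p['class'] for p in classifier.predict(x_predict, as_iterable=True)
        ]
        return y_predicted[0]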
Example 12
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If the model does not exist, create a new one.
    if model is None:
        logger.debug(
            'Click log processor: Creating preference model for new user: %s' %
            userId)
        new_model = {'userId': userId}
        preference = {}
        for i in NEWS_TOPICS:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    logger.info(
        'Click log processor: Updating preference model for user: %s' %
        userId)

    # Update the model with a time-decay scheme: the clicked class moves
    # toward 1 and every other class decays toward 0.
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None or 'class' not in news
            or news['class'] not in NEWS_TOPICS):
        logger.error(
            "Click log processor: news doesn't exist or news topic doesn't exist"
        )
        return

    click_class = news['class']

    # Send the metrics to graphite
    metrics = 'backend.click.%s.%s.%s' % (
        userId.replace('.', ''),
        newsId.replace('.', '').replace('\n', ''),
        click_class.split(' ')[0])
    graphite.send(metrics, 1)

    # Update the clicked one.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not-clicked classes.
    for i, prob in model['preference'].iteritems():
        if i != click_class:
            model['preference'][i] = float((1 - ALPHA) * prob)

    # update to mongodb
    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId},
                                                model,
                                                upsert=True)

    # Add the news description to the click log table.
    click_logs = db[CLICK_LOGS_TABLE_NAME].find(
        {'userId': userId, 'newsId': news['digest']})

    if click_logs.count() == 0:
        if news['description'] is not None:
            click_log = {
                'userId': userId,
                'newsId': news['digest'],
                'description': news['description'],
                'timestamp': datetime.utcnow(),
                'clicked': 1
            }
            db[CLICK_LOGS_TABLE_NAME].insert(click_log)
            logger.info("Click log processor: add click log")
            logger.info(news['description'])
        else:
            logger.info('==== empty news description ==== ')
    else:
        for click_log in click_logs:
            click_log['timestamp'] = datetime.utcnow()
            db[CLICK_LOGS_TABLE_NAME].replace_one(
                {'userId': userId, 'newsId': news['digest']},
                click_log,
                upsert=True)
            logger.info(
                "Click log processor: found a duplicate click; updated its timestamp"
            )
            logger.info(news['description'])
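A useful property of this decay update: if the preference scores sum to 1 before a click, they still sum to 1 afterwards, since the clicked class gains ALPHA * (1 - old_p) while the others together lose exactly ALPHA times their mass. A quick check with illustrative topic names and ALPHA = 0.2:

preference = {'Politics': 0.5, 'Sports': 0.3, 'Technology': 0.2}
preference['Sports'] = (1 - 0.2) * preference['Sports'] + 0.2  # 0.44
for topic in ('Politics', 'Technology'):
    preference[topic] = (1 - 0.2) * preference[topic]          # 0.4, 0.16
print sum(preference.values())  # 1.0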
    def __init__(self, host, port):
        self.host = host
        self.port = port
        logger.info("Graphite client is created at %s:%d" % (host, port))