def restoreVars():
    """Reload the module-level ``n_words`` and ``vocab_processor`` from disk.

    ``n_words`` is unpickled from ``VARS_FILE`` and ``vocab_processor`` is
    restored from ``VOCAB_PROCESSOR_SAVE_FILE``; both names are rebound as
    globals.
    """
    global n_words, vocab_processor

    # NOTE(review): pickle.load executes whatever the file encodes; only
    # point VARS_FILE at files this project wrote itself.
    with open(VARS_FILE, 'r') as vars_file:
        n_words = pickle.load(vars_file)

    restore_processor = learn.preprocessing.VocabularyProcessor.restore
    vocab_processor = restore_processor(VOCAB_PROCESSOR_SAVE_FILE)

    logger.debug(vocab_processor)
    logger.info("Vars updated.")
def __init__(self, cloud_amqp_url, queue_name):
    """Connect to the AMQP broker and declare the queue this client uses.

    Args:
        cloud_amqp_url: URL handed to ``pika.URLParameters``.
        queue_name: Name of the queue declared on the freshly opened channel.
    """
    self.cloud_amqp_url = cloud_amqp_url
    self.queue_name = queue_name

    # Connection parameters are derived from the URL; the socket timeout
    # is overridden with 3.
    connection_params = pika.URLParameters(cloud_amqp_url)
    connection_params.socket_timeout = 3
    self.params = connection_params

    # Connect, start a channel, then declare the queue on that channel.
    self.connection = pika.BlockingConnection(connection_params)
    self.channel = self.connection.channel()
    self.channel.queue_declare(queue=queue_name)

    logger.info("CloudAMQPCLient : init queue %s" % self.queue_name)
def main(unused_argv):
    """Train the news-topic classifier and log its accuracy on held-out rows.

    Reads ``DATA_SET_FILE`` (no header row), trains on the first 400 rows and
    evaluates on the remaining ones.  The vocabulary size and the fitted
    vocabulary processor are written to ``VARS_FILE`` and
    ``VOCAB_PROCESSOR_SAVE_FILE``.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model.  The directory may not exist yet (for example on
        # the very first run); rmtree would raise in that case, so only
        # delete what is actually there before recreating it.
        if os.path.isdir(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news title (column 1), y - class (column 0)
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary - embedding.  The processor is fitted on the
    # training titles only and then applied to the test titles.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    logger.debug('News topic trainer : Total words: %d' % n_words)

    # Saving n_words and vocab_processor.
    # NOTE(review): the file is opened in text mode as in the original;
    # pickle under Python 3 would need 'wb' -- confirm the target interpreter.
    with open(VARS_FILE, 'w') as f:
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    logger.info('News topic trainer accuracy: {0:f}'.format(score))
def loadModel():
    """Rebuild the global ``classifier`` from ``MODEL_OUTPUT_DIR``.

    After constructing the Estimator, one ``evaluate`` call is made on the
    first 400 labelled rows so that the restored Estimator becomes usable
    (see the TODO below).
    """
    global classifier

    model_fn = news_cnn_model.generate_cnn_model(N_CLASSES, n_words)
    classifier = learn.Estimator(model_fn=model_fn,
                                 model_dir=MODEL_OUTPUT_DIR)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored
    # Estimator work, so a slice of the labelled data is pushed through it.
    labeled_news = pd.read_csv('../training_data/labeled_news.csv',
                               header=None)
    warmup_rows = labeled_news[0:400]
    warmup_labels = warmup_rows[0]
    warmup_features = np.array(
        list(vocab_processor.transform(warmup_rows[1])))
    classifier.evaluate(warmup_features, warmup_labels)

    logger.info("Model updated.")
def predict_news_click(user_id, news_description):
    """Score candidate news for a user with a 3-nearest-neighbour classifier.

    The user's training texts and the candidate texts are TF-IDF vectorised
    together, the classifier is fitted on the training rows, and class
    probabilities are predicted for the candidate rows.

    Args:
        user_id: Passed to ``get_training_data`` and used in the log line.
        news_description: Candidate texts appended to the training texts.

    Returns:
        A list holding, for every candidate row, the first column of
        ``predict_proba``.
    """
    x_train, y_train = get_training_data(user_id)
    train_size = len(y_train)

    # Vectorise training and candidate texts in a single pass so both share
    # the same fitted vocabulary; training rows come first.
    x_train.extend(news_description)
    tfidf_matrix = TfidfVectorizer().fit_transform(x_train).todense()

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(tfidf_matrix[0:train_size], y_train)
    probabilities = knn.predict_proba(tfidf_matrix[train_size:])

    # Column 0 is kept; the original author labels it the 'unclicked'
    # probability.
    click_predict = [row[0] for row in probabilities]

    formatted = ' '.join(str(value) for value in click_predict)
    logger.info("Predict news click for %s : [%s]" % (user_id, formatted))
    return click_predict
SLEEP_TIME_IN_SECONDS = int(
    config['cloudAMQP']
    ['scrape_news_task_queue_sleep_time_in_seconds_at_monitor'])

# Poll the news API forever.  Each article is keyed by a digest of its title;
# articles whose digest is not yet in Redis are stored there with an expiry
# and forwarded to the scrape queue.
while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        # Digest already cached: this title was handled before.
        if redis_client.get(news_digest) is not None:
            continue

        num_of_new_news += 1
        news['digest'] = news_digest

        # if there's no published time, set it to current UTC time
        if news['publishedAt'] is None:
            now_utc = datetime.datetime.utcnow()
            news['publishedAt'] = now_utc.strftime('%Y-%m-%dT%H:%M:%SZ')

        redis_client.set(news_digest, news)
        redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)

        scrape_news_queue_client.sendMessage(news)

    logger.info("News monitor : fetched %d news." % num_of_new_news)

    # Sleep through the queue client rather than time.sleep, as the original
    # does.
    scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
class RequestHandler(pyjsonrpc.HttpRequestHandler):
    """JSON-RPC request handler for the backend server."""

    @pyjsonrpc.rpcmethod
    def add(self, a, b):
        """Test method: print both operands and return their sum."""
        print("Service.py : add is called with %d and %d" % (a, b))
        return a + b

    @pyjsonrpc.rpcmethod
    def getNewsSummariesForUser(self, user_id, page_num):
        """Get news summaries for a user (delegates to ``operations``)."""
        return operations.getNewsSummariesForUser(user_id, page_num)

    @pyjsonrpc.rpcmethod
    def logNewsClickForUser(self, user_id, news_id):
        """Log user news clicks (delegates to ``operations``)."""
        return operations.logNewsClickForUser(user_id, news_id)


# Threading HTTP Server
http_server = pyjsonrpc.ThreadingHttpServer(
    server_address=(SERVER_HOST, SERVER_PORT),
    RequestHandlerClass=RequestHandler)

logger.info("Starting Backend HTTP server on %s:%d" %
            (SERVER_HOST, SERVER_PORT))

# Blocks: requests are served until the process is stopped.
http_server.serve_forever()
import pyjsonrpc
import sys

# The shared 'common' and 'configuration' directories sit one level above
# this file; they must be on sys.path before the imports that follow.
for _shared_dir in ('common', 'configuration'):
    sys.path.append(
        os.path.join(os.path.dirname(__file__), '..', _shared_dir))

from config_parser import config
from sys_log_client import logger

SERVER_HOST = str(config['customized_news_list']['host'])
SERVER_PORT = int(config['customized_news_list']['port'])


class RequestHandler(pyjsonrpc.HttpRequestHandler):
    """JSON-RPC request handler for the customized news list server."""

    @pyjsonrpc.rpcmethod
    def predict_news_click(self, user_id, news_description):
        """Predict News Click (delegates to ``news_click_predictor``)."""
        return news_click_predictor.predict_news_click(
            user_id, news_description)


# Threading HTTP Server
http_server = pyjsonrpc.ThreadingHttpServer(
    server_address=(SERVER_HOST, SERVER_PORT),
    RequestHandlerClass=RequestHandler)

logger.info("Starting customized news list server on %s:%d" %
            (SERVER_HOST, SERVER_PORT))

# Blocks: requests are served until the process is stopped.
http_server.serve_forever()
def getPreferenceForUser(self, user_id):
    """Return the user's news topics ordered from most to least preferred.

    Args:
        user_id: Value matched against the ``userId`` field of the stored
            preference model.

    Returns:
        The preference keys sorted by descending value, or ``[]`` when
        there is no stored model, the model holds no preference entries,
        or the highest and lowest values are close (the ranking then
        carries no information).
    """
    logger.debug("news_recommendation_service - getPreferenceForUser")
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': user_id})
    if model is None:
        return []

    # A model without preference entries has nothing to rank; return early
    # instead of failing on the [0] / [-1] lookups below.
    preference = model.get('preference')
    if not preference:
        return []

    ranked = sorted(preference.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    # If the first preference is the same as the last one, the preference
    # makes no sense.
    if isclose(float(ranked[0][1]), float(ranked[-1][1])):
        return []

    return [topic for topic, _ in ranked]


# Threading HTTP Server
http_server = pyjsonrpc.ThreadingHttpServer(
    server_address=(SERVER_HOST, SERVER_PORT),
    RequestHandlerClass=RequestHandler)

logger.info("Starting news recommendation service on %s:%d" %
            (SERVER_HOST, SERVER_PORT))

# Blocks: requests are served until the process is stopped.
http_server.serve_forever()
def on_any_event(self, event):
    """Reload the saved variables and the model after any watched event.

    Args:
        event: The event object supplied by the caller; it is not inspected.
    """
    logger.info("Model update detected. Loading new model.")

    # Wait MODEL_UPDATE_LAG_IN_SECONDS before touching the files.
    # NOTE(review): presumably this lets the writer finish -- confirm.
    time.sleep(MODEL_UPDATE_LAG_IN_SECONDS)

    # Variables first, then the model: loadModel reads what restoreVars sets.
    for reload_step in (restoreVars, loadModel):
        reload_step()
# TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved. # We have to call evaluate or predict at least once to make the restored Estimator work. train_df = df[0:400] x_train = train_df[1] x_train = np.array(list(vocab_processor.transform(x_train))) y_train = train_df[0] classifier.evaluate(x_train, y_train) logger.info("Model updated.") restoreVars() loadModel() logger.info("Model loaded") class ReloadModelHandler(FileSystemEventHandler): def on_any_event(self, event): # Reload model logger.info("Model update detected. Loading new model.") time.sleep(MODEL_UPDATE_LAG_IN_SECONDS) restoreVars() loadModel() class RequestHandler(pyjsonrpc.HttpRequestHandler): @pyjsonrpc.rpcmethod def classify(self, text): text_series = pd.Series([text])
def _new_preference_model(userId):
    """Build a model that gives every topic in NEWS_TOPICS the initial value."""
    return {
        'userId': userId,
        'preference': {topic: float(INITIAL_P) for topic in NEWS_TOPICS},
    }


def _apply_click_decay(preference, click_class):
    """Apply the time-decay update in place for one click on ``click_class``.

    Every other topic is scaled by ``(1 - ALPHA)``; the clicked topic is
    scaled the same way and then gains ``ALPHA``.  Raises ``KeyError`` when
    ``click_class`` is not a key of ``preference``.
    """
    old_p = preference[click_class]
    # Iterate over a snapshot of the keys because values are reassigned
    # inside the loop.
    for topic in list(preference):
        if topic != click_class:
            preference[topic] = float((1 - ALPHA) * preference[topic])
    preference[click_class] = float((1 - ALPHA) * old_p + ALPHA)


def _record_click_log(db, userId, news):
    """Insert a click log for (user, news) or refresh the existing ones."""
    collection = db[CLICK_LOGS_TABLE_NAME]
    query = {"$and": [{'userId': userId}, {'newsId': news['digest']}]}
    # .get() so that a news document without a 'description' field is
    # treated like one whose description is None.
    description = news.get('description')

    click_logs = collection.find(query)
    if click_logs.count() == 0:
        if description is None:
            logger.info('==== empty news description ==== ')
            return
        collection.insert_one({
            'userId': userId,
            'newsId': news['digest'],
            'description': description,
            'timestamp': datetime.utcnow(),
            'clicked': 1
        })
        logger.info("Click log processor: add click log")
        logger.info(description)
        return

    # The click was logged before: only move its timestamp forward.
    for click_log in click_logs:
        click_log['timestamp'] = datetime.utcnow()
        collection.replace_one(query, click_log, upsert=True)
    logger.info(
        "Click log processor: find duplicated click and update the time")
    logger.info(description)


def handle_message(msg):
    """Process one click message: update the preference model and click log.

    Messages that are not dicts, or that lack ``userId``, ``newsId`` or
    ``timestamp``, are ignored.  Otherwise the user's preference model is
    loaded (or created), a click metric is sent to graphite, the model is
    updated with the time-decay rule and written back, and the click is
    recorded in the click-log collection.

    Args:
        msg: The decoded message; expected to be a dict.
    """
    if not isinstance(msg, dict):
        return
    if any(key not in msg for key in ('userId', 'newsId', 'timestamp')):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If model not exists, create a new one
    if model is None:
        logger.debug(
            'Click log processor: Creating preference model for new user: %s'
            % userId)
        model = _new_preference_model(userId)

    logger.info(
        'Click log processor: Updating preference model for new user: %s' %
        userId)

    # Update model using time decaying method
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None or 'class' not in news
            or news['class'] not in NEWS_TOPICS):
        logger.error(
            "Click log prrocessor: news doesn't exist or news topic doesn't exist"
        )
        return

    click_class = news['class']

    # Send the metrics to graphite.  Dots and newlines are stripped from the
    # ids, and only the first word of the class is used, before the parts
    # are joined with '.'.
    metrics = 'backend.click.' + userId.replace(
        '.', '') + '.' + newsId.replace('.', '').replace(
            '\n', '') + '.' + click_class.split(' ')[0]
    graphite.send(metrics, 1)

    # Update the clicked class and decay the others.
    _apply_click_decay(model['preference'], click_class)

    # update to mongodb
    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId},
                                                model,
                                                upsert=True)

    # add news title to click log table
    _record_click_log(db, userId, news)
def __init__(self, host, port):
    """Store the Graphite endpoint and log it.

    Args:
        host: Endpoint host; rendered with ``%s`` in the log line.
        port: Endpoint port; rendered with ``%d`` in the log line, so it
            has to be numeric.
    """
    self.host, self.port = host, port
    logger.info("Graphite client is created at %s:%d" %
                (self.host, self.port))