def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for a user, backed by a Redis cache."""
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    cached_digests = redis_client.get(user_id)
    if cached_digests is not None:
        # Deserialize the cached list of news digests.
        news_digests = pickle.loads(cached_digests)
        # If begin_index is out of range, slicing returns an empty list;
        # if only end_index is out of range, slicing returns all remaining ids.
        sliced_news_digests = news_digests[begin_index:end_index]
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find(
            {'digest': {'$in': sliced_news_digests}}))
    else:
        db = mongodb_client.get_db()
        # Sort by publishedAt descending (-1 means newest first).
        total_news = list(db[NEWS_TABLE_NAME].find().sort(
            [('publishedAt', -1)]).limit(NEWS_LIMIT))
        total_news_digests = [x['digest'] for x in total_news]
        # Redis stores strings/bytes, not JSON objects, so pickle the
        # digest list before caching it.
        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)
        sliced_news = total_news[begin_index:end_index]

    # Personalization hook (disabled): tag news matching the user's top
    # preferred class and strip the full text to save bandwidth.
    # preference = news_recommendation_service_client.getPreferenceForUser(user_id)
    # topPreference = preference[0] if preference else None
    # for news in sliced_news:
    #     del news['text']  # Remove text field to save bandwidth.
    #     if news['class'] == topPreference:
    #         news['reason'] = 'Recommend'
    #     if news['publishedAt'].date() == datetime.today().date():
    #         news['time'] = 'today'

    # dumps here is expected to be bson.json_util.dumps, which can
    # serialize Mongo-specific fields such as ObjectId and datetime.
    return json.loads(dumps(sliced_news))
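
# A minimal, self-contained sketch of the cache-aside pagination pattern used
# above, runnable against a local Redis. 'demo_user', the digest values, and
# the 60-second TTL are made-up stand-ins for the real constants.
import pickle
import redis

r = redis.Redis()  # assumes a local Redis on the default port
digests = ['d1', 'd2', 'd3', 'd4', 'd5']

r.set('demo_user', pickle.dumps(digests), ex=60)  # cache with a 60 s TTL
cached = r.get('demo_user')
page = pickle.loads(cached)[2:4] if cached else []
print(page)  # ['d3', 'd4']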
def test_basic():
    db = client.get_db('test')
    db.demo.drop()
    assert db.demo.estimated_document_count() == 0

    db.demo.insert_one({'test': 123})
    assert db.demo.estimated_document_count() == 1

    db.demo.drop()
    assert db.demo.estimated_document_count() == 0
    print('test passed')
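
# A minimal sketch of the mongodb_client helper these snippets assume,
# built on pymongo. The host, port, and default database name here are
# assumptions, not the project's actual configuration.
from pymongo import MongoClient

MONGO_DB_HOST = 'localhost'
MONGO_DB_PORT = 27017

mongo = MongoClient(MONGO_DB_HOST, MONGO_DB_PORT)

def get_db(db_name='demo_news'):
    return mongo[db_name]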
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update the user's preference model.
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If the model doesn't exist, create one with a uniform prior.
    if model is None:
        print('Creating preference model for new user: %s' % userId)
        new_model = {'userId': userId}
        # Preference probability for each topic.
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    print('Updating preference model for user: %s' % userId)

    # Update the model with a time-decay method.
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None or 'class' not in news
            or news['class'] not in news_classes.classes):
        # Skip clicks on unknown or unclassified news.
        return

    click_class = news['class']

    # Boost the clicked topic.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Decay all other topics.
    for i, prob in model['preference'].items():
        if i != click_class:
            model['preference'][i] = float((1 - ALPHA) * prob)

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one(
        {'userId': userId}, model, upsert=True)
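
# A worked sketch of the time-decay update above, assuming three topics and
# ALPHA = 0.1 (the constant's real value lives elsewhere in this repo). The
# clicked topic moves toward 1, every other topic decays toward 0, and the
# probabilities still sum to 1 after each update, since
# (1 - a) * sum_old + a = 1 when sum_old = 1.
ALPHA = 0.1
preference = {'Sports': 0.33, 'Politics': 0.33, 'Technology': 0.34}

def apply_click(preference, click_class, alpha=ALPHA):
    return {
        topic: (1 - alpha) * p + (alpha if topic == click_class else 0.0)
        for topic, p in preference.items()
    }

preference = apply_click(preference, 'Sports')
print(preference['Sports'])      # 0.9 * 0.33 + 0.1 = 0.397
print(sum(preference.values()))  # ~1.0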
def logNewsClickForUser(user_id, news_id):
    # Log the click in MongoDB, which can store the datetime natively.
    message = {'userId': user_id,
               'newsId': news_id,
               'timestamp': datetime.utcnow()}
    db = mongodb_client.get_db()
    db[CLICK_LOGS_TABLE_NAME].insert_one(message)

    # Send the log task to the machine learning service for prediction.
    # Rebuild the dict (insert_one added an '_id' to the first one) and
    # stringify the timestamp, since datetime is not JSON-serializable.
    message = {'userId': user_id,
               'newsId': news_id,
               'timestamp': str(datetime.utcnow())}
    cloudAMQP_client.send_message(message)
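
# Why the timestamp is stringified before queueing: json.dumps, which a
# JSON-based AMQP client would typically call on the message, rejects raw
# datetime objects. A quick demonstration:
import json
from datetime import datetime

try:
    json.dumps({'timestamp': datetime.utcnow()})
except TypeError as e:
    print(e)  # Object of type datetime is not JSON serializable

print(json.dumps({'timestamp': str(datetime.utcnow())}))  # works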
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task.get('text')
    if text is None:
        return

    # Fetch all news published on the same day, based on publishedAt.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find(
        {'publishedAt': {'$gte': published_at_day_begin,
                         '$lt': published_at_day_end}}))

    if recent_news_list:
        documents = [news['text'] for news in recent_news_list]
        documents.insert(0, text)

        # Calculate the pairwise TF-IDF similarity matrix; row 0 is the
        # incoming article, so column 0 holds its similarity to each
        # existing article.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news; ignore it.
                print('Duplicated news. Ignore.')
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news (disabled):
    # title = task['title']
    # if title is not None:
    #     topic = news_topic_modeling_service_client.classify(title)
    #     task['class'] = topic

    # Upsert: replace the document if the digest exists, otherwise insert.
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
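
# A minimal, self-contained sketch of the similarity check above, with toy
# documents standing in for the MongoDB query results. TfidfVectorizer
# L2-normalizes each row by default, so tfidf * tfidf.T is a matrix of
# cosine similarities.
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    'stocks rally as markets close higher today',  # incoming article
    'markets close higher today as stocks rally',  # near-duplicate
    'local team wins championship game',           # unrelated
]
tfidf = TfidfVectorizer().fit_transform(documents)
pairwise_sim = tfidf * tfidf.T
print(pairwise_sim[1, 0])  # ~1.0 -> would be flagged as a duplicate
print(pairwise_sim[2, 0])  # 0.0  -> distinct article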
def test_logNewsClickForUser_basic():
    db = mongodb_client.get_db()
    # Clean the table first so the test only sees its own record.
    db[operations.CLICK_LOGS_TABLE_NAME].delete_many({'userId': 'test'})

    operations.logNewsClickForUser('test', 'test_news')

    record = list(db[operations.CLICK_LOGS_TABLE_NAME].find().sort(
        [('timestamp', -1)]).limit(1))[0]
    assert record is not None
    assert record['userId'] == 'test'
    assert record['newsId'] == 'test_news'
    assert record['timestamp'] is not None
    db[operations.CLICK_LOGS_TABLE_NAME].delete_many({'userId': 'test'})

    # Verify the message has been sent to the queue.
    msg = cloudAMQP_client.get_message()
    assert msg is not None
    assert msg['userId'] == 'test'
    assert msg['newsId'] == 'test_news'
    assert msg['timestamp'] is not None
DEDUPE_NEWS_TASK_QUEUE_URL = cc.CLOUDAMQP_URL
DEDUPE_NEWS_TASK_QUEUE_NAME = 'news-manager-dedupe-task'
SLEEP_TIME_IN_SECONDS = 1
SAME_NEWS_SIMILARITY_THRESHOLD = 0.9
NEWS_TABLE_NAME = 'news_col'

logger_format = '%(asctime)s - %(message)s'
logging.basicConfig(format=logger_format)
logger = logging.getLogger('news_deduper')
logger.setLevel(logging.DEBUG)

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)
db = mongodb_client.get_db('demo_news')


def handle_message(news):
    text = news['text']
    description = news['description']
    if description is None:
        # Fall back to the title when there is no description.
        description = news['title']
    news['publishedAt'] = parser.parse(news['publishedAt'])
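
# A minimal polling loop for this deduper: messages should be consumed here
# rather than once at import time. This is a sketch that assumes
# CloudAMQPClient exposes get_message(), the spelling the tests above use,
# and relies on time.sleep between polls.
import time

def run():
    while True:
        if cloudAMQP_client is not None:
            msg = cloudAMQP_client.get_message()
            if msg is not None:
                try:
                    handle_message(msg)
                except Exception as e:
                    logger.warning(e)
            # Back off between polls to avoid hammering the queue.
            time.sleep(SLEEP_TIME_IN_SECONDS)

if __name__ == '__main__':
    run()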