def test_logNewsClickForUser_basic():
    db = AWS_mongodb_client.get_db()
    db[MONGODB_CLICK_LOGS_TABLE_NAME].delete_many({"userId": "test"})

    operations.logNewsClickForUser('test', 'test_news')

    # Verify the click log was written into MongoDB:
    # get the most recent record.
    record = list(db[MONGODB_CLICK_LOGS_TABLE_NAME].find().sort([
        ('timestamp', -1)
    ]).limit(1))[0]

    assert record is not None
    assert record['userId'] == 'test'
    assert record['newsId'] == 'test_news'
    assert record['timestamp'] is not None

    db[MONGODB_CLICK_LOGS_TABLE_NAME].delete_many({"userId": "test"})

    # Verify the message can be received by the Kafka consumer.
    for msg in Log_kafka_consumer:
        dumped_msg = json.loads(msg.value)
        assert dumped_msg['userId'] == 'test'
        assert dumped_msg['newsId'] == 'test_news'
        # Stop after the first message; otherwise the consumer iterator
        # blocks forever waiting for more messages.
        break

    print('test_logNewsClickForUser_basic passed!')
def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * MONGODB_NEWS_LIST_BATCH_SIZE
    end_index = page_num * MONGODB_NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []

    if AWS_redis_client.get(user_id) is not None:
        # Get the cached news digests (VALUE) for this user_id (KEY).
        news_digests = pickle.loads(AWS_redis_client.get(user_id))

        # If begin_index is out of range, slicing returns an empty list;
        # if only end_index is out of range, it returns all remaining
        # news digests.
        sliced_news_digests = news_digests[begin_index:end_index]
        print(sliced_news_digests)

        db = AWS_mongodb_client.get_db()
        sliced_news = list(db[MONGODB_NEWS_TABLE_NAME].find(
            {'digest': {'$in': sliced_news_digests}}))
    else:
        db = AWS_mongodb_client.get_db()
        # Sort by publish time in descending order (-1).
        total_news = list(db[MONGODB_NEWS_TABLE_NAME].find().sort(
            [('publishedAt', -1)]).limit(MONGODB_NEWS_LIMIT))
        # Materialize the map into a list: a bare map object cannot be
        # pickled in Python 3.
        total_news_digests = list(map(lambda x: x['digest'], total_news))

        AWS_redis_client.set(user_id, pickle.dumps(total_news_digests))
        AWS_redis_client.expire(user_id, REDIS_USER_NEWS_TIME_OUT_IN_SECONDS)

        sliced_news = total_news[begin_index:end_index]

    # Get the preference for the user.
    preference = news_recommendation_service_client.getPreferenceForUser(user_id)
    topPreference = None
    if preference is not None and len(preference) > 0:
        topPreference = preference[0]

    for news in sliced_news:
        # Remove the text field to save bandwidth.
        if 'text' in news:
            del news['text']
        # Not every stored news item has a class yet, so use .get().
        if news.get('class') == topPreference:
            news['reason'] = 'Recommend'
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'

    return json.loads(dumps(sliced_news))
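# A minimal standalone sketch of the pagination math used above, assuming
# a hypothetical batch size of 10 and fake digests; the names here are
# illustrative only, not the project's real configuration.
MONGODB_NEWS_LIST_BATCH_SIZE = 10                     # assumed value
news_digests = ['digest-%d' % i for i in range(25)]  # 25 cached digests

for page_num in (1, 2, 3, 4):
    begin_index = (page_num - 1) * MONGODB_NEWS_LIST_BATCH_SIZE
    end_index = page_num * MONGODB_NEWS_LIST_BATCH_SIZE
    # Pages 1-2 return 10 digests each, page 3 the remaining 5,
    # and page 4 an empty list, matching the slicing comment above.
    print(page_num, news_digests[begin_index:end_index])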
def test_basic():
    db = client.get_db('test')
    db.demo.drop()
    assert db.demo.count_documents({}) == 0

    # insert_one/count_documents replace the deprecated insert/count.
    db.demo.insert_one({"test": 123})
    assert db.demo.count_documents({}) == 1

    db.demo.drop()
    assert db.demo.count_documents({}) == 0

    print('test_basic passed!')
def logNewsClickForUser(user_id, news_id):
    message = {'userId': user_id,
               'newsId': news_id,
               'timestamp': datetime.utcnow()}

    # Persist the click log (userId, newsId, timestamp) in MongoDB.
    db = AWS_mongodb_client.get_db()
    db[MONGODB_CLICK_LOGS_TABLE_NAME].insert_one(message)

    # Send the log task to the machine learning service for prediction.
    message = {'userId': user_id, 'newsId': news_id}
    AWS_Log_kafka_producer.send(
        topic=AWS_KAFKA_LOG_CLICKS_TASK_QUEUE,
        value=json.dumps(message),
        # timestamp_ms expects an integer in milliseconds, not the
        # fractional seconds returned by time.time().
        timestamp_ms=int(round(time.time() * 1000)))
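# A minimal sketch of the consuming side of the click-log queue above,
# assuming kafka-python and a broker at localhost:9092; the topic string
# and broker address are placeholders, not the project's real settings.
import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'log-clicks-task-queue',            # hypothetical topic name
    bootstrap_servers='localhost:9092',  # hypothetical broker address
    value_deserializer=lambda m: json.loads(m.decode('utf-8')))

for msg in consumer:
    # Each message mirrors what logNewsClickForUser produced.
    print(msg.value['userId'], msg.value['newsId'])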
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    if 'userId' not in msg or 'newsId' not in msg:
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update the user's preference model.
    db = AWS_mongodb_client.get_db()
    model = db[MONGODB_PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})
    print(model)

    # If the model does not exist yet, create one with a uniform prior:
    # every news class starts at INITIAL_P.
    if model is None:
        print('Creating preference model for new user: %s' % userId)
        new_model = {'userId': userId}
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    print('Updating preference model for user: %s' % userId)

    # Update the model using the time-decay method.
    news = db[MONGODB_NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None
            or 'class' not in news
            or news['class'] not in news_classes.classes):
        print('Skipping processing...')
        return

    click_class = news['class']  # the class of the clicked news

    # Boost the clicked class.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Decay every class that was not clicked
    # (iteritems() is Python 2 only; iterate keys instead).
    for i in model['preference']:
        if i != click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    db[MONGODB_PREFERENCE_MODEL_TABLE_NAME].replace_one(
        {'userId': userId}, model, upsert=True)
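# A toy check of the time-decay update above: boosting the clicked class
# and decaying the rest keeps the probabilities summing to 1. The ALPHA
# value and the three-class preference are illustrative, not the
# project's configuration.
ALPHA = 0.1
preference = {'Politics': 0.4, 'Sports': 0.4, 'Technology': 0.2}
click_class = 'Sports'

preference[click_class] = (1 - ALPHA) * preference[click_class] + ALPHA
for c in preference:
    if c != click_class:
        preference[c] = (1 - ALPHA) * preference[c]

print(preference)                # {'Politics': 0.36, 'Sports': 0.46, 'Technology': 0.18}
print(sum(preference.values()))  # 1.0, up to floating-point error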
def getPreferenceForUser(self, user_id):
    db = AWS_mongodb_client.get_db()
    model = db[MONGODB_PREFERENCE_MODEL_TABLE_NAME].find_one(
        {'userId': user_id})
    if model is None:
        return []

    # Sort news classes by preference probability, descending.
    sorted_tuples = sorted(model['preference'].items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    sorted_list = [x[0] for x in sorted_tuples]
    sorted_value_list = [x[1] for x in sorted_tuples]

    # If the first preference is (almost) the same as the last one, the
    # preference carries no signal.
    if isclose(float(sorted_value_list[0]), float(sorted_value_list[-1])):
        return []

    return sorted_list
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg

    # Extract the text; str(x.encode('utf-8')) would produce a
    # "b'...'" string in Python 3, so use the field directly.
    text = task.get('text')
    if text is None:
        return

    # Get all recent news published on the same day.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = AWS_mongodb_client.get_db()
    same_day_news_list = list(db[MONGODB_NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    # If news already exists in the db for that day, compute similarity
    # against it; only store this news if it is not a duplicate.
    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        # Calculate the similarity matrix: learn the vocabulary, build
        # the term-document matrix, then take pairwise dot products.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print(pairwise_sim.A)  # display the sparse result as a dense array

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print("Duplicated news. Ignore.")
                return

    # Convert publishedAt from a string to a datetime before storing.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify the news by its title and store the topic in 'class'.
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    db[MONGODB_NEWS_TABLE_NAME].replace_one(
        {'digest': task['digest']}, task, upsert=True)
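# A standalone sketch of the TF-IDF duplicate check above on toy strings,
# assuming scikit-learn is installed; the 0.9 threshold is illustrative
# (the real value comes from SAME_NEWS_SIMILARITY_THRESHOLD).
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    'breaking news about the stock market today',
    'breaking news about the stock market today',  # near-duplicate
    'local team wins the championship game',
]
tfidf = TfidfVectorizer().fit_transform(documents)
# Rows are L2-normalized by default, so tfidf * tfidf.T gives cosine
# similarities between documents.
pairwise_sim = tfidf * tfidf.T

print(pairwise_sim.A)            # row 1 vs. row 0 is ~1.0; row 2 vs. row 0 is ~0
print(pairwise_sim[1, 0] > 0.9)  # True -> would be ignored as a duplicate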
import os
import sys

# Import the common package in the parent directory.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

from common import AWS_mongodb_client
from common import news_topic_modeling_service_client

if __name__ == '__main__':
    db = AWS_mongodb_client.get_db()
    cursor = db['news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print(count)
        if 'class' not in news:
            print('Populating classes...')
            title = news['title']
            topic = news_topic_modeling_service_client.classify(title)
            news['class'] = topic
            db['news'].replace_one({'digest': news['digest']}, news, upsert=True)