def test_basic():
    """Smoke-test the Mongo client: drop, insert one document, drop again.

    The collection is expected to be empty after each drop and to hold
    exactly one document after the insert.
    """
    collection = client.get_db('test').testCollection

    # Start from a clean slate.
    collection.drop()
    assert collection.count() == 0

    # A single insert must be visible through count().
    collection.insert({'test': 1, 'hello': "world"})
    assert collection.count() == 1

    # Leave nothing behind for the next run.
    collection.drop()
    assert collection.count() == 0

    print('test_basic passed.')
def logNewsClickForUser(user_id, news_id):
    """Record a news click and publish it as a click-log task.

    The click is written to the click-log collection and then sent through
    ``cloudAMQP_client`` so the click can be processed downstream.

    Args:
        user_id: Identifier of the user who clicked.
        news_id: Identifier of the news item that was clicked.
    """
    # Read the clock once so the stored record and the queued message carry
    # the same timestamp.
    clicked_at = datetime.utcnow()

    db = mongo_client.get_db()
    db[CLICK_LOGS_TABLE_NAME].insert_one(
        {'userId': user_id, 'newsId': news_id, 'timestamp': clicked_at})

    # Send log task to machine learning service for prediction.
    # A separate dict is built on purpose: the driver adds an ``_id`` to the
    # inserted document, and the queue payload carries the timestamp as text
    # (presumably so the message can be serialized -- confirm with consumer).
    cloudAMQP_client.sendMessage(
        {'userId': user_id, 'newsId': news_id, 'timestamp': str(clicked_at)})
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for a user.

    The list of news digests shown to a user is cached in Redis under the
    user id. On a cache hit the requested page of digests is looked up in
    MongoDB; on a miss the newest ``NEWS_LIMIT`` news are loaded, their
    digests are cached with an expiry, and the page is sliced from them.

    Args:
        user_id: Identifier of the user; also used as the Redis key.
        page_num: 1-based page number (anything accepted by ``int()``).

    Returns:
        A list of news dicts for the page with the ``text`` field removed and
        ``time`` set to ``'today'`` for news published today. The list is
        passed through ``dumps`` / ``json.loads`` before being returned. It is
        empty when the page is out of range.
    """
    page_num = int(page_num)
    if page_num < 1:
        # Negative slice bounds would silently return news from the tail.
        return []

    # Convert once so both slice bounds are computed from an integer even if
    # the configured value is a string.
    batch_size = int(config['operations']['NEWS_LIST_BATCH_SIZE'])
    begin_index = (page_num - 1) * batch_size
    end_index = page_num * batch_size

    db = mongo_client.get_db()

    # Fetch the cache entry a single time: reading it twice leaves a window
    # in which the key can expire between the check and the load.
    cached_digests = redis_client.get(user_id)
    if cached_digests is not None:
        # NOTE(security): pickle.loads is only safe while this function is
        # the sole writer of the key; never point it at untrusted data.
        news_digests = pickle.loads(cached_digests)

        # If begin_index is out of range, this will return empty list;
        # If end_index is out of range (begin_index is within the range), this
        # will return all remaining news ids.
        sliced_news_digests = news_digests[begin_index:end_index]
        print(sliced_news_digests)

        sliced_news = list(db[NEWS_TABLE_NAME].find(
            {'digest': {'$in': sliced_news_digests}}))

        # A $in query does not return documents in the order of the listed
        # values, so restore the cached (newest-first) ordering.
        position = {digest: index
                    for index, digest in enumerate(sliced_news_digests)}
        sliced_news.sort(key=lambda news: position[news['digest']])
    else:
        total_news = list(
            db[NEWS_TABLE_NAME].find()
            .sort([('publishedAt', -1)])
            .limit(NEWS_LIMIT))
        # A real list (not a lazy map object) so it can always be pickled.
        total_news_digests = [news['digest'] for news in total_news]

        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(
            user_id, config['operations']['USER_NEWS_TIME_OUT_IN_SECONDS'])

        sliced_news = total_news[begin_index:end_index]

    # Get preference for the user.
    # NOTE: the top preference is not consumed at the moment; the per-news
    # 'Recommend' tagging that used it is disabled. The service call is kept
    # so this function's interaction with the service is unchanged.
    preference = news_recommendation_service_client.getPreferenceForUser(
        user_id)
    topPreference = preference[0] if preference else None  # noqa: F841

    today = datetime.today().date()
    for news in sliced_news:
        # Remove text field to save bandwidth (tolerate documents without it).
        news.pop('text', None)
        if news['publishedAt'].date() == today:
            news['time'] = 'today'

    return json.loads(dumps(sliced_news))
def handle_message(msg):
    """Deduplicate, classify and store one news task.

    The incoming text is compared against every stored news item published on
    the same calendar day. If any of them is more similar than the configured
    ``SAME_NEWS_SIMILARITY_THRESHOLD`` the task is dropped. Otherwise the task
    gets a parsed ``publishedAt``, a ``class`` (when it has a title) and is
    upserted into the news collection keyed by its ``digest``.

    Args:
        msg: The news task; read as a mapping with ``text``, ``publishedAt``,
            ``title`` and ``digest`` keys.
    """
    task = msg
    text = task['text']
    if text is None:
        return

    # Parse the publication time once; it is reused for the day window below
    # and for the stored document.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongo_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end,
        }
    }))

    if same_day_news_list:
        # Row/column 0 is the incoming text; the rest are the stored news.
        documents = [text] + [news['text'] for news in same_day_news_list]

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        threshold = config['news_deduper']['SAME_NEWS_SIMILARITY_THRESHOLD']
        rows, _ = pairwise_sim.shape
        # Column 0 holds each stored item's similarity to the incoming text.
        for row in range(1, rows):
            if pairwise_sim[row, 0] > threshold:
                print("Duplicated news. Ignore.")
                return

    task['publishedAt'] = published_at

    # Classify news by its title when one is available.
    title = task['title']
    if title is not None:
        task['class'] = news_topic_modeling_service_client.classify(title)

    db[NEWS_TABLE_NAME].replace_one(
        {'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    """Update a user's preference model from one click-log message.

    Messages that are not dicts, or that lack ``userId``, ``newsId`` or
    ``timestamp``, are ignored. A model with every class set to ``INITIAL_P``
    is created for users that have none. The clicked class is then moved
    towards 1 and every other class is decayed by ``(1 - ALPHA)`` before the
    model is upserted.

    The update is skipped (nothing is stored) when the clicked news cannot be
    found, has no ``class``, or has a class outside ``news_classes.classes``.

    Args:
        msg: Click-log message with ``userId``, ``newsId`` and ``timestamp``.
    """
    if msg is None or not isinstance(msg, dict):
        return

    if 'userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg:
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongo_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If model not exists, create a new one
    if model is None:
        print('Creating preference model for new user: %s' % userId)
        model = {
            'userId': userId,
            'preference': {news_class: float(INITIAL_P)
                           for news_class in news_classes.classes},
        }

    print('Updating preference model for new user: %s' % userId)

    # Update model using time decaying method
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})

    # Evaluate the three skip conditions with short-circuiting so that the
    # diagnostics below can never touch a missing document or a missing key.
    news_missing = news is None
    class_missing = news_missing or 'class' not in news
    class_unknown = class_missing or news['class'] not in news_classes.classes
    if class_unknown:
        print(news_missing)
        print(class_missing)
        print(class_unknown)
        print('Skipping processing...')
        return

    click_class = news['class']
    preference = model['preference']

    # Update the clicked one.
    old_p = preference[click_class]
    preference[click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked classes. Iterate over a snapshot of the keys because
    # the mapping is written to inside the loop.
    for news_class in list(preference):
        if news_class != click_class:
            preference[news_class] = float(
                (1 - ALPHA) * preference[news_class])

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one(
        {'userId': userId}, model, upsert=True)
def test_basic():
    """Check that handling one click stores a full preference model.

    Any existing model for ``test_user1`` is deleted first, a click message is
    passed to ``click_log_processor.handle_message``, and the stored model is
    then expected to exist with ``NUM_OF_CLASSES`` preference entries.
    """
    models = mongo_client.get_db()[PREFERENCE_MODEL_TABLE_NAME]

    # Remove leftovers from earlier runs.
    models.delete_many({"userId": "test_user1"})

    click_log_processor.handle_message({
        "userId": "test_user1",
        "newsId": "test_news",
        "timestamp": str(datetime.utcnow()),
    })

    stored = models.find_one({'userId': 'test_user1'})
    assert stored is not None
    assert len(stored['preference']) == NUM_OF_CLASSES

    print('test_basic passed')
def getPreferenceForUser(self, user_id):
    """Return the user's news classes ordered by preference, highest first.

    Args:
        user_id: Identifier of the user whose preference model is read.

    Returns:
        A list of class names sorted by descending preference value. An empty
        list is returned when the user has no model, when the model holds no
        preference entries, or when the highest and lowest values are close
        enough (per ``isclose``) that the ordering carries no information.
    """
    db = mongo_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': user_id})
    if model is None:
        return []

    # Guard against a model without preference data; indexing the first and
    # last entries below would otherwise fail.
    preference = model.get('preference')
    if not preference:
        return []

    sorted_tuples = sorted(
        preference.items(), key=operator.itemgetter(1), reverse=True)

    # If the first preference is same as the last one, the preference makes
    # no sense.
    highest = float(sorted_tuples[0][1])
    lowest = float(sorted_tuples[-1][1])
    if isclose(highest, lowest):
        return []

    return [news_class for news_class, _ in sorted_tuples]
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongo_client
import news_topic_modeling_service_client


def main():
    """Backfill the ``class`` field on stored news that do not have one.

    Every document in the ``news`` collection is visited and a running count
    is printed. Documents without ``class`` are classified from their
    ``description``, falling back to ``title``; documents with neither are
    left untouched.
    """
    db = mongo_client.get_db()
    collection = db['news']

    for count, news in enumerate(collection.find({}), 1):
        print(count)
        if 'class' in news:
            continue

        print('Populating classes...')
        description = news.get('description')
        if description is None:
            description = news.get('title')
        if description is None:
            # Nothing to classify from; leave the document as it is.
            continue

        topic = news_topic_modeling_service_client.classify(description)
        # Set only the new field on exactly this document instead of
        # rewriting (or upserting) the whole document by digest.
        collection.update_one(
            {'_id': news['_id']}, {'$set': {'class': topic}})


if __name__ == '__main__':
    main()