def process_message(msg, similarity_threshold=0.8):
    """Store a news item unless it duplicates a story from the same day.

    The item's text is compared against the text of every stored news
    document whose ``publishedAt`` falls on the same calendar day, using the
    product of the TF-IDF matrix with its transpose as the similarity score.
    If any stored document scores above ``similarity_threshold`` the item is
    dropped. Otherwise it is classified and upserted, keyed by ``digest``.

    Args:
        msg: Dict describing the news item. Reads ``text``, ``publishedAt``,
            ``description``, ``title`` and ``digest``. The dict is mutated:
            ``publishedAt`` is replaced by its parsed value and ``class`` is
            added when there is text to classify.
        similarity_threshold: Score above which the item is treated as a
            duplicate. Defaults to 0.8.

    Returns:
        None. The result is the database write (or the lack of one).
    """
    task = msg
    text = task['text']
    if text is None:
        return

    # Half-open [midnight, next midnight) window around the publication day.
    published_at = parser.parse(task['publishedAt'])
    day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day)
    day_end = day_begin + datetime.timedelta(days=1)

    db = db_client.get_db()
    same_day_news = list(db[NEWS_TABLE_NAME].find(
        {'publishedAt': {'$gte': day_begin, '$lt': day_end}}))

    if same_day_news:
        # The new item goes first so that column 0 of the similarity matrix
        # holds its score against every stored document.
        documents = [text]
        documents.extend(news['text'] for news in same_day_news)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        # toarray() is the documented spelling of the `.A` shorthand.
        print(pairwise_sim.toarray())

        rows, _ = pairwise_sim.shape
        # Row 0 is the new item compared with itself, so start at row 1.
        if any(pairwise_sim[row, 0] > similarity_threshold
               for row in range(1, rows)):
            print("Ignore duplicated news")
            return

    # Reuse the value parsed above instead of parsing the string again.
    task['publishedAt'] = published_at

    # Classify on the description and fall back to the title. The previous
    # code checked the title but passed the description, so a missing
    # description reached the classifier as None.
    classify_source = task.get('description')
    if classify_source is None:
        classify_source = task.get('title')
    if classify_source is not None:
        task['class'] = model_client.classify(classify_source)

    db[NEWS_TABLE_NAME].replace_one(
        {'digest': task['digest']}, task, upsert=True)
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for a user.

    The list of news digests for a user is cached in Redis under
    ``user_id``. On a cache hit the requested page of digests is looked up
    in the database. On a miss the latest ``NEWS_LIMIT`` documents (sorted
    by ``publishedAt`` descending) are loaded, their digests are cached for
    ``USER_NEWS_TIME_OUT_IN_SECONDS``, and the page is sliced from them.

    Each returned item has its ``text`` field removed, gets ``time='today'``
    when it was published today, and gets ``reason='Recommend'`` when its
    ``class`` equals the user's top preference.

    Args:
        user_id: Key identifying the user, for both the Redis cache and the
            preference lookup.
        page_num: 1-based page number; anything accepted by ``int()``.

    Returns:
        The page of news documents after a ``dumps`` / ``json.loads``
        round trip (a list of plain dicts).

    Raises:
        ValueError: If ``page_num`` cannot be converted to an int.
    """
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    db = database_client.get_db()

    # Read the cache entry exactly once. Fetching it a second time could
    # return None if the key expired in between, and pickle.loads(None)
    # raises TypeError.
    cached_digests = redis_client.get(user_id)
    if cached_digests is not None:
        # NOTE(review): pickle.loads can execute arbitrary code on crafted
        # input. This is only safe while Redis is written solely by trusted
        # code - confirm.
        total_news_digests = pickle.loads(cached_digests)
        sliced_news_digests = total_news_digests[begin_index:end_index]
        sliced_news = list(db[NEWS_TABLE_NAME].find(
            {'digest': {'$in': sliced_news_digests}}))
        # A $in query does not return documents in the order of the list,
        # so restore the cached ordering for consistent pages.
        position = {
            digest: index for index, digest in enumerate(sliced_news_digests)
        }
        sliced_news.sort(
            key=lambda news: position.get(news['digest'], len(position)))
    else:
        total_news = list(
            db[NEWS_TABLE_NAME].find()
            .sort([('publishedAt', -1)])
            .limit(NEWS_LIMIT))
        total_news_digests = [news['digest'] for news in total_news]
        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)
        sliced_news = total_news[begin_index:end_index]

    # Get preference for the user; the first entry is the top preference.
    preference = recommender_system_client.getPreferenceForUser(user_id)
    top_preference = preference[0] if preference else None
    print('top preference %s' % top_preference)

    today = datetime.today().date()
    for news in sliced_news:
        # Remove text field to save bandwidth; tolerate documents without it.
        news.pop('text', None)
        if news['publishedAt'].date() == today:
            news['time'] = 'today'
        # Documents may not have been classified yet, and an unclassified
        # document must not match a user who has no preference.
        if top_preference is not None and news.get('class') == top_preference:
            news['reason'] = "Recommend"

    return json.loads(dumps(sliced_news))
def getPreferenceForUser(user_id):
    """Get user's preference in an ordered class list.

    Looks up the user's preference model and returns its class names sorted
    by score, highest first.

    Args:
        user_id: Value matched against the ``user_id`` field of the
            preference model collection.

    Returns:
        A list of class names ordered from most to least preferred. An empty
        list is returned when the user has no model, when the model has no
        preference entries, or when the highest and lowest scores are close
        according to ``isclose`` (no class stands out).
    """
    print("userid: %s" % user_id)
    db = database_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE].find_one({'user_id': user_id})
    print('model')
    print(model)
    if model is None:
        return []

    preference = model['preference']
    if not preference:
        # With nothing to rank, indexing the first/last score below would
        # raise IndexError.
        return []

    ranked = sorted(
        preference.items(), key=operator.itemgetter(1), reverse=True)
    highest_score = float(ranked[0][1])
    lowest_score = float(ranked[-1][1])
    if isclose(highest_score, lowest_score):
        return []
    return [class_name for class_name, _ in ranked]
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'helpers'))

import database_client
import model_server_client


def main():
    """Backfill the ``class`` field on news documents that lack it.

    Walks every document in the ``news`` collection, printing a running
    count. Documents without a ``class`` key are classified from their
    description (or their title when there is no description) and written
    back, keyed by ``digest``.
    """
    db = database_client.get_db()
    count = 0
    for news in db['news'].find({}):
        count += 1
        print(count)
        if 'class' in news:
            continue

        print('Populating classes...')
        # .get() tolerates documents that lack the field entirely.
        description = news.get('description')
        if description is None:
            description = news.get('title')
        if description is None:
            # Nothing to classify; do not hand None to the classifier.
            continue

        news['class'] = model_server_client.classify(description)
        db['news'].replace_one({'digest': news['digest']}, news, upsert=True)


if __name__ == '__main__':
    main()
def test_basic():
    """Check that the ``news`` collection holds the expected document count.

    The expected value (1495) is specific to the data set this test was
    written against.
    """
    db = client.get_db('news')
    # Count once and reuse the value so the printed number is the one that
    # gets asserted.
    # NOTE(review): if db.news is a PyMongo collection, count() no longer
    # exists in PyMongo 4; count_documents({}) is the replacement - confirm
    # the driver version.
    news_count = db.news.count()
    print(news_count)
    assert news_count == 1495
    print('passed')