def process_message(msg):
    """Store one news task unless it duplicates an article from the same day.

    The task's text is vectorised together with the text of every stored
    article whose ``publishedAt`` falls on the same calendar day. If any
    stored article has a similarity above ``0.8`` with the task, the task is
    dropped. Otherwise ``publishedAt`` is replaced by its parsed datetime, a
    ``class`` is attached when the task has a title, and the task is upserted
    into ``NEWS_TABLE_NAME`` keyed by ``digest``.

    Args:
        msg: Task dictionary. Reads ``text``, ``publishedAt``, ``title``,
            ``description`` and ``digest``. The dictionary is mutated in
            place (``publishedAt`` and possibly ``class``).

    Returns:
        None. Messages that are not dictionaries, or that carry no text,
        are ignored.
    """
    if not isinstance(msg, dict):
        # Nothing usable to deduplicate or store.
        return

    task = msg
    text = task.get('text')
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    # Half-open window [midnight, next midnight) around the publication day.
    published_at_day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = db_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end,
        }
    }))

    if same_day_news_list:
        # Row/column 0 is the incoming task; the stored articles follow.
        documents = [text]
        documents.extend(news['text'] for news in same_day_news_list)

        # With the vectoriser's default L2 row normalisation, multiplying the
        # matrix by its transpose yields pairwise cosine similarities.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        # ``.toarray()`` replaces the deprecated ``.A`` shorthand.
        print(pairwise_sim.toarray())

        for row in range(1, pairwise_sim.shape[0]):
            # Anything above 0.8 is treated as the same story.
            if pairwise_sim[row, 0] > 0.8:
                print("Ignore duplicated news")
                return

    # Reuse the value parsed above instead of parsing the string again.
    task['publishedAt'] = published_at

    title = task.get('title')
    if title is not None:
        # Classify on the description, falling back to the title when the
        # description is absent, so the classifier never receives None.
        description = task.get('description')
        if description is None:
            description = title
        task['class'] = model_client.classify(description)

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for a user.

    The ordered list of news digests is cached per user in Redis. On a cache
    miss the latest ``NEWS_LIMIT`` articles (sorted by ``publishedAt``
    descending) are loaded, their digests are cached for
    ``USER_NEWS_TIME_OUT_IN_SECONDS`` seconds, and the requested page is
    sliced from them. On a cache hit the page's digests are looked up in the
    database and returned in the cached order.

    Each returned article has its ``text`` field removed, gets
    ``time = 'today'`` when it was published today, and gets
    ``reason = "Recommend"`` when its ``class`` equals the user's top
    preference.

    Args:
        user_id: Key identifying the user in Redis and in the recommender.
        page_num: 1-based page number; anything accepted by ``int()``.

    Returns:
        A list of JSON-compatible dictionaries, one per article on the page.
    """
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    db = database_client.get_db()

    # Read the cache entry once: checking and then re-reading would break if
    # the key expired between the two calls.
    cached_digests = redis_client.get(user_id)

    if cached_digests is not None:
        # NOTE(review): pickle.loads can execute arbitrary code; this is only
        # safe while Redis holds nothing but values written by this service.
        total_news_digests = pickle.loads(cached_digests)
        sliced_news_digests = total_news_digests[begin_index:end_index]

        news_by_digest = {
            news['digest']: news
            for news in db[NEWS_TABLE_NAME].find(
                {'digest': {'$in': sliced_news_digests}})
        }
        # ``$in`` does not return documents in the order of the supplied
        # list, so restore the cached (newest-first) order explicitly.
        sliced_news = [
            news_by_digest[digest]
            for digest in sliced_news_digests
            if digest in news_by_digest
        ]
    else:
        total_news = list(db[NEWS_TABLE_NAME].find().sort([
            ('publishedAt', -1)
        ]).limit(NEWS_LIMIT))
        total_news_digests = [news['digest'] for news in total_news]

        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)

        sliced_news = total_news[begin_index:end_index]

    # Get preference for the user; the first entry is the strongest one.
    preference = recommender_system_client.getPreferenceForUser(user_id)
    top_preference = preference[0] if preference else None
    print('top preference %s' % top_preference)

    today = datetime.today().date()
    for news in sliced_news:
        # Remove text field to save bandwidth (it may already be absent).
        news.pop('text', None)
        if news['publishedAt'].date() == today:
            news['time'] = 'today'
        # Unclassified news must not match a missing preference (None == None).
        if top_preference is not None and news.get('class') == top_preference:
            news['reason'] = "Recommend"

    # Round-trip through ``dumps`` so the documents become plain JSON types.
    return json.loads(dumps(sliced_news))
def getPreferenceForUser(user_id):
    """Get user's preference in an ordered class list.

    Looks up the user's document in ``PREFERENCE_MODEL_TABLE`` and orders the
    keys of its ``preference`` mapping by value, highest first.

    Args:
        user_id: Value matched against the ``user_id`` field of the table.

    Returns:
        The preference keys sorted by descending value. An empty list is
        returned when the user has no model, when the model has no
        preferences, or when the highest and lowest values are close enough
        that no class stands out.
    """
    print("userid: %s" % user_id)
    db = database_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE].find_one({'user_id': user_id})
    print('model')
    print(model)
    if model is None:
        return []

    preference = model.get('preference')
    if not preference:
        # No scores to rank; indexing the sorted values below would fail.
        return []

    sorted_tuples = sorted(preference.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

    highest = sorted_tuples[0][1]
    lowest = sorted_tuples[-1][1]
    # When the extremes match, every class has the same score and the
    # ordering carries no information.
    if isclose(float(highest), float(lowest)):
        return []

    return [name for name, _ in sorted_tuples]
# Example no. 4
# 0
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'helpers'))

import database_client
import model_server_client

if __name__ == '__main__':
    # Backfill script: walk every document in the 'news' collection and add
    # a 'class' field to those that do not have one yet.
    news_collection = database_client.get_db()['news']

    for seen, article in enumerate(news_collection.find({}), start=1):
        # Running counter of documents visited so far.
        print(seen)

        if 'class' in article:
            continue

        print('Populating classes...')
        # Classify on the description, or on the title when it is None.
        summary = article['description']
        if summary is None:
            summary = article['title']
        article['class'] = model_server_client.classify(summary)

        news_collection.replace_one(
            {'digest': article['digest']},
            article,
            upsert=True,
        )
def test_basic():
    """Check that the ``news`` collection holds the expected 1495 documents."""
    db = client.get_db('news')
    # Query the count once so the printed and asserted values cannot differ.
    # NOTE(review): Collection.count() is gone in newer PyMongo releases;
    # confirm the installed version or switch to count_documents({}).
    news_count = db.news.count()
    print(news_count)
    assert news_count == 1495
    print('passed')