Esempio n. 1
0
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news documents for a user.

    The ordered list of news digests shown to a user is cached in Redis
    (pickled) under ``user_id`` for USER_NEWS_TIME_OUT_IN_SECONDS, so that
    successive pages are sliced from the same snapshot.  On a cache miss the
    newest NEWS_LIMIT documents (sorted by ``publishedAt`` descending) are
    read from MongoDB and their digests are cached.

    Args:
        user_id: Redis key under which the user's digest list is cached.
        page_num: 1-based page number; any value accepted by ``int()``.

    Returns:
        A list with at most NEWS_LIST_BATCH_SIZE news documents, serialised
        with ``dumps`` and parsed back with ``json.loads``.  The list is
        empty when the page is out of range.
    """
    page_num = int(page_num)
    if page_num < 1:
        # Pages are 1-based; a smaller value would yield negative slice
        # indices and return items counted from the end of the list.
        return []

    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    db = mongodb_client.get_db()

    # Read the cache exactly once: with a separate "exists" check the key
    # could expire before the second read and pickle.loads would get None.
    cached_digests = redis_client.get(user_id)
    if cached_digests is not None:
        # NOTE(review): pickle.loads trusts whatever is stored in Redis;
        # confirm the cache is only writable by this service.
        news_digests = pickle.loads(cached_digests)

        # If begin_index is out of range this is an empty list; if only
        # end_index is out of range it holds all remaining digests.
        sliced_news_digests = news_digests[begin_index:end_index]
        matched_news = db[NEWS_TABLE_NAME].find(
            {'digest': {'$in': sliced_news_digests}})

        # '$in' does not return documents in the order of the given list,
        # so restore the cached (newest first) order explicitly.
        position = {
            digest: index
            for index, digest in enumerate(sliced_news_digests)
        }
        sliced_news = sorted(
            matched_news, key=lambda news: position[news['digest']])
    else:
        # -1 sorts descending, i.e. the latest news first.
        total_news = list(db[NEWS_TABLE_NAME].find().sort([
            ('publishedAt', -1)
        ]).limit(NEWS_LIMIT))
        total_news_digests = [news['digest'] for news in total_news]

        # Redis stores bytes/strings, so the digest list is pickled.
        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)

        sliced_news = total_news[begin_index:end_index]

    # Per-user recommendation tagging is currently disabled; documents are
    # returned as stored.
    return json.loads(dumps(sliced_news))
Esempio n. 2
0
def test_basic():
    """Smoke-test drop / insert_one / count on the ``demo`` collection.

    Uses the database returned by ``client.get_db('test')`` and leaves the
    ``demo`` collection dropped afterwards.
    """
    db = client.get_db('test')
    db.demo.drop()
    # Check the collection that was just dropped (the original checked
    # ``db.test``, an unrelated collection).
    assert db.demo.estimated_document_count() == 0
    db.demo.insert_one({'test': 123})
    assert db.demo.estimated_document_count() == 1
    db.demo.drop()
    assert db.demo.estimated_document_count() == 0
    print('test passed')
Esempio n. 3
0
def handle_message(msg):
    """Update a user's topic-preference model from one click message.

    The message must be a dict containing 'userId', 'newsId' and
    'timestamp'; anything else is ignored.  The preference of the clicked
    news class is moved towards 1 and every other class is decayed:

        clicked:  p = (1 - ALPHA) * p + ALPHA
        others:   p = (1 - ALPHA) * p

    The model is then upserted into PREFERENCE_MODEL_TABLE_NAME.  Nothing is
    written when the news document is missing or has no known 'class'.

    Args:
        msg: The click message (expected to be a dict).

    Returns:
        None.
    """
    if not isinstance(msg, dict):
        return

    if any(key not in msg for key in ('userId', 'newsId', 'timestamp')):
        return

    userId = msg['userId']
    newsId = msg['newsId']
    print('newsid', newsId)

    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # First click of this user: start with the same preference for every
    # known class.
    if model is None:
        print('Creating preference model for new user: %s' % userId)
        model = {
            'userId': userId,
            'preference': {
                i: float(INITIAL_P) for i in news_classes.classes
            },
        }

    # This line runs for existing users as well, so it must not say "new".
    print('Updating preference model for user: %s' % userId)

    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None
            or 'class' not in news
            or news['class'] not in news_classes.classes):
        # Unknown or unclassified news: the model cannot be updated.
        return

    click_class = news['class']
    preference = model['preference']

    # A stored model may lack the clicked class (e.g. the class list grew
    # after the model was created); start it from the initial value instead
    # of failing with KeyError.
    old_p = preference.get(click_class, float(INITIAL_P))
    preference[click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Decay every class that was not clicked.  Only existing keys are
    # reassigned, so iterating the dict while updating it is safe.
    for i in preference:
        if i != click_class:
            preference[i] = float((1 - ALPHA) * preference[i])

    print("model", model)
    db[PREFERENCE_MODEL_TABLE_NAME].replace_one(
        {'userId': userId}, model, upsert=True)
Esempio n. 4
0
def logNewsClickForUser(user_id, news_id):
    """Record a news click in MongoDB and forward it to the message queue.

    The click is stored in CLICK_LOGS_TABLE_NAME with a datetime timestamp,
    and the same event is sent through ``cloudAMQP_client`` with the
    timestamp converted to a string.

    Args:
        user_id: Identifier of the user who clicked.
        news_id: Identifier of the clicked news item.
    """
    # Take the timestamp once so the stored record and the queued message
    # describe the same instant.
    timestamp = datetime.utcnow()

    db = mongodb_client.get_db()
    # insert_one() replaces the legacy insert().  It adds an '_id' field to
    # the dict it receives, so the queue message is built separately below.
    db[CLICK_LOGS_TABLE_NAME].insert_one({
        'userId': user_id,
        'newsId': news_id,
        'timestamp': timestamp,
    })

    # Send the click to the consumer of the queue; the timestamp is sent as
    # a string rather than a datetime object.
    cloudAMQP_client.send_message({
        'userId': user_id,
        'newsId': news_id,
        'timestamp': str(timestamp),
    })
Esempio n. 5
0
def handle_message(msg):
    """Store a scraped news task unless it duplicates same-day news.

    The task text is compared (TF-IDF similarity) against every stored news
    document published on the same calendar day.  If any similarity exceeds
    SAME_NEWS_SIMILARITY_THRESHOLD the task is dropped; otherwise it is
    upserted into NEWS_TABLE_NAME keyed by its 'digest', with 'publishedAt'
    converted to a datetime.

    Args:
        msg: The news task; expected to be a dict with 'text',
            'publishedAt' (a date string) and 'digest'.

    Returns:
        None.
    """
    if not isinstance(msg, dict):
        return
    task = msg
    # A message without any text cannot be compared; treat a missing key
    # the same way as an explicit None instead of raising KeyError.
    text = task.get('text')
    if text is None:
        return

    # Parse once and reuse the value for both the day window and storage.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end,
        }
    }))

    if recent_news_list:
        # Row/column 0 is the incoming text; the rest are stored news.
        documents = [text]
        documents.extend(news['text'] for news in recent_news_list)

        # TfidfVectorizer L2-normalises each row by default, so this product
        # holds the pairwise cosine similarities.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print("Duplicated news. Ignore.")
                return

    task['publishedAt'] = published_at

    # Replace the document with the same digest, or insert it if absent.
    db[NEWS_TABLE_NAME].replace_one(
        {'digest': task['digest']}, task, upsert=True)
Esempio n. 6
0
def test_logNewsClickForUser_basic():
    """Check that logNewsClickForUser writes a click log and queues a message.

    Uses the user id 'test'; its click logs are removed before the call and
    again afterwards, even when an assertion fails.
    """
    db = mongodb_client.get_db()
    click_logs = db[operations.CLICK_LOGS_TABLE_NAME]

    # Start from a clean state so the record found below was written by
    # this test.
    click_logs.delete_many({"userId": "test"})

    try:
        operations.logNewsClickForUser('test', 'test_news')

        # Restrict the lookup to the test user so a click logged by someone
        # else cannot be picked up; find_one returns None when nothing
        # matches, which makes the first assertion meaningful.
        record = click_logs.find_one(
            {"userId": "test"}, sort=[('timestamp', -1)])

        assert record is not None
        assert record['userId'] == 'test'
        assert record['newsId'] == 'test_news'
        assert record['timestamp'] is not None
    finally:
        click_logs.delete_many({"userId": "test"})

    # Verify the message has been sent to queue
    msg = cloudAMQP_client.get_message()
    assert msg is not None
    assert msg['userId'] == 'test'
    assert msg['newsId'] == 'test_news'
    assert msg['timestamp'] is not None
Esempio n. 7
0
# Queue from which news tasks awaiting de-duplication are read.
DEDUPE_NEWS_TASK_QUEUE_URL = cc.CLOUDAMQP_URL
DEDUPE_NEWS_TASK_QUEUE_NAME = "news-manager-dedupe-task"
SLEEP_TIME_IN_SECONDS = 1

# Similarity above which two news texts are treated as the same story.
SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

logger_format = '%(asctime)s - %(message)s'
logging.basicConfig(format=logger_format)
logger = logging.getLogger('news_deduper')
logger.setLevel(logging.DEBUG)

NEWS_TABLE_NAME = "news_col"

db = mongodb_client.get_db("demo_news")

# Create the queue client once; the original built it twice and discarded
# the first instance.
cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)

# NOTE(review): this fetches one message at import time — confirm that is
# intended rather than fetching inside a processing loop.
news = cloudAMQP_client.getMessage()


def handle_message(news):
    text = news['text']
    description = news['description']
    if description is None:
        description = news['title']
    news['publishedAt'] = parser.parse(
        news['publishedAt']