def test_logNewsClickForUser_basic():
    """End-to-end check that logNewsClickForUser writes a click-log record
    to MongoDB and publishes the click message to Kafka."""
    db = AWS_mongodb_client.get_db()
    # Start from a clean slate so the assertions see only our record.
    db[MONGODB_CLICK_LOGS_TABLE_NAME].delete_many({"userId": "test"})

    operations.logNewsClickForUser('test', 'test_news')

    # Verify a click log was written into MongoDB: fetch the newest record.
    record = list(db[MONGODB_CLICK_LOGS_TABLE_NAME].find().sort([
        ('timestamp', -1)
    ]).limit(1))[0]

    assert record is not None
    assert record['userId'] == 'test'
    assert record['newsId'] == 'test_news'
    assert record['timestamp'] is not None

    # Clean up the test record.
    db[MONGODB_CLICK_LOGS_TABLE_NAME].delete_many({"userId": "test"})

    # Verify the message can be received by the Kafka consumer.
    # BUG FIX: the original loop never exited, so the test blocked forever
    # waiting for more messages; stop after verifying the first one.
    for msg in Log_kafka_consumer:
        dumpmsg = json.loads(msg)
        assert dumpmsg['userId'] == 'test'
        assert dumpmsg['newsId'] == 'test_news'
        break

    print('test_logNewsClicksForUser_basic passed!')
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for a user.

    Pages are 1-based and MONGODB_NEWS_LIST_BATCH_SIZE items long. The
    user's ranked digest list is cached in Redis; on a cache miss the most
    recent MONGODB_NEWS_LIMIT news are fetched from MongoDB and the digest
    list is cached with a timeout. Each returned news dict has its 'text'
    field stripped, a 'reason' of 'Recommend' when it matches the user's
    top preferred class, and a 'time' of 'today' when published today.
    """
    page_num = int(page_num)
    begin_index = (page_num - 1) * MONGODB_NEWS_LIST_BATCH_SIZE
    end_index = page_num * MONGODB_NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []
    db = AWS_mongodb_client.get_db()

    # Fetch the cached value once instead of twice: avoids a second round
    # trip and a race between the existence check and the read.
    cached_digests = AWS_redis_client.get(user_id)
    if cached_digests is not None:
        news_digests = pickle.loads(cached_digests)

        # If begin_index is out of range this yields an empty list; if only
        # end_index is out of range it yields all remaining news ids.
        sliced_news_digests = news_digests[begin_index:end_index]
        print(sliced_news_digests)
        sliced_news = list(db[MONGODB_NEWS_TABLE_NAME].find(
            {'digest': {'$in': sliced_news_digests}}))
    else:
        # Cache miss: load the most recent news, newest first.
        total_news = list(db[MONGODB_NEWS_TABLE_NAME]
                          .find()
                          .sort([('publishedAt', -1)])
                          .limit(MONGODB_NEWS_LIMIT))
        # BUG FIX: map() returns a lazy iterator in Python 3, which cannot
        # be pickled (and could not be sliced after unpickling); build a
        # concrete list instead.
        total_news_digests = [news['digest'] for news in total_news]

        AWS_redis_client.set(user_id, pickle.dumps(total_news_digests))
        AWS_redis_client.expire(user_id, REDIS_USER_NEWS_TIME_OUT_IN_SECONDS)

        sliced_news = total_news[begin_index:end_index]

    # Get preference for the user; only the top class is used.
    preference = news_recommendation_service_client.getPreferenceForUser(user_id)
    topPreference = None
    if preference is not None and len(preference) > 0:
        topPreference = preference[0]

    for news in sliced_news:
        # Remove the text field to save bandwidth; tolerate records that
        # never had one instead of raising KeyError.
        news.pop('text', None)
        if news.get('class') == topPreference:
            news['reason'] = 'Recommend'
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'

    return json.loads(dumps(sliced_news))
# Ejemplo n.º 3 — example separator from the original scrape, commented out so the file parses
def test_basic():
    """Smoke test: drop/insert/count round trip against the 'test' database."""
    db = client.get_db('test')
    db.demo.drop()
    # BUG FIX: Collection.count() and insert() were removed in PyMongo 4;
    # use the supported count_documents / insert_one APIs instead.
    assert db.demo.count_documents({}) == 0
    db.demo.insert_one({"test": 123})
    assert db.demo.count_documents({}) == 1
    db.demo.drop()
    assert db.demo.count_documents({}) == 0
    print('test_basic passed!')
def logNewsClickForUser(user_id, news_id):
    """Persist a news-click event to MongoDB and forward it to the
    click-log Kafka topic for the ML preference service."""
    message = {'userId': user_id, 'newsId': news_id, 'timestamp': datetime.utcnow()}

    db = AWS_mongodb_client.get_db()
    # BUG FIX: Collection.insert() was removed in PyMongo 4; use insert_one.
    db[MONGODB_CLICK_LOGS_TABLE_NAME].insert_one(message)

    # Send log task to machine learning service for prediction.
    # insert_one mutates `message` by adding '_id', so build a fresh dict
    # that stays JSON-serializable.
    message = {'userId': user_id, 'newsId': news_id}
    # BUG FIX: Kafka's timestamp_ms is an integer count of milliseconds;
    # time.time() returns float seconds.
    AWS_Log_kafka_producer.send(topic=AWS_KAFKA_LOG_CLICKS_TASK_QUEUE,
                                value=json.dumps(message),
                                timestamp_ms=int(time.time() * 1000))
# Ejemplo n.º 5 — example separator from the original scrape, commented out so the file parses
def handle_message(msg):
    """Update a user's per-class preference model from one click event.

    `msg` must be a dict containing 'userId' and 'newsId'; anything else
    is silently ignored. The clicked news class is boosted and every other
    class decayed (time-decay update), then the model is upserted.
    """
    if msg is None or not isinstance(msg, dict):
        return
    if 'userId' not in msg or 'newsId' not in msg:
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Fetch (or lazily create) the user's preference model.
    db = AWS_mongodb_client.get_db()
    model = db[MONGODB_PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})
    print(model)
    if model is None:
        print('Creating preference model for new user: %s' % userId)
        new_model = {'userId': userId}
        preference = {}
        for i in news_classes.classes:
            # Start every class from the same uniform prior INITIAL_P.
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    print('Updating preference model for new user: %s' % userId)

    # Look up the clicked news; skip events we cannot attribute to a class.
    news = db[MONGODB_NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None
        or 'class' not in news
        or news['class'] not in news_classes.classes):
        print(news is None)
        print('Skipping processing...')
        return

    click_class = news['class']  # class of the clicked news

    # Boost the clicked class: p' = (1 - ALPHA) * p + ALPHA.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Decay all other classes: p' = (1 - ALPHA) * p.
    # BUG FIX: dict.iteritems() is Python 2 only; iterate keys directly
    # (the values were unused anyway).
    for i in model['preference']:
        if i != click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    db[MONGODB_PREFERENCE_MODEL_TABLE_NAME].replace_one(
        {'userId': userId}, model, upsert=True)
# Ejemplo n.º 6 — example separator from the original scrape, commented out so the file parses
    def getPreferenceForUser(self, user_id):
        """Return the user's news classes ordered by preference, best first.

        Returns an empty list when no model exists for the user, or when
        the preference distribution is flat (the top and bottom scores are
        effectively equal, so the ranking carries no signal).
        """
        db = AWS_mongodb_client.get_db()
        model = db[MONGODB_PREFERENCE_MODEL_TABLE_NAME].find_one(
            {'userId': user_id})
        if model is None:
            return []

        # Rank (class, score) pairs by score, descending.
        ranked = sorted(model['preference'].items(),
                        key=lambda pair: pair[1],
                        reverse=True)
        classes = [name for name, _ in ranked]
        scores = [score for _, score in ranked]

        # A flat distribution tells us nothing about the user's taste.
        if isclose(float(scores[0]), float(scores[-1])):
            return []

        return classes
# Ejemplo n.º 7 — example separator from the original scrape, commented out so the file parses
def handle_message(msg):
    """Deduplicate, classify and store one scraped news item.

    Rejects items whose text is a near-duplicate (TF-IDF cosine similarity
    above SAME_NEWS_SIMILARITY_THRESHOLD) of any same-day news already in
    MongoDB; otherwise classifies the title via the topic-modeling service
    and upserts the item keyed by its digest.
    """
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    # BUG FIX: str(text.encode('utf-8')) in Python 3 produces the literal
    # "b'...'" representation, corrupting the TF-IDF input. Use the text
    # as-is, and bail out early when it is missing (the original None
    # check ran after str() and could never trigger).
    text = task.get('text')
    if text is None:
        return

    # Compute the [day-begin, day-end) window of the article's publish date.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = AWS_mongodb_client.get_db()
    same_day_news_list = list(db[MONGODB_NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    # If same-day news exists, reject near-duplicates by pairwise similarity.
    if same_day_news_list:
        documents = [news['text'] for news in same_day_news_list]
        # Row 0 of the matrix is the incoming article.
        documents.insert(0, text)

        # TfidfVectorizer L2-normalizes rows, so tfidf * tfidf.T is the
        # pairwise cosine-similarity matrix.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print(pairwise_sim.A)  # display the result as a dense array

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print("Duplicated news. Ignore.")
                return

    # Store publishedAt as a real datetime (reuse the value parsed above
    # instead of parsing the string a second time).
    task['publishedAt'] = published_at

    # Classify news by title.
    title = task.get('title')
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic  # set class for news to 'class' field

    db[MONGODB_NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                            task,
                                            upsert=True)
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

from common import AWS_mongodb_client
from common import news_topic_modeling_service_client

if __name__ == '__main__':
    # Backfill script: walk every stored news document and, for any that
    # lacks a 'class' field, ask the topic-modeling service to classify
    # its title, then upsert the updated document by digest.
    db = AWS_mongodb_client.get_db()
    count = 0
    for news in db['news'].find({}):
        count += 1
        print(count)  # progress indicator, one line per document
        if 'class' in news:
            continue
        print('Populating classes...')
        news['class'] = news_topic_modeling_service_client.classify(
            news['title'])
        db['news'].replace_one({'digest': news['digest']},
                               news,
                               upsert=True)