def test_basic():
    """Round-trip one message through the test queue.

    Sends a small dict to the queue named ``TEST_QUEUE_NAME`` at
    ``CLOUDAMQP_URL`` through a ``CloudAMQPClient`` and checks that the next
    message fetched by the same client equals what was sent.

    Raises:
        AssertionError: if the fetched message differs from the sent one.
    """
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)

    sent_msg = {'test': 'test'}
    client.send_message(sent_msg)
    received_msg = client.get_message()

    # Include both values so a failure shows what actually came back.
    assert sent_msg == received_msg, (
        'expected %r, got %r' % (sent_msg, received_msg))
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('test_basic passed.')
def test_basic():
    """Check that a message sent to the test queue is received unchanged.

    Builds a ``CloudAMQPClient`` for ``TEST_QUEUE_NAME`` at ``CLOUDAMQP_URL``,
    sends one dict, fetches one message, and compares the two.

    Raises:
        AssertionError: if the fetched message differs from the sent one.
    """

    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    send = {"test": "test"}
    client.send_message(send)
    rev = client.get_message()

    # Report both payloads on mismatch instead of a bare AssertionError.
    assert send == rev, "expected %r, got %r" % (send, rev)
    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("test_basic passed.")
Example #3
0
def clear_queue(queue_url, queue_name):
    """Drain a queue by fetching messages until none are left.

    Args:
        queue_url: URL passed to ``CloudAMQPClient`` to reach the broker.
        queue_name: Name of the queue to empty.

    Returns:
        None. The number of messages fetched is printed once the queue
        reports no more messages (``get_message()`` returns ``None``).
    """
    queue_client = CloudAMQPClient(queue_url, queue_name)

    # Check the client once, up front. The previous version tested it inside
    # ``while True`` and would have spun forever without ever fetching or
    # returning if the client were None.
    if queue_client is None:
        return

    num_of_messages = 0
    while queue_client.get_message() is not None:
        num_of_messages += 1

    # Parenthesised form works under both Python 2 and Python 3.
    print("Cleared %d messages." % num_of_messages)
Example #4
0
# Settings read from the 'operations' section of the already-loaded config.
REDIS_HOST = config['operations']['REDIS_HOST']
REDIS_PORT = config['operations']['REDIS_PORT']

NEWS_TABLE_NAME = config['operations']['NEWS_TABLE_NAME']
CLICK_LOGS_TABLE_NAME = config['operations']['CLICK_LOGS_TABLE_NAME']

NEWS_LIMIT = config['operations']['NEWS_LIMIT']
# Page size used by getNewsSummariesForUser to compute slice indices.
NEWS_LIST_BATCH_SIZE = config['operations']['NEWS_LIST_BATCH_SIZE']
USER_NEWS_TIME_OUT_IN_SECONDS = config['operations'][
    'USER_NEWS_TIME_OUT_IN_SECONDS']

# NOTE(review): unlike the values above, the click-log queue URL and name are
# hard-coded here, and the URL embeds broker credentials (masked in this
# copy). Consider reading them from config as well.
LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/hwobvzoo"
LOG_CLICKS_TASK_QUEUE_NAME = "tap-news-log-clicks-task-queue"

# Module-level clients, created at import time.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT, db=0)
cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                   LOG_CLICKS_TASK_QUEUE_NAME)


def getNewsSummariesForUser(user_id, page_num):
    """Compute the page window for a user and look up their cached digests.

    Args:
        user_id: Key under which the user's pickled news digests are stored
            in Redis.
        page_num: Page number; converted with ``int()``. Page 1 maps to
            index 0 because ``begin_index`` is ``(page_num - 1) * batch``.

    NOTE(review): the body of this function is cut off below, so the return
    value and the cache-miss path are not shown here.
    """
    page_num = int(page_num)
    # Half-open window [begin_index, end_index) of NEWS_LIST_BATCH_SIZE items.
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []
    print 'getNewsSummariesForUser'

    # NOTE(review): the key is fetched from Redis twice (once to test, once to
    # load), and the value is unpickled -- only safe if Redis contents are
    # trusted.
    if redis_client.get(user_id) is not None:
        news_digests = pickle.loads(redis_client.get(user_id))

        # If begin_index is out of range, this will return empty list;
Example #5
0
# Queue name comes from the 'news_monitor' section of the loaded config.
SCRAPE_NEWS_TASK_QUEUE_NAME = config['news_monitor'][
    'SCRAPE_NEWS_TASK_QUEUE_NAME']
SLEEP_TIME_IN_SECONDS = 10 * 6  # 60 seconds
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3  # 3 days

# Source identifiers passed to news_api_client.getNewsFromSource.
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

# Add this file's parent directory to the import path; presumably that is
# where the `logger` package imported on the next line lives -- confirm.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', ''))
from logger.log import LOGGING_NEWS_MONITOR

# Module-level clients, created at import time.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    num_of_news_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_news_news = num_of_news_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
from cloud_amqp_client import CloudAMQPClient

# Load settings from config.json one directory above the current working
# directory (the path is relative, so it depends on where the process starts).
with open('../config.json') as config_data:
    cfg = json.load(config_data)

# Connection details for the dedupe queue, from the 'amqp' config section.
DEDUPE_NEWS_TASK_QUEUE_URL = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE']['url']
DEDUPE_NEWS_TASK_QUEUE_NAME = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE'][
    'queue_name']

# Connection details for the scrape queue, from the 'amqp' config section.
SCRAPE_NEWS_TASK_QUEUE_URL = cfg['amqp']['SCRAPE_NEWS_TASK_QUEUE']['url']
SCRAPE_NEWS_TASK_QUEUE_NAME = cfg['amqp']['SCRAPE_NEWS_TASK_QUEUE'][
    'queue_name']

SLEEP_TIME_IN_SECONDS = 5

# One client per queue, created at import time.
DEDUPE_NEWS_QUEUE_CLIENT = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
SCRAPE_NEWS_QUEUE_CLIENT = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Validate a scrape task and download and parse the article it names.

    Args:
        msg: Expected to be a dict with a 'url' key. ``None`` or any non-dict
            value is reported as broken and ignored.

    NOTE(review): the function is cut off after ``article.parse()``; what is
    done with the parsed article (and with ``text``) is not shown here.
    """
    if msg is None or not isinstance(msg, dict):
        print('Message is broken.')
        return

    task = msg
    text = None
    # Raises KeyError if the task has no 'url'.
    # Presumably newspaper's Article class -- confirm against the imports.
    article = Article(task['url'])
    article.download()
    article.parse()
sys.path.append(os.path.join(os.path.dirname(__file__), 'scrapers'))

import cnn_news_scraper
from cloud_amqp_client import CloudAMQPClient

# TODO: use your own queue.
# NOTE(review): both broker URLs below are hard-coded and embed credentials
# (masked in this copy); consider loading them from configuration instead.

SCRAPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@donkey.rmq.cloudamqp.com/xpiykasc'
SCRAPE_NEWS_TASK_QUEUE_NAME = 'tap-news-scrape-news-task-queue'

DEDUPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@donkey.rmq.cloudamqp.com/xqwzopki'
DEDUPE_NEWS_TASK_QUEUE_NAME = 'tap-news-dedupe-news-task-queue'

SLEEP_TIME_IN_SECONDS = 5

# One client per queue, created at import time.
dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Validate a scrape task and download and parse the article it names.

    Args:
        msg: Expected to be a dict with a 'url' key. ``None`` or any non-dict
            value is reported as broken and ignored.

    NOTE(review): the function is cut off after ``article.parse()``; how the
    parsed article and ``text`` are used is not shown here.
    """
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return

    task = msg
    text = None

    # Raises KeyError if the task has no 'url'.
    # Presumably newspaper's Article class -- confirm against the imports.
    article = Article(task['url'])
    article.download()
    article.parse()
Example #8
0
# Load the pipeline configuration; the path is relative to the current
# working directory.
config = config_client.get_config('../config/config_news_pipeline.yaml')
# All deduper settings come from the 'news_deduper' section.
DEDUPE_NEWS_TASK_QUEUE_URL = config['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_URL']
DEDUPE_NEWS_TASK_QUEUE_NAME = config['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_NAME']
NEWS_TABLE_NAME = config['news_deduper']['NEWS_TABLE_NAME']
SLEEP_TIME_IN_SECONDS = config['news_deduper']['SLEEP_TIME_IN_SECONDS']
SAME_NEWS_SIMILARITY_THRESHOLD = config['news_deduper'][
    'SAME_NEWS_SIMILARITY_THRESHOLD']

# log
# Add this file's parent directory to the import path; presumably that is
# where the `logger` package imported on the next line lives -- confirm.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', ''))
from logger.log import LOGGING_NEWS_DEDUPER

# Client for the dedupe task queue, created at import time.
cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Validate a dedupe task and compute the day window of its publish date.

    Args:
        msg: Expected to be a dict with 'text' and 'publishedAt' keys.
            ``None``, non-dict values, and tasks whose 'text' is ``None``
            are silently ignored.

    NOTE(review): the function is cut off after the day window is computed;
    how the window and ``text`` are used is not shown here.
    """
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    # Raises KeyError if the task has no 'text' key.
    text = task['text']
    if text is None:
        return

    # Presumably dateutil's parser -- confirm against the imports.
    published_at = parser.parse(task['publishedAt'])
    # Midnight at the start of the publish day. Built from year/month/day
    # only, so any tzinfo carried by published_at is not kept.
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    # Exclusive upper bound: midnight of the following day.
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    'queue_name']

SLEEP_TIME_IN_SECONDS = 10
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3  # 3 days

# Redis connection details from the 'redis' section of the loaded config.
REDIS_HOST = cfg['redis']['host']
REDIS_PORT = cfg['redis']['port']

# Source identifiers passed to news_api_client.getNewsFromSource in run().
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

# Module-level clients, created at import time.
REDIS_CLIENT = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
CLOUD_AMQP_CLIENT = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                    SCRAPE_NEWS_TASK_QUEUE_NAME)


def run():
    """Poll the news API in an endless loop and tag articles not yet seen.

    On each pass, fetches articles for ``NEWS_SOURCES``, hashes every title
    with MD5 (hex digest), and, when Redis holds no entry under that digest,
    counts the article as new and stores the digest on the article dict.

    NOTE(review): the function is cut off inside the ``if`` branch; no
    publishing, Redis write, or sleep between passes is shown here.
    """
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
        num_of_new_news = 0

        for news in news_list:
            # The digest of the UTF-8 encoded title is the de-duplication key.
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()

            # No Redis entry under this digest means the title is new.
            if REDIS_CLIENT.get(news_digest) is None:
                num_of_new_news = num_of_new_news + 1
                news['digest'] = news_digest
Example #10
0
# Load settings from config.json one directory above the current working
# directory (the path is relative, so it depends on where the process starts).
with open('../config.json') as config_data:
    cfg = json.load(config_data)

# Connection details for the click-log queue, from the 'amqp' config section.
LOG_CLICKS_TASK_QUEUE_URL = cfg['amqp']['LOG_CLICKS_TASK_QUEUE']['url']
LOG_CLICKS_TASK_QUEUE_NAME = cfg['amqp']['LOG_CLICKS_TASK_QUEUE']['queue_name']

NUM_OF_CLASSES = 8
# Uniform starting value: 1 / NUM_OF_CLASSES.
INITIAL_P = 1.0 / NUM_OF_CLASSES
# NOTE(review): presumably the update rate of the preference model -- the
# code that uses ALPHA is not shown here; confirm.
ALPHA = 0.2

PREFERENCE_MODEL_TABLE_NAME = "user_preference_model"
NEWS_TABLE_NAME = "news"

SLEEP_TIME_IN_SECONDS = 1

# Client for the click-log task queue, created at import time.
CLOUD_AMQP_CLIENT = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                    LOG_CLICKS_TASK_QUEUE_NAME)


def handle_message(message):
    """Validate a click-log message and open the database for processing.

    Args:
        message: Expected to be a dict containing 'userId', 'newsId' and
            'timestamp'. ``None`` or non-dict values are reported as broken;
            dicts missing any of the three keys are silently ignored.

    NOTE(review): the function is cut off after the database handle is
    obtained; how the click is applied is not shown here.
    """
    if message is None or not isinstance(message, dict):
        print('message is broken')
        return

    # All three keys are required, although only two are read below.
    if 'userId' not in message or 'newsId' not in message or 'timestamp' not in message:
        return

    userId = message['userId']
    newsId = message['newsId']

    database = mongodb_client.get_db()
Example #11
0
from cloud_amqp_client import CloudAMQPClient

# Load settings from config.json one directory above the current working
# directory (the path is relative, so it depends on where the process starts).
with open('../config.json') as config_data:
    cfg = json.load(config_data)

# Connection details for the dedupe queue, from the 'amqp' config section.
DEDUPE_NEWS_TASK_QUEUE_URL = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE']['url']
DEDUPE_NEWS_TASK_QUEUE_NAME = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE'][
    'queue_name']

SLEEP_TIME_IN_SECONDS = 3

NEWS_TABLE_NAME = 'news'

# NOTE(review): presumably the similarity score above which two articles are
# treated as the same news -- the comparison is not shown here; confirm.
SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

# Client for the dedupe task queue, created at import time.
CLOUD_AMQP_CLIENT = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                    DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Handle message"""
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)