def test_basic():
    """Round-trip one message through the test queue and verify it is unchanged.

    Sends a small dict, reads one message back, and asserts equality.
    Raises AssertionError if the received message differs from the sent one.
    """
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    sent_msg = {'test': 'test'}
    client.send_message(sent_msg)
    received_msg = client.get_message()
    assert sent_msg == received_msg
    # Parenthesized print: valid (and identical for one argument) on both
    # Python 2 and Python 3 — other chunks of this file use print(...) already,
    # while the bare `print '...'` statement is a syntax error under Python 3.
    print('test_basic passed.')
def test_basic():
    """Send a test message and assert the queue returns it unchanged.

    Raises AssertionError if the received message differs from the sent one.
    """
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    send = {"test": "test"}
    client.send_message(send)
    rev = client.get_message()
    assert send == rev
    # print(...) instead of the Python-2-only `print "..."` statement:
    # same output on Python 2, and no syntax error on Python 3.
    print("test_basic passed.")
def clear_queue(queue_url, queue_name):
    """Drain all pending messages from the given AMQP queue.

    Keeps reading until get_message() returns None (queue empty), then
    prints how many messages were removed.
    """
    scrape_news_queue_client = CloudAMQPClient(queue_url, queue_name)
    # Guard clause instead of re-checking inside the loop: the original
    # tested `is not None` on every iteration and would busy-spin forever
    # if the client were ever None.
    if scrape_news_queue_client is None:
        return
    num_of_messages = 0
    while True:
        msg = scrape_news_queue_client.get_message()
        if msg is None:
            # print(...) works identically on Python 2 and 3 for one argument.
            print("Cleared %d messages." % num_of_messages)
            return
        num_of_messages += 1
# --- Operations/backend configuration, read from a pre-loaded `config` mapping ---
REDIS_HOST = config['operations']['REDIS_HOST']
REDIS_PORT = config['operations']['REDIS_PORT']
NEWS_TABLE_NAME = config['operations']['NEWS_TABLE_NAME']
CLICK_LOGS_TABLE_NAME = config['operations']['CLICK_LOGS_TABLE_NAME']
NEWS_LIMIT = config['operations']['NEWS_LIMIT']
NEWS_LIST_BATCH_SIZE = config['operations']['NEWS_LIST_BATCH_SIZE']
USER_NEWS_TIME_OUT_IN_SECONDS = config['operations'][
    'USER_NEWS_TIME_OUT_IN_SECONDS']

# Queue endpoint is hard-coded here (credentials masked in this copy);
# other chunks of this project load the same values from a config file.
LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/hwobvzoo"
LOG_CLICKS_TASK_QUEUE_NAME = "tap-news-log-clicks-task-queue"

# Module-level clients created at import time (side effect: opens connections).
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT, db=0)
cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                   LOG_CLICKS_TASK_QUEUE_NAME)


def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for user_id.

    NOTE(review): the body is truncated in this chunk — only the cache-read
    path is visible; the return statement and cache-miss path are elsewhere.
    """
    # Pages are 1-based; each page holds NEWS_LIST_BATCH_SIZE digests.
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE
    # The final list of news to be returned.
    sliced_news = []
    print 'getNewsSummariesForUser'
    # Cache hit: Redis holds a pickled value per user_id.
    # NOTE(review): pickle.loads on Redis data — acceptable only if Redis is
    # fully trusted; verify no untrusted writer can reach this key space.
    if redis_client.get(user_id) is not None:
        news_digests = pickle.loads(redis_client.get(user_id))
        # If begin_index is out of range, this will return empty list;
        # NOTE(review): chunk ends here — rest of the function not visible.
# --- News-monitor configuration (keys read from a pre-loaded `config`) ---
SCRAPE_NEWS_TASK_QUEUE_NAME = config['news_monitor'][
    'SCRAPE_NEWS_TASK_QUEUE_NAME']
SLEEP_TIME_IN_SECONDS = 10 * 6            # = 60 seconds
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3  # = 3 days, in seconds

# News API source identifiers to poll.
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

# Make the parent package importable so the shared logger can be loaded.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', ''))
from logger.log import LOGGING_NEWS_MONITOR

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

# Main monitor loop: fetch headlines and count titles not yet seen in Redis.
# NOTE(review): the loop body is truncated in this chunk.
while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_news_news = 0  # NOTE(review): likely a typo for num_of_new_news (cf. the run() variant)
    for news in news_list:
        # MD5 of the title identifies a story across polls.
        # NOTE(review): str.encode('base64') is Python-2-only; the run()
        # variant elsewhere in this file uses hexdigest() instead.
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_news_news = num_of_news_news + 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                # NOTE(review): chunk ends here mid-loop; body of this `if`
                # is not visible.
from cloud_amqp_client import CloudAMQPClient

# Load queue endpoints from the shared JSON config.
with open('../config.json') as config_data:
    cfg = json.load(config_data)

DEDUPE_NEWS_TASK_QUEUE_URL = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE']['url']
DEDUPE_NEWS_TASK_QUEUE_NAME = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE'][
    'queue_name']
SCRAPE_NEWS_TASK_QUEUE_URL = cfg['amqp']['SCRAPE_NEWS_TASK_QUEUE']['url']
SCRAPE_NEWS_TASK_QUEUE_NAME = cfg['amqp']['SCRAPE_NEWS_TASK_QUEUE'][
    'queue_name']

SLEEP_TIME_IN_SECONDS = 5

# Module-level queue clients created at import time.
DEDUPE_NEWS_QUEUE_CLIENT = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
SCRAPE_NEWS_QUEUE_CLIENT = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Handle received message"""
    # A scrape task is expected to be a dict with at least a 'url' key;
    # anything else is dropped with a diagnostic.
    if msg is None or not isinstance(msg, dict):
        print('Message is broken.')
        return
    task = msg
    text = None  # presumably filled with the article body later — TODO confirm
    # Download and parse the target page (Article is newspaper-style API).
    article = Article(task['url'])
    article.download()
    article.parse()
    # NOTE(review): chunk ends here — rest of handle_message not visible.
# Make the scrapers subpackage importable.
sys.path.append(os.path.join(os.path.dirname(__file__), 'scrapers'))
import cnn_news_scraper
from cloud_amqp_client import CloudAMQPClient

# TODO: use your own queue.
# Queue endpoints hard-coded here (credentials masked in this copy).
SCRAPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@donkey.rmq.cloudamqp.com/xpiykasc'
SCRAPE_NEWS_TASK_QUEUE_NAME = 'tap-news-scrape-news-task-queue'
DEDUPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@donkey.rmq.cloudamqp.com/xqwzopki'
DEDUPE_NEWS_TASK_QUEUE_NAME = 'tap-news-dedupe-news-task-queue'

SLEEP_TIME_IN_SECONDS = 5

# Module-level queue clients created at import time.
dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Download and parse the article referenced by a scrape task.

    NOTE(review): body truncated in this chunk after article.parse().
    """
    # Drop malformed payloads; a task must be a dict carrying 'url'.
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return
    task = msg
    text = None  # presumably filled with the article body later — TODO confirm
    article = Article(task['url'])
    article.download()
    article.parse()
    # NOTE(review): chunk ends here — rest of handle_message not visible.
# --- News-deduper configuration loaded from the pipeline YAML config ---
config = config_client.get_config('../config/config_news_pipeline.yaml')

DEDUPE_NEWS_TASK_QUEUE_URL = config['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_URL']
DEDUPE_NEWS_TASK_QUEUE_NAME = config['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_NAME']
NEWS_TABLE_NAME = config['news_deduper']['NEWS_TABLE_NAME']
SLEEP_TIME_IN_SECONDS = config['news_deduper']['SLEEP_TIME_IN_SECONDS']
SAME_NEWS_SIMILARITY_THRESHOLD = config['news_deduper'][
    'SAME_NEWS_SIMILARITY_THRESHOLD']

# log
# Make the parent package importable so the shared logger can be loaded.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', ''))
from logger.log import LOGGING_NEWS_DEDUPER

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Deduplicate one scraped-news task.

    NOTE(review): body truncated in this chunk — only validation and the
    same-day time-window computation are visible.
    """
    # Silently drop malformed tasks or tasks without article text.
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return
    # Midnight-to-midnight window around the publication date — presumably
    # used below to query candidate duplicates from the same day; verify.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    # NOTE(review): chunk ends here — rest of handle_message not visible.
    'queue_name']  # NOTE(review): continuation of an assignment that starts before this chunk
SLEEP_TIME_IN_SECONDS = 10
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3  # = 3 days, in seconds

REDIS_HOST = cfg['redis']['host']
REDIS_PORT = cfg['redis']['port']

# News API source identifiers to poll.
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

# Module-level clients created at import time.
REDIS_CLIENT = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
CLOUD_AMQP_CLIENT = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                    SCRAPE_NEWS_TASK_QUEUE_NAME)


def run():
    """Start news monitor"""
    # Poll the news API and count titles whose digest is not yet in Redis.
    # NOTE(review): loop body truncated in this chunk.
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
        num_of_new_news = 0
        for news in news_list:
            # hexdigest() here, unlike the Python-2-only base64 encoding
            # used by the other monitor variant in this file.
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()
            if REDIS_CLIENT.get(news_digest) is None:
                num_of_new_news = num_of_new_news + 1
                news['digest'] = news_digest
                # NOTE(review): chunk ends here mid-loop.
# Load queue endpoints from the shared JSON config.
with open('../config.json') as config_data:
    cfg = json.load(config_data)

LOG_CLICKS_TASK_QUEUE_URL = cfg['amqp']['LOG_CLICKS_TASK_QUEUE']['url']
LOG_CLICKS_TASK_QUEUE_NAME = cfg['amqp']['LOG_CLICKS_TASK_QUEUE']['queue_name']

# Preference-model parameters: NUM_OF_CLASSES topic classes with a uniform
# initial probability; ALPHA's exact role is not visible in this chunk —
# presumably a learning/decay rate, TODO confirm.
NUM_OF_CLASSES = 8
INITIAL_P = 1.0 / NUM_OF_CLASSES
ALPHA = 0.2

PREFERENCE_MODEL_TABLE_NAME = "user_preference_model"
NEWS_TABLE_NAME = "news"
SLEEP_TIME_IN_SECONDS = 1

CLOUD_AMQP_CLIENT = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                    LOG_CLICKS_TASK_QUEUE_NAME)


def handle_message(message):
    """Process message"""
    # Drop anything that is not a dict.
    if message is None or not isinstance(message, dict):
        print('message is broken')
        return
    # A click event must carry the user, the clicked news id and a timestamp.
    if 'userId' not in message or 'newsId' not in message or 'timestamp' not in message:
        return
    userId = message['userId']
    newsId = message['newsId']
    database = mongodb_client.get_db()
    # NOTE(review): chunk ends here — rest of handle_message not visible.
from cloud_amqp_client import CloudAMQPClient

# Load queue endpoints from the shared JSON config.
with open('../config.json') as config_data:
    cfg = json.load(config_data)

DEDUPE_NEWS_TASK_QUEUE_URL = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE']['url']
DEDUPE_NEWS_TASK_QUEUE_NAME = cfg['amqp']['DEDUPE_NEWS_TASK_QUEUE'][
    'queue_name']

SLEEP_TIME_IN_SECONDS = 3
NEWS_TABLE_NAME = 'news'
SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

CLOUD_AMQP_CLIENT = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                    DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Handle message"""
    # Silently drop malformed tasks or tasks without article text.
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return
    # Start of a same-day time window around the publication date —
    # presumably used below for duplicate lookup; TODO confirm.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    # NOTE(review): chunk ends here — rest of handle_message not visible.