def test_basic():
    """Round-trip a single message through the test queue.

    Sends one dict to the queue named by TEST_QUEUE_NAME on HOST, waits via
    the client's own ``sleep``, reads one message back and asserts it equals
    what was sent. Prints a confirmation line on success.
    """
    payload = {'test': 'demo'}

    queue_client = RabbitMQClient(HOST, TEST_QUEUE_NAME)
    queue_client.sendMessage(payload)

    # Pause through the client (argument presumably in seconds -- confirm
    # against RabbitMQClient.sleep) before fetching the message back.
    queue_client.sleep(10)

    echoed = queue_client.getMessage()
    assert payload == echoed

    print('test_basic passed!')
for row in range(1, rows): if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD: # Dupilicated news print("Dupilicated news, ignore") return task['publishedAt'] = parser.parse(task['publishedAt']) # Classify news title = task['title'] if title is not None: topic = news_topic_modeling_service_client.classify(title) task['class'] = topic db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True) while True: if dedupe_news_queue_client is not None: msg = dedupe_news_queue_client.getMessage() if msg is not None: # Parse try: handle_message(msg) except Exception as e: print(e) pass dedupe_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
print("message is broken") return task = msg article = Article(task['url']) article.download() article.parse() task['text'] = article.text # #Only support cnn for now # if(task['source']['name'] == 'CNN'): # print("Scraping CNN news") # text = cnn_news_scraper.extract_news(task['url']) # else: # print("not supported") # task['text'] = text dedupe_news_queue_client.sendMessage(task) while True: # Fetch message from queue if scrape_news_queue_client is not None: msg = scrape_news_queue_client.getMessage() if msg is not None: # Handle message: scrape news from websites try: handle_message(msg) except Exception as e: print(e) pass scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
# Sources handed to the news API client on every polling cycle.
# NOTE(review): this is a one-element list whose single string contains a
# comma-separated list of source ids -- confirm that this is the shape
# news_api_client.getNewsFromSource expects.
NEWS_SOURCES = [
    'cnn, bbc-news, bloomberg, espn, cnbc, business-insider, abc-news, buzzfeed, bbc-sport, fox-news, the-verge, techradar, talksport, nfl-news, nhl-news, reddit-r-all']

# Lifetime of a de-duplication key in Redis (24 hours).
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24
# Pause between two polling cycles.
SLEEP_TIME_IN_SECONDS = 60

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
rabbitMQ_client = RabbitMQClient(SCRAPE_NEWS_TASK_QUEUE_HOST, SCRAPE_NEWS_TASK_QUEUE_NAME)

# Monitor loop: fetch the latest articles, forward the ones not seen before
# to the scrape-task queue, then sleep until the next cycle.
while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    nums_of_new_news = 0

    for news in news_list:
        title = news.get('title')
        if title is None:
            # The digest is derived from the title; without one the article
            # cannot be de-duplicated, so skip it instead of crashing the
            # whole monitor on ``None.encode``.
            continue

        # The MD5 of the title is used as the article's identity.
        news_digest = hashlib.md5(title.encode('utf-8')).hexdigest()

        if redis_client.get(news_digest) is None:
            nums_of_new_news += 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # Fall back to the current UTC time, formatted as
                # YYYY-MM-DDTHH:MM:SSZ.
                news['publishedAt'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

            redis_client.set(news_digest, json.dumps(news))
            # Expire the key that was just written. The previous code passed
            # str(news), which is not a stored key, so digests never expired.
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)

            rabbitMQ_client.sendMessage(news)

    print("Fetched {} new news".format(nums_of_new_news))

    rabbitMQ_client.sleep(SLEEP_TIME_IN_SECONDS)