def test_basic():
    """Smoke-test the news client: default fetch, then espn sorted by latest."""
    default_batch = client.getNewsFromSources()
    print(default_batch)
    assert len(default_batch) > 0
    filtered_batch = client.getNewsFromSources(sources=['espn'], sortBy='latest')
    assert len(filtered_batch) > 0
    print('test_basic passed!')
def test_basic():
    """Verify the client returns news with no arguments and for bbc-news."""
    results = client.getNewsFromSources()
    print(results)
    assert len(results) > 0
    results = client.getNewsFromSources(sources=['bbc-news'])
    assert len(results) > 0
    print("test_basic passed.")
def test_basic():
    """Check news comes back with defaults and with cnn/top filters."""
    fetched = client.getNewsFromSources()
    print(fetched)
    assert len(fetched) > 0, 'not getting news'
    fetched = client.getNewsFromSources(sources=['cnn'], sortBy='top')
    assert len(fetched) > 0
    print('test_basic passed!')
def test_basic():
    """Exercise the default fetch and a cnn/top filtered fetch."""
    batch = client.getNewsFromSources()
    print(batch)
    assert len(batch) > 0
    batch = client.getNewsFromSources(sources=['cnn'], sortBy='top')
    assert len(batch) > 0
    print('test passed')
def test_basic():
    """Fetch news with defaults, then for ign sorted by top; both must be non-empty."""
    articles = client.getNewsFromSources()
    print(articles)
    assert len(articles) > 0
    articles = client.getNewsFromSources(['ign'], sortBy='top')
    print(articles)
    assert len(articles) > 0
    print('test_basic passed!')
def test_basic():
    """Run the client once with defaults and once with explicit filters."""
    # The no-argument call should still return some news.
    items = client.getNewsFromSources()
    print(items)
    assert len(items) > 0
    # A filtered call (cnn, sorted by top) should also return news.
    items = client.getNewsFromSources(sources=['cnn'], sortBy='top')
    assert len(items) > 0
    print('test_basic passed!')
def test_basic():
    """Fetch news for bbc-news and bloomberg and assert both return results.

    Fixed: the original used Python 2 print statements (`print new`,
    `print "..."`), which are syntax errors under Python 3. For a single
    argument, `print(x)` produces identical output on both versions.
    """
    news = client.getNewsFromSources(sources=['bbc-news'])
    for new in news:
        # Echo only the articles attributed to the requested source.
        if new['source'] == 'bbc-news':
            print(new)
    # print(news)
    assert len(news) > 0
    news = client.getNewsFromSources(sources=['bloomberg'])
    # print(news)
    assert len(news) > 0
    print("test_basic passed.")
def run():
    """Poll NewsAPI forever, dedupe by title hash via Redis, enqueue fresh news."""
    while True:
        fetched = news_api_client.getNewsFromSources(NEWS_SOURCES)
        fresh_count = 0
        for article in fetched:
            # MD5 of the title, as a hex string, is the dedup key.
            digest = hashlib.md5(article['title'].encode('utf-8')).hexdigest()
            if redis_client.get(digest) is not None:
                continue  # already seen within the expiry window
            fresh_count += 1
            # Every article carries its unique digest downstream.
            article['digest'] = digest
            # Backfill a UTC timestamp when the feed omits one.
            if article['publishedAt'] is None:
                article['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
            # Redis is used as a set here: the stored value is irrelevant.
            redis_client.set(digest, "True")
            redis_client.expire(digest, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.send_message(article)
        print("Fetched %d news." % fresh_count)
        # Sleeping through the AMQP client keeps the queue heartbeat alive.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run():
    """Continuously pull news, drop duplicate titles via Redis, publish the rest."""
    while True:
        candidates = news_api_client.getNewsFromSources(NEWS_SOURCES)
        new_total = 0  # counts articles not seen before
        for item in candidates:
            # Duplicate detection: hash the title into a hex digest.
            key = hashlib.md5(item['title'].encode('utf-8')).hexdigest()
            if redis_client.get(key) is not None:
                continue  # duplicate within the timeout window
            new_total += 1
            item['digest'] = key
            if item['publishedAt'] is None:
                # Stamp missing publish times with current UTC time.
                item['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
            redis_client.set(key, 'True')
            redis_client.expire(key, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.sendMessage(item)
        logger.info('Fetched %d news.', new_total)
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run():
    """Fetch news in a loop; hash descriptions to filter out duplicates."""
    while True:
        batch = news_api_client.getNewsFromSources(NEWS_SOURCES)
        added = 0
        for entry in batch:
            # NewsAPI sometimes fails to retrieve a description (e.g. a
            # news site got reformatted) — nothing to hash, so skip.
            if entry['description'] is None:
                continue
            # The description digest is the primary key for dedup checks.
            key = hashlib.md5(entry['description'].encode('utf-8')).hexdigest()
            if redis_client.get(key) is not None:
                continue
            added += 1
            entry['digest'] = key
            if entry['publishedAt'] is None:
                entry['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
            redis_client.set(key, 'True')
            redis_client.expire(key, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.sendMessage(entry)
        print('--------------------')
        print('Fetched %d news sources' % added)
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run():
    """Fetch news from NEWS_SOURCES forever, dedupe by title hash, and enqueue.

    Fixed: `redis_client.set(news_digest, True)` passed a Python bool, which
    redis-py rejects (DataError: invalid input type). The string 'True' is
    stored instead — the value is never read, only the key's existence
    matters — matching the sibling run() implementations in this file.
    """
    while True:
        news_list = news_api_client.getNewsFromSources(NEWS_SOURCES)
        num_of_new_news = 0
        for news in news_list:
            news_digest = hashlib.md5(news['title'].encode('utf-8')).hexdigest()
            if redis_client.get(news_digest) is None:
                num_of_new_news += 1
                news['digest'] = news_digest
                # Later dedup/ranking needs a recent timestamp, and some
                # articles arrive without one — backfill with current UTC.
                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
                # Value is irrelevant; only the key's presence matters.
                redis_client.set(news_digest, 'True')
                redis_client.expire(news_digest, NEW_TIME_OUT_IN_SECOND)
                # Send the news onto the queue.
                cloudAMQP_client.sendMessage(news)
        logger.info("Fetched %d news", num_of_new_news)
        # The loop wakes up every SLEEP_TIME_IN_SECONDS.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run():
    """Periodic fetch loop: hash titles, skip known articles, enqueue the rest."""
    while True:
        pulled = news_api_client.getNewsFromSources(NEWS_SOURCES)
        fresh = 0
        for record in pulled:
            if record['title'] is None:
                # Cannot hash an untitled article — report it and move on.
                print("The news does not have title")
                print(record)
                continue
            fingerprint = hashlib.md5(record['title'].encode('utf-8')).hexdigest()
            if redis_client.get(fingerprint) is None:
                fresh += 1
                record['digest'] = fingerprint
                if record['publishedAt'] is None:
                    record['publishedAt'] = datetime.datetime.utcnow().strftime(
                        "%Y-%m-%dT%H:%M:%SZ")
                redis_client.set(fingerprint, 'True')
                redis_client.expire(fingerprint, NEWS_TIME_OUT_IN_SECONDS)
                cloudAMQP_client.sendMessage(record)
        print("Fetched %d news." % fresh)
        # cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
        # Issue a Get_News request every NEWS_REQUEST_TIME_IN_SECONDS.
        time.sleep(NEWS_REQUEST_TIME_IN_SECONDS)
def run():
    """Fetch, deduplicate (Redis keyed on title MD5), and publish news forever."""
    while True:
        latest = news_api_client.getNewsFromSources(NEWS_SOURCES)
        unseen = 0
        for story in latest:
            story_key = hashlib.md5(story['title'].encode('utf-8')).hexdigest()
            if redis_client.get(story_key) is not None:
                continue  # already processed within the expiry window
            unseen += 1
            story['digest'] = story_key
            # Guarantee a publishedAt timestamp for downstream consumers.
            if story['publishedAt'] is None:
                story['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
            redis_client.set(story_key, "True")
            redis_client.expire(story_key, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.send_message(story)
        print("Fetched %d news." % unseen)
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run(news_api_client):
    """Fetch news from NewsAPI, use Redis to remove repeats, and send the
    non-repeated articles to the scraper queue.

    Runs forever; sleeps SLEEP_TIME_IN_SECONDS between polls and logs a
    total/new count each round.
    """
    while True:
        news_list = news_api_client.getNewsFromSources(NEWS_SOURCES)
        num_of_new_news = 0
        num_of_total_news = 0
        for news in news_list:
            # Hash the title (could also use description, etc.), then
            # take the hex string as the dedup key.
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()
            num_of_total_news += 1
            if redis_client.get(news_digest) is None:
                num_of_new_news += 1
                news['digest'] = news_digest
                # Backfill a UTC timestamp when the article lacks one.
                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime(
                        "%Y-%m-%dT%H:%M:%SZ")
                # Stored value '1' is a placeholder; only key existence matters.
                redis_client.set(news_digest, '1')
                redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)
                # NOTE(review): 'cloundAMQP_client' looks like a typo of
                # 'cloudAMQP_client' — confirm the module-level name matches.
                cloundAMQP_client.sendMessage(news)
        logger.info("Fetched total %d news, including %d new news.",
                    num_of_total_news, num_of_new_news)
        cloundAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run():
    """Fetch news on a fixed cadence, dedupe by title MD5, publish via AMQP."""
    while True:
        incoming = news_api_client.getNewsFromSources(NEWS_SOURCES)
        published = 0
        for piece in incoming:
            # Hex MD5 of the title is the dedup key (a description may be
            # absent; title + description would be an alternative).
            token = hashlib.md5(piece['title'].encode('utf-8')).hexdigest()
            if redis_client.get(token) is not None:
                continue
            published += 1
            # Each article keeps its unique digest for downstream stages.
            piece['digest'] = token
            # Timestamps matter later on; supply one when it is missing.
            if piece['publishedAt'] is None:
                piece['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
            # Redis acts as a hash set here — the stored value is arbitrary.
            redis_client.set(token, "True")
            redis_client.expire(token, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.send_message(piece)
        print("Fetched %d news." % published)
        # Sleep via the AMQP client so the connection heartbeat stays alive;
        # this blocks the whole thread for SLEEP_TIME_IN_SECONDS.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
# NOTE(review): 'dbCfg' is presumably a file handle opened by a `with` that
# begins before this chunk — confirm against the preceding lines.
# NOTE(review): yaml.load without an explicit Loader is unsafe on untrusted
# input; prefer yaml.safe_load for config files.
db_config = yaml.load(dbCfg)
with open(CLOUDAMQP_CONFIG_FILE, 'r') as amqpCfg:
    cloudAMQP_config = yaml.load(amqpCfg)
with open(NEWS_CONFIG_FILE, 'r') as newsCfg:
    news_config = yaml.load(newsCfg)
# loading configuration from yaml files
redis_client = redis.StrictRedis(db_config['redis']['host'], db_config['redis']['port'])
cloudAMQP_client = CloudAMQPClient(cloudAMQP_config['url'], cloudAMQP_config['scrape_queue_name'])
# Main polling loop: fetch, dedupe on title digest, tag missing timestamps.
while True:
    news_list = news_api_client.getNewsFromSources(news_config['news_sources'])
    num_of_new_news = 0
    for news in news_list:
        # NOTE(review): .digest().encode('base64') is Python 2-only (bytes-to-
        # str codec); on Python 3 use base64.b64encode or .hexdigest().
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest
            # If 'publishedAt' is None, set it to current UTC time
            if news['publishedAt'] is None:
                # Make the time in format YYYY-MM-DDTHH:MM:SSZ in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
            # NOTE(review): chunk appears truncated here — the Redis set/expire
            # and queue publish presumably follow beyond this view.
# TODO: use your own queue. SCRAPE_NEWS_TASK_QUEUE_URL = "" SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue" NEWS_SOURCES = [ 'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly', 'espn', 'ign', 'techcrunch', 'the-new-york-times', 'the-wall-street-journal', 'the-washington-post' ] redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT) cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME) while True: news_list = news_api_client.getNewsFromSources(NEWS_SOURCES) num_of_new_news = 0 for news in news_list: news_digest = hashlib.md5( news['title'].encode('utf-8')).digest().encode('base64') if redis_client.get(news_digest) is None: num_of_new_news = num_of_new_news + 1 news['digest'] = news_digest if news['publishedAt'] is None: # Make the time in format YYYY-MM-DDTHH:MM:SS in UTC. news['publishedAt'] = datetime.datetime.utcnow().strftime( '%Y-%m-%dT%H:%M:%SZ')
import news_api_client as client


def test_basic():
    """Smoke-test getNewsFromSources with and without filter arguments.

    Fixed: the bare words `test pass with argument` were a syntax error
    (a comment that lost its `#`) — restored as a comment.
    """
    news = client.getNewsFromSources()
    print(news)
    assert len(news) > 0
    # test pass with argument
    news = client.getNewsFromSources(sources=['ign'], sort_by='top')
    print(news)
    assert len(news) > 0
    print('test_basic passed!')


if __name__ == "__main__":
    test_basic()
def test_basic():
    """Basic smoke test: the default fetch must return at least one item.

    Fixed: the trailing bare words `test pass with argument` were a syntax
    error (a comment that lost its `#`) — restored as a comment. The
    snippet appears truncated; the filtered-fetch assertions presumably
    followed in the original.
    """
    news = client.getNewsFromSources()
    print(news)
    assert len(news) > 0
    # test pass with argument