def test_basic():
    """Smoke-test news_api_client: default fetch, then a source-filtered fetch.

    Fix: converted Python 2 print statements to Python 3 print() calls,
    matching the py3 style used elsewhere in this file.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news) > 0
    print('news_api_client passed test')
def test_basic():
    """Verify getNewsFromSource returns non-empty results with and without a source filter."""
    default_news = client.getNewsFromSource()
    print(default_news)
    assert len(default_news) > 0
    filtered_news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(filtered_news) > 0
    print("test_basic passed!")
def test():
    """Check the default fetch and a bloomberg/top-sorted fetch both return news."""
    articles = client.getNewsFromSource()
    print(articles)
    assert len(articles) > 0
    articles = client.getNewsFromSource(sources=['bloomberg'], sortBy='top')
    assert len(articles) > 0
    print('test_basic passed!')
def test_basic():
    """Smoke-test the news client with and without an explicit source.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    # NOTE(review): sibling tests pass sources=['bbc-news'] as a keyword list;
    # confirm the client also accepts a bare positional string as used here.
    news = client.getNewsFromSource('bbc-news')
    assert len(news) > 0
    print('test_basic passed!')
def test_basic():
    """Require non-empty results for the default fetch and a filtered, top-sorted fetch."""
    results = client.getNewsFromSource()
    assert len(results) > 0
    results = client.getNewsFromSource(sources=['bbc-news'], sortBy='top')
    assert len(results) > 0
    print('test_basic passed!')
def test_basic():
    """Smoke-test the default fetch and a cnn/top-sorted fetch.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['cnn'], sortBy='top')
    assert len(news) > 0
    print('test_basic passed!')
def test_basic():
    """Fetch with defaults, then with sources=['bbc-news']; both must be non-empty.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    Runtime message text kept byte-for-byte.
    """
    news = client.getNewsFromSource()
    # NOTE(review): message says "cnn" but this call uses the client default
    # sources — confirm the intended wording.
    print("get cnn passed")
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news) > 0
    print('get bbc passed!')
def basic_test():
    """Fetch with defaults, then bbc-news sorted by top; print both result sets."""
    fetched = client.getNewsFromSource()
    print(fetched)
    assert len(fetched) > 0
    fetched = client.getNewsFromSource(sources=["bbc-news"], sortBy="top")
    assert len(fetched) > 0
    print(fetched)
    print("New Api Basic Test Complete!")
def test_basic():
    """Trace entry, then validate default and bbc-news fetches.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    print('enter fun')
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news) > 0
    print('test_basic passed!')
def test_basic():
    """Basic test: both fetch variants must return a non-None result."""
    response = client.getNewsFromSource()
    print(response)
    assert response is not None
    response = client.getNewsFromSource(sources=['cnn'], sortBy='top')
    assert response is not None
    print('Test passed!')
def test_news():
    """Validate the default fetch and the bbc-news fetch, printing both.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    print(news)
    assert len(news) > 0
def test():
    """Smoke-test both fetch variants.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    # Truthiness check: non-empty list passes (kept as in original).
    assert len(news)
    print('test passed!')
def test_basic():
    """Print and validate both the default fetch and the bbc-news fetch.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    news = client.getNewsFromSource()
    print(news)
    print('\n')
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    print(news)
    assert len(news) > 0
    print('test_basic passed!')
def test_basic():
    """Test news_api_client: default fetch plus a cnn fetch sorted by publishedAt.

    Fix: the original mixed a Python 3 print() call with a Python 2 print
    statement; unified on Python 3 syntax.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    # NOTE(review): sibling tests use the keyword 'sortBy'; confirm the client
    # actually accepts 'sort_by' as used here.
    news = client.getNewsFromSource(sources=['cnn'], sort_by='publishedAt')
    assert len(news) > 0
    print('test_basic passed!')
def test_basic():
    """Validate default (cnn) and bbc-news fetches with progress messages.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    print("cnn news successfully loaded!")
    news = client.getNewsFromSource(sources=['bbc-news'])
    print(news)
    assert len(news) > 0
    print("bbc-news successfully loaded!")
def test_basic():
    """Fetch news for each source listed in config['newsApi']['sources'].

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    # NOTE(review): encode('ascii') yields bytes on Python 3 (it returned str
    # on Python 2) — confirm the client accepts bytes ids, else drop the encode.
    sources = [source.encode('ascii') for source in config["newsApi"]["sources"]]
    print(sources)
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=sources)
    assert len(news) > 0
    print('test_basic passed!')
def test_basic():
    """Exercise getNewsFromSource with and without explicit parameters.

    Fix: converted Python 2 print statements to Python 3 print() calls.
    """
    # test the function without using parameters
    news = client.getNewsFromSource()
    print(news)
    # if news is not none then pass
    assert len(news) > 0
    # test the function using parameters
    news = client.getNewsFromSource(
        sources=['cnn'], sortBy='top')  # cnn doesn't support latest sorting
    assert len(news) > 0
    print('test_basic passed')
def run():
    """Poll the configured news sources forever, forwarding unseen articles to the scrape queue."""
    while True:
        fetched = news_api_client.getNewsFromSource(NEWS_SOURCES)
        fresh_count = 0
        for article in fetched:
            # Hex MD5 of the title is the dedupe key in Redis.
            digest = hashlib.md5(article['title'].encode('utf-8')).hexdigest()
            if redis_client.get(digest) is not None:
                continue  # already seen within the timeout window
            fresh_count += 1
            article['digest'] = digest
            if article['publishedAt'] is None:
                article['publishedAt'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
            redis_client.set(digest, "True")
            redis_client.expire(digest, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.sendMessage(article)
        print("Fetched %d news." % fresh_count)
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run():
    """Continuously pull latest news, skip Redis-known duplicates, enqueue the rest."""
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
        fresh = 0
        for item in news_list:
            # MD5 of the title, as a hex string, uniquely identifies a story.
            key = hashlib.md5(item['title'].encode('utf-8')).hexdigest()
            # Present in Redis means we already reported this story.
            if redis_client.get(key) is not None:
                continue
            fresh += 1
            item['digest'] = key
            # Backfill a UTC timestamp when the API omitted one.
            if item['publishedAt'] is None:
                item['publishedAt'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
            # Value is irrelevant; only the key's presence matters.
            redis_client.set(key, "True")
            redis_client.expire(key, NEWS_TIME_OUT_IN_SECONDS)
            # Hand the story to the next pipeline stage.
            cloudAMQP_client.sendMessage(item)
        print("Fetched %d news." % fresh)
        # Sleep via the AMQP client so the connection heartbeat stays alive.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def __call__(self): self.redis_client = redis.StrictRedis(self.redis_server_host, self.redis_server_port) self.cloudAMQP_client = CloudAMQPClient( self.scrape_news_task_queue_url, self.scrape_news_task_queue_name) news_list = news_api_client.getNewsFromSource(self.news_sources) print "call news monitor" num_of_new_news = 0 num_of_old_news = 0 for news in news_list: news_digest = hashlib.md5( news['title'].encode('utf-8')).digest().encode('base-64') if self.redis_client.get(news_digest) is None: num_of_new_news = num_of_new_news + 1 news['digest'] = news_digest if news['publishedAt'] is None: news['publishedAt'] = datetime.datetime.utcnow().strftime( '%Y-%m-%dT%H:%M:%SZZ') self.redis_client.set(news_digest, news) self.redis_client.expire(news_digest, self.news_timeout_redis_in_seconds) self.cloudAMQP_client.sendMessage(news) else: num_of_old_news = num_of_old_news + 1 print "Fetched %d new news. %d old news in redis" % (num_of_new_news, num_of_old_news) self.cloudAMQP_client.close()
def run():
    """Start news monitor"""
    while True:
        articles = news_api_client.getNewsFromSource(NEWS_SOURCES)
        new_count = 0
        for article in articles:
            # Dedupe key: hex MD5 of the title.
            digest = hashlib.md5(article['title'].encode('utf-8')).hexdigest()
            if REDIS_CLIENT.get(digest) is not None:
                continue
            new_count += 1
            article['digest'] = digest
            if article['publishedAt'] is None:
                article['publishedAt'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
            REDIS_CLIENT.set(digest, 'True')
            REDIS_CLIENT.expire(digest, NEWS_TIME_OUT_IN_SECONDS)
            CLOUD_AMQP_CLIENT.send_message(article)
        print("Fetched %d news." % new_count)
        CLOUD_AMQP_CLIENT.sleep(SLEEP_TIME_IN_SECONDS)
def test_basic():
    """Fetch by defaults, by an explicit source list, and by category list."""
    news = client.getNewsFromSource()
    NEWS_SOURCES = [
        'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
        'espn', 'ign', 'techcrunch', 'the-new-york-times',
        'the-wall-street-journal', 'the-washington-post'
    ]
    news_category = client.getNewsFromCategory()
    NEWS_CATEGORY = ['general', 'sports']
    print(news_category)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=NEWS_SOURCES)
    assert len(news) > 0
    news = client.getNewsFromCategory(categories=NEWS_CATEGORY)
    assert len(news) > 0
    print('[x] test basic new_api passed!')
def test_basic():
    """Fetch from 'the-verge' and require a non-empty result.

    Fixes: the original mixed print() with a Python 2 print statement
    (a SyntaxError under Python 3); unified on print(). Also removed the
    commented-out earlier version of the test.
    """
    news = client.getNewsFromSource(sources=['the-verge'])
    print(news)
    assert len(news) > 0
    print('test_basic passed!')
# NOTE(review): fragment — the opening of this source list, and the queue
# send / sleep at the end of the loop, lie outside the visible chunk.
'bloomberg', 'cnn', 'entertainment-weekly', 'espn', 'ign', 'techcrunch',
'the-new-york-times', 'the-wall-street-journal', 'the-washington-post'
]

# Clients for the dedupe store and the downstream scrape-task queue.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0
    for news in news_list:
        # Base64 of the raw MD5 digest (Python 2 str.encode('base64') codec).
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')
        # Absent from Redis means the story has not been reported yet.
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                # format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
            # Mark as seen, expiring after NEWS_TIME_OUT_IN_SECONDS.
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)
# ask for mq and memory configs mq_config = config_service_client.getMessagequeueConfigForUsecase('scrape_news_task') cloudAMQP_client = CloudAMQPClient(mq_config['queue_url'], mq_config['queue_name']) mmr_config = config_service_client.getMemoryConfig('redis') redis_client = redis.StrictRedis(mmr_config['host'], mmr_config['port']) # ask for other params news_monitor_config = config_service_client.getPipelineConfigForSection('news_monitor') news_sources = news_monitor_config['news_sources'] news_timeout_seconds = int(news_monitor_config['news_timeout_seconds']) sleeptime_seconds = int(news_monitor_config['scrape_queue_client_sleeptime_seconds']) while True: # such a step takes a list of latest news task, but most of them could be old duplicates news_list = news_api_client.getNewsFromSource(news_sources) num_of_new_news = 0 for news in news_list: news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64') if redis_client.get(news_digest) is None: num_of_new_news = num_of_new_news + 1 news['digest'] = news_digest if news['publishedAt'] is None: news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') # first level anti-duplicate by redis: only report new news tasks redis_client.set(news_digest, news) redis_client.expire(news_digest, news_timeout_seconds)
def test_basic():
    """Fetch with defaults, then with the configured source list.

    Fixes: converted the Python 2 print statement to print(); sources was
    passed as [NEWS_SOURCES], nesting the (list-valued elsewhere in this
    file) constant inside another list.
    """
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=NEWS_SOURCES, sortBy='top')
# Load pipeline configuration from YAML.
# NOTE(review): yaml.load without an explicit Loader executes arbitrary tags
# and is unsafe on untrusted input — prefer yaml.safe_load (confirm this
# config file is fully trusted before leaving as-is).
with open('../configuration/news_pipeline_conf.yaml', 'r') as stream:
    try:
        config = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

# Dedupe store and downstream scrape-task queue, configured from the YAML.
redis_client = redis.StrictRedis(config['news_monitor']['REDIS_HOST'],
                                 int(config['news_monitor']['REDIS_PORT']))
cloudAMQP_client = CloudAMQPClient(
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_URL'],
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_NAME'])

while True:
    news_list = news_api_client.getNewsFromSource(
        config['news_monitor']['NEWS_SOURCES'])
    num_of_news_news = 0
    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode(
                'base64')  # digest can be used as a unique ID
        if redis_client.get(news_digest) is None:
            '''new news coming in'''
            num_of_news_news = num_of_news_news + 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                # NOTE(review): fragment — this strftime call continues past
                # the visible chunk.
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
# Connection/config constants (credentials masked in source).
REDIS_PORT = 6379
SCRAPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@shark.rmq.cloudamqp.com/gvmlmvjt'
SCRAPE_NEWS_TASK_QUEUE_NAME = 'smart-news-scrape-task-queue'
NEWS_TIMEOUT_IN_SECONDS = 3600 * 24
SLEEP_IN_SECONDS = 1800

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
# NOTE(review): the class is spelled CloudAMPQClient ("AMPQ") here — confirm
# it matches the imported name.
CloudAMPQ_client = CloudAMPQClient(SCRAPE_NEWS_TASK_QUEUE_URL,SCRAPE_NEWS_TASK_QUEUE_NAME)

# NOTE(review): fragment — the queue send / sleep tail of this loop is
# outside the visible chunk.
while True:
    #src_list = news_api_client.getSources()
    news_list = news_api_client.getNewsFromSource()
    num_of_new = 0
    for news in news_list:
        # Base64 of the raw MD5 digest (Python 2 str.encode('base64') codec).
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new = num_of_new + 1
            news['digest'] = news_digest
            print(news_digest)
            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
                print(news)
            # Stored as JSON here (other variants store the dict directly).
            redis_client.set(news_digest, json.dumps(news))
            redis_client.expire(news_digest, NEWS_TIMEOUT_IN_SECONDS)
            print("[News-Monitor] SEND MSG : ")
            print(news)
def test():
    """Ensure the default fetch yields at least one article."""
    articles = client.getNewsFromSource()
    print(articles)
    assert len(articles) > 0
    print("test passed")
# Sources polled on every monitor cycle.
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]
# Request parameters passed through to the news client.
EVERYTHING_API = 'everything'
SORT_BY_TOP = 'publishedAt'
PAGE_SIZE = 100

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

# NOTE(review): fragment — the rest of the loop body (store, enqueue, sleep)
# is outside the visible chunk.
while True:
    news_list = news_api_client.getNewsFromSource(sources=NEWS_SOURCES,
                                                  api_name=EVERYTHING_API,
                                                  sort_by=SORT_BY_TOP,
                                                  pageSize=PAGE_SIZE)
    num_of_new_news = 0
    for news in news_list:
        # Base64 of the raw MD5 digest (Python 2 str.encode('base64') codec).
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new_news += 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
# Use your own Cloud AMQP queue SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/yruquhmv" SCRAPE_NEWS_TASK_QUEUE_NAME = "SCRAPE_NEWS_TASK" NEWS_SOURCES = [ 'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly', 'espn', 'ign', 'techcrunch', 'the-new-york-times', 'the-wall-street-journal', 'the-washington-post' ] redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT) cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME) while True: news_list = news_api_client.getNewsFromSource(NEWS_SOURCES) num_of_new_news = 0 for news in news_list: news_digest = hashlib.md5( news['title'].encode('utf-8')).digest().encode('base64') if redis_client.get(news_digest) is None: num_of_new_news = num_of_new_news + 1 news['digest'] = news_digest if news['publishedAt'] is None: # format: YYYY-MM-DDTHH:MM:SS in UTC news['publishedAt'] = datetime.datetime.utcnow().strftime( '%Y-%m-%dT%H:%M:%SZ')
# Polling interval and the single source this variant monitors.
SLEEP_TIME_OUT_IN_SECONDS = 10
NEWS_SOURCE = [
    'the-new-york-times'
]

# Connect redis
redis_client = redis.Redis(host = REDIS_HOST, port = REDIS_PORT)
# Connect CloudAMQP
cloudAMQP_client = CloudAMQPClient(AMQP_URL, SCRAPE_NEWS_QUEUE_NAME)

# NOTE(review): fragment — the queue send / sleep tail of this loop is
# outside the visible chunk.
while True:
    # Fetch the latest articles from the News API.
    articles = news_api_client.getNewsFromSource(NEWS_SOURCE)
    # count how many new news need to be saved
    num_of_new_news = 0
    for article in articles:
        # encoding title, store it into redis as the key
        # (Python 2 str.encode('base64') codec on the raw MD5 digest)
        article_digest = hashlib.md5(article['title'].encode('utf-8')).digest().encode('base64')
        # check whether redis already have it, if not, then store it into redis and send it to CloudAMQP QUEUE
        if redis_client.get(article_digest) is None:
            num_of_new_news = num_of_new_news + 1
            article['digest'] = article_digest
            # in case there is no this field, then add current time
            if article['publishedAt'] is None:
                article['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
            # insert into redis and set expiration time
            redis_client.set(article_digest, article)
            redis_client.expire(article_digest, NEWS_TIME_OUT_IN_SECONDS)