Example #1
def test_basic():
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news) > 0
    print 'news_api_client passed test'
Example #2
def test_basic():
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news) > 0
    print("test_basic passed!")
Example #3
def test():
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bloomberg'], sortBy='top')
    assert len(news) > 0
    print('test_basic passed!')
Example #4
def test_basic():
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0
    news = client.getNewsFromSource('bbc-news')
    assert len(news) > 0
    print 'test_basic passed!'
Example #5
def test_basic():
    news = client.getNewsFromSource()
    # print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'], sortBy='top')
    assert len(news) > 0
    print('test_basic passed!')
Example #6
def test_basic():
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['cnn'], sortBy='top')
    assert len(news) > 0
    print 'test_basic passed!'
Example #7
def test_basic():
    news = client.getNewsFromSource()
    print "get cnn passed"
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news) > 0
    print 'get bbc passed!'
Example #8
def basic_test():
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=["bbc-news"], sortBy="top")
    assert len(news) > 0
    print(news)
    print("New Api Basic Test Complete!")
Example #9
def test_basic():
    print 'enter fun'
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news) > 0
    print 'test_basic passed!'
Example #10
def test_basic():
    """Basic test"""
    news = client.getNewsFromSource()
    print(news)
    assert news is not None
    news = client.getNewsFromSource(sources=['cnn'], sortBy='top')
    assert news is not None
    print('Test passed!')
Example #11
def test_news():
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0

    news = client.getNewsFromSource(sources=['bbc-news'])
    print news
    assert len(news) > 0
Example #12
def test():
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0

    news = client.getNewsFromSource(sources=['bbc-news'])
    assert len(news)
    print 'test passed!'
Example #13
def test_basic():
    news = client.getNewsFromSource()
    print news
    print '\n'
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['bbc-news'])
    print news
    assert len(news) > 0
    print 'test_basic passed!'
Example #14
def test_basic():
    '''
    test news_api_client
    '''
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=['cnn'], sort_by='publishedAt')
    assert len(news) > 0
    print('test_basic passed!')
Example #15
def test_basic():
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0
    print "cnn news successfully loaded!"

    news = client.getNewsFromSource(sources=['bbc-news'])
    print news
    assert len(news) > 0
    print "bbc-news successfully loaded!"
Example #16
def test_basic():
    sources = [
        source.encode('ascii') for source in config["newsApi"]["sources"]
    ]
    print sources
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0
    news = client.getNewsFromSource(sources=sources)
    assert len(news) > 0
    print 'test_basic passed!'
Example #17
def test_basic():
    # test the function without using parameters
    news = client.getNewsFromSource()
    print news
    # pass if the news list is non-empty
    assert len(news) > 0
    # test the function using parameters
    news = client.getNewsFromSource(
        sources=['cnn'], sortBy='top')  # cnn doesn't support latest sorting
    assert len(news) > 0
    print 'test_basic passed'
Example #18
def run():
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

        num_of_news_news = 0

        for news in news_list:
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()

            if redis_client.get(news_digest) is None:
                num_of_news_news = num_of_news_news + 1
                news['digest'] = news_digest

                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime(
                        "%Y-%m-%dT%H:%M:%SZ")

                redis_client.set(news_digest, "True")
                redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)

                cloudAMQP_client.sendMessage(news)

        print("Fetched %d news." % num_of_news_news)

        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
Example #19
def run():
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
        num_of_news_news = 0

        for news in news_list:
            # calculate the MD5 digest and convert it to a hex string with hexdigest()
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()
            # check in redis
            if redis_client.get(news_digest) is None:
                num_of_news_news = num_of_news_news + 1
                # every news has unique digest
                news['digest'] = news_digest

                # use utc time to avoid different time zones
                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime(
                        "%Y-%m-%dT%H:%M:%SZ")

                # use as hash set, value does not matter
                redis_client.set(news_digest, "True")
                redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)
                # send message to queue for the next task
                cloudAMQP_client.sendMessage(news)

        print("Fetched %d news." % num_of_news_news)
        # use cloudAMQP_client.sleep to keep the queue connection heartbeat alive
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
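Both run() loops above de-duplicate by calling redis_client.get(...), then set and expire separately. With redis-py the same check-and-mark step can be done in one atomic call; the helper below is a hypothetical refactor (the name mark_if_new and its arguments are not from the original code):

import hashlib


def mark_if_new(redis_client, title, ttl_seconds):
    # Return the title digest if it has not been seen within ttl_seconds,
    # otherwise return None.
    digest = hashlib.md5(title.encode('utf-8')).hexdigest()
    # nx=True only writes when the key does not exist yet (and returns True
    # in that case); ex= attaches the expiry in the same call, replacing the
    # separate set/expire pair used above.
    is_new = redis_client.set(digest, "True", nx=True, ex=ttl_seconds)
    return digest if is_new else None

Inside the loop this would replace the get/set/expire trio with a single mark_if_new(redis_client, news['title'], NEWS_TIME_OUT_IN_SECONDS) call.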
Example #20
    def __call__(self):
        self.redis_client = redis.StrictRedis(self.redis_server_host,
                                              self.redis_server_port)
        self.cloudAMQP_client = CloudAMQPClient(
            self.scrape_news_task_queue_url, self.scrape_news_task_queue_name)
        news_list = news_api_client.getNewsFromSource(self.news_sources)
        print "call news monitor"
        num_of_new_news = 0
        num_of_old_news = 0

        for news in news_list:
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).digest().encode('base-64')

            if self.redis_client.get(news_digest) is None:
                num_of_new_news = num_of_new_news + 1
                news['digest'] = news_digest

                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime(
                        '%Y-%m-%dT%H:%M:%SZ')

                self.redis_client.set(news_digest, news)
                self.redis_client.expire(news_digest,
                                         self.news_timeout_redis_in_seconds)

                self.cloudAMQP_client.sendMessage(news)
            else:
                num_of_old_news = num_of_old_news + 1

        print "Fetched %d new news. %d old news in redis" % (num_of_new_news,
                                                             num_of_old_news)
        self.cloudAMQP_client.close()
Example #21
def run():
    """Start news monitor"""
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
        num_of_new_news = 0

        for news in news_list:
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()

            if REDIS_CLIENT.get(news_digest) is None:
                num_of_new_news = num_of_new_news + 1
                news['digest'] = news_digest

                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime(
                        "%Y-%m-%dT%H:%M:%SZ")

                REDIS_CLIENT.set(news_digest, 'True')
                REDIS_CLIENT.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)

                CLOUD_AMQP_CLIENT.send_message(news)
        print("Fetched %d news." % num_of_new_news)

        CLOUD_AMQP_CLIENT.sleep(SLEEP_TIME_IN_SECONDS)
Example #22
def test_basic():
    news = client.getNewsFromSource()
    NEWS_SOURCES = [
        'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
        'espn', 'ign', 'techcrunch', 'the-new-york-times',
        'the-wall-street-journal', 'the-washington-post'
    ]
    news_category = client.getNewsFromCategory()
    NEWS_CATEGORY = ['general', 'sports']
    # print(news)
    print(news_category)
    assert len(news) > 0
    news = client.getNewsFromSource(sources=NEWS_SOURCES)
    assert len(news) > 0
    news = client.getNewsFromCategory(categories=NEWS_CATEGORY)
    assert len(news) > 0
    print('[x] test basic new_api passed!')
Example #23
def test_basic():
  # news = client.getNewsFromSource()
  # print news
  # assert len(news) > 0
  # news = client.getNewsFromSource(sources=['bbc-news'])
  # assert len(news) > 0
  news = client.getNewsFromSource(sources=['the-verge'])
  print(news)
  assert len(news) > 0
  print('test_basic passed!')
Example #24
    'bloomberg',
    'cnn',
    'entertainment-weekly',
    'espn',
    'ign',
    'techcrunch',
    'the-new-york-times',
    'the-wall-street-journal',
    'the-washington-post'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0
    for news in news_list:
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')

            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)
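Several of the monitor loops (this one and Examples #20, #25, #27, #28, #30, #31 and #32) build the Redis key with .digest().encode('base64'), which only works on Python 2; in Python 3 bytes has no encode method and the base64 codec is gone. A rough Python 3 equivalent of that key construction (the helper name title_digest is illustrative):

import base64
import hashlib


def title_digest(title):
    # Same MD5-then-base64 key as above, minus the trailing newline that
    # Python 2's base64 codec appends.
    raw = hashlib.md5(title.encode('utf-8')).digest()
    return base64.b64encode(raw).decode('ascii')

The hexdigest() form used in Examples #18, #19 and #21 runs unchanged on both Python versions.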
Example #25
# ask for mq and memory configs
mq_config = config_service_client.getMessagequeueConfigForUsecase('scrape_news_task')
cloudAMQP_client = CloudAMQPClient(mq_config['queue_url'], mq_config['queue_name'])

mmr_config = config_service_client.getMemoryConfig('redis')
redis_client = redis.StrictRedis(mmr_config['host'], mmr_config['port'])

# ask for other params
news_monitor_config = config_service_client.getPipelineConfigForSection('news_monitor')
news_sources = news_monitor_config['news_sources']
news_timeout_seconds = int(news_monitor_config['news_timeout_seconds'])
sleeptime_seconds = int(news_monitor_config['scrape_queue_client_sleeptime_seconds'])

while True:
    # this step fetches a list of the latest news items, but most of them may be old duplicates
    news_list = news_api_client.getNewsFromSource(news_sources)
    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
            
            # first-level de-duplication via redis: only forward new news tasks
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, news_timeout_seconds)
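Example #25 above, like Examples #20 and #24 above and #32 further down, passes the news dict straight to redis_client.set(...); Example #28 stores json.dumps(news) instead. Newer redis-py releases only accept bytes, str, int or float values, so the JSON round trip is the more portable pattern. A small sketch (the helper names are illustrative, not from the original code):

import json


def cache_news(redis_client, digest, news, ttl_seconds):
    # Serialize the dict; redis-py rejects raw dict values.
    redis_client.set(digest, json.dumps(news), ex=ttl_seconds)


def load_cached_news(redis_client, digest):
    raw = redis_client.get(digest)
    return json.loads(raw) if raw is not None else None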
Example #26
def test_basic():
    news = client.getNewsFromSource()
    print news
    assert len(news) > 0
    news = client.getNewsFromSource(sources=[NEWS_SOURCES], sortBy='top')
Example #27
with open('../configuration/news_pipeline_conf.yaml', 'r') as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

redis_client = redis.StrictRedis(config['news_monitor']['REDIS_HOST'],
                                 int(config['news_monitor']['REDIS_PORT']))

cloudAMQP_client = CloudAMQPClient(
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_URL'],
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_NAME'])

while True:
    news_list = news_api_client.getNewsFromSource(
        config['news_monitor']['NEWS_SOURCES'])

    num_of_news_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode(
                'base64')  # digest can be used as a unique ID

        if redis_client.get(news_digest) is None:
            # new news coming in
            num_of_news_news = num_of_news_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
Example #28
REDIS_PORT = 6379

SCRAPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@shark.rmq.cloudamqp.com/gvmlmvjt'
SCRAPE_NEWS_TASK_QUEUE_NAME = 'smart-news-scrape-task-queue'

NEWS_TIMEOUT_IN_SECONDS = 3600 * 24 
SLEEP_IN_SECONDS = 1800



redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
CloudAMPQ_client = CloudAMPQClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    #src_list = news_api_client.getSources()
    news_list = news_api_client.getNewsFromSource()
    num_of_new = 0
    for news in news_list:

        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new = num_of_new + 1
            news['digest'] = news_digest
            print(news_digest)
            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
            print(news)
            redis_client.set(news_digest, json.dumps(news))
            redis_client.expire(news_digest, NEWS_TIMEOUT_IN_SECONDS)
            print("[News-Monitor] SEND MSG : ")
            print(news)
Example #29
def test():
    news = client.getNewsFromSource()
    print(news)
    assert len(news) > 0
    print("test passed")
Example #30
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]
EVERYTHING_API = 'everything'
SORT_BY_TOP = 'publishedAt'
PAGE_SIZE = 100

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(sources=NEWS_SOURCES,
                                                  api_name=EVERYTHING_API,
                                                  sort_by=SORT_BY_TOP,
                                                  pageSize=PAGE_SIZE)

    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news += 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
Example #31
# Use your own Cloud AMQP queue
SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/yruquhmv"
SCRAPE_NEWS_TASK_QUEUE_NAME = "SCRAPE_NEWS_TASK"

NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
Example #32
SLEEP_TIME_OUT_IN_SECONDS = 10

NEWS_SOURCE = [
    'the-new-york-times'
]

# Connect redis
redis_client = redis.Redis(host=REDIS_HOST,
                           port=REDIS_PORT)

# Connect CloudAMQP
cloudAMQP_client = CloudAMQPClient(AMQP_URL, SCRAPE_NEWS_QUEUE_NAME)

while True:
    # get news from the News API
    articles = news_api_client.getNewsFromSource(NEWS_SOURCE)
    # count how many new news items need to be saved
    num_of_new_news = 0
    for article in articles:
        # hash the title; the digest is used as the redis key
        article_digest = hashlib.md5(article['title'].encode('utf-8')).digest().encode('base64')
        # check whether redis already has it; if not, store it in redis and send it to the CloudAMQP queue
        if redis_client.get(article_digest) is None:
            num_of_new_news = num_of_new_news + 1
            article['digest'] = article_digest
            # if this field is missing, fall back to the current time
            if article['publishedAt'] is None:
                article['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
            # insert into redis and set expiration time
            redis_client.set(article_digest, article)
            redis_client.expire(article_digest, NEWS_TIME_OUT_IN_SECONDS)
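Every monitor above falls back to datetime.datetime.utcnow() when publishedAt is missing. On recent Python 3 releases utcnow() is deprecated in favor of timezone-aware datetimes; a sketch of the same fallback with the newer API (the helper name is illustrative):

import datetime


def utc_timestamp():
    # Current UTC time in the 'YYYY-MM-DDTHH:MM:SSZ' format used above.
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.strftime('%Y-%m-%dT%H:%M:%SZ')

# fallback as used in the loops above:
# if article.get('publishedAt') is None:
#     article['publishedAt'] = utc_timestamp()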