コード例 #1
0
def test_basic():
    """Smoke-test ``client.getNewsFromSources`` with and without arguments."""
    # Call with no arguments: a non-empty result is expected.
    default_news = client.getNewsFromSources()
    print(default_news)
    assert len(default_news) > 0

    # Call with an explicit source list and sort order.
    espn_news = client.getNewsFromSources(sources=['espn'], sortBy='latest')
    assert len(espn_news) > 0

    print('test_basic passed!')
コード例 #2
0
def test_basic():
    """Check that ``client.getNewsFromSources`` returns news in both call forms."""
    # No arguments.
    all_news = client.getNewsFromSources()
    print(all_news)
    assert len(all_news) > 0

    # Restricted to a single source.
    bbc_news = client.getNewsFromSources(sources=['bbc-news'])
    assert len(bbc_news) > 0

    print("test_basic passed.")
コード例 #3
0
def test_basic():
    """Smoke-test ``client.getNewsFromSources`` for default and explicit queries."""
    # Default query; the assertion carries a message for easier diagnosis.
    default_result = client.getNewsFromSources()
    print(default_result)
    assert len(default_result) > 0, 'not getting news'

    # Query a single source, sorted by 'top'.
    cnn_result = client.getNewsFromSources(sources=['cnn'], sortBy='top')
    assert len(cnn_result) > 0

    print('test_basic passed!')
コード例 #4
0
def test_basic():
    """Verify ``client.getNewsFromSources`` yields results with and without filters."""
    unfiltered = client.getNewsFromSources()
    print(unfiltered)
    assert len(unfiltered) > 0

    # Same call, now limited to one source with an explicit sort order.
    filtered = client.getNewsFromSources(sources=['cnn'], sortBy='top')
    assert len(filtered) > 0

    print('test passed')
コード例 #5
0
def test_basic():
    """Smoke-test ``client.getNewsFromSources``, printing each result set."""
    # Default arguments.
    default_news = client.getNewsFromSources()
    print(default_news)
    assert len(default_news) > 0

    # Source list passed positionally, sort order by keyword.
    ign_news = client.getNewsFromSources(['ign'], sortBy='top')
    print(ign_news)
    assert len(ign_news) > 0

    print('test_basic passed!')
コード例 #6
0
def test_basic():
    """Exercise ``client.getNewsFromSources`` with no arguments and with arguments."""
    # Case 1: no arguments at all.
    without_args = client.getNewsFromSources()
    print(without_args)
    assert len(without_args) > 0

    # Case 2: explicit sources and sort order.
    with_args = client.getNewsFromSources(sources=['cnn'], sortBy='top')
    assert len(with_args) > 0

    print('test_basic passed!')
コード例 #7
0
def test_basic():
    """Smoke-test ``client.getNewsFromSources`` for two individual sources.

    Prints every returned item whose ``'source'`` field equals ``'bbc-news'``
    and asserts that both queries return at least one item.

    The ``print`` calls use the single-argument function form so the test
    parses on Python 3 while printing the same text on Python 2.
    """
    news = client.getNewsFromSources(sources=['bbc-news'])
    for item in news:
        if item['source'] == 'bbc-news':
            print(item)
    assert len(news) > 0

    news = client.getNewsFromSources(sources=['bloomberg'])
    assert len(news) > 0

    print("test_basic passed.")
コード例 #8
0
ファイル: news_monitor.py プロジェクト: rz1113/TapNews
def run():
    """Poll the news API in an endless loop and queue articles not seen before.

    Every pass fetches the articles for ``NEWS_SOURCES``, identifies each one
    by the MD5 hex digest of its title, and consults Redis to decide whether
    it has already been handled. Unseen articles are tagged with their
    digest, given a UTC ``publishedAt`` when that field is ``None``, recorded
    in Redis with an expiry, and sent through ``cloudAMQP_client``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    while True:
        fresh_count = 0

        for article in news_api_client.getNewsFromSources(NEWS_SOURCES):
            # The MD5 hex digest of the title serves as the article's key.
            title_bytes = article['title'].encode('utf-8')
            digest = hashlib.md5(title_bytes).hexdigest()

            # Anything Redis already holds has been processed; skip it.
            if redis_client.get(digest) is not None:
                continue

            fresh_count += 1
            article['digest'] = digest

            # Fall back to the current UTC time when no timestamp is present.
            if article['publishedAt'] is None:
                article['publishedAt'] = datetime.datetime.utcnow().strftime(
                    timestamp_format)

            # Redis is used like a set with expiry; the stored value is unused.
            redis_client.set(digest, "True")
            redis_client.expire(digest, NEWS_TIME_OUT_IN_SECONDS)

            cloudAMQP_client.send_message(article)

        print("Fetched %d news." % fresh_count)

        # Sleep via the AMQP client; per the original note this keeps the
        # queue heartbeat alive.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
コード例 #9
0
def run():
    """Fetch news forever, forwarding only articles whose title is new.

    An article counts as new when the MD5 hex digest of its title has no
    entry in Redis. New articles get a ``'digest'`` field, a UTC
    ``'publishedAt'`` if that field is ``None``, an expiring Redis marker,
    and are then sent with ``cloudAMQP_client.sendMessage``.
    """
    while True:
        articles = news_api_client.getNewsFromSources(NEWS_SOURCES)
        # Number of previously unseen articles found in this pass.
        unseen_total = 0

        for article in articles:
            # Duplicates are detected by hashing the title.
            encoded_title = article['title'].encode('utf-8')
            title_hash = hashlib.md5(encoded_title).hexdigest()

            if redis_client.get(title_hash) is not None:
                continue

            unseen_total += 1
            article['digest'] = title_hash

            if article['publishedAt'] is None:
                utc_now = datetime.datetime.utcnow()
                article['publishedAt'] = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")

            # Mark the digest as seen and let the marker expire later.
            redis_client.set(title_hash, 'True')
            redis_client.expire(title_hash, NEWS_TIME_OUT_IN_SECONDS)

            cloudAMQP_client.sendMessage(article)

        logger.info('Fetched %d news.', unseen_total)
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
コード例 #10
0
def run():
    """Continuously pull news and enqueue articles with an unseen description.

    Articles whose ``'description'`` is ``None`` are skipped. For the rest,
    the MD5 hex digest of the description is the de-duplication key checked
    against Redis; unseen articles are annotated, recorded in Redis with an
    expiry, and sent with ``cloudAMQP_client.sendMessage``.
    """
    while True:
        new_count = 0

        for item in news_api_client.getNewsFromSources(NEWS_SOURCES):
            description = item['description']
            # The original note: the news API may fail to retrieve an article
            # (e.g. after a site is reformatted); such items are skipped.
            if description is None:
                continue

            # The digest is the key used to detect duplicates.
            digest = hashlib.md5(description.encode('utf-8')).hexdigest()
            if redis_client.get(digest) is not None:
                continue

            new_count += 1
            item['digest'] = digest

            if item['publishedAt'] is None:
                stamp = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
                item['publishedAt'] = stamp

            redis_client.set(digest, 'True')
            redis_client.expire(digest, NEWS_TIME_OUT_IN_SECONDS)

            cloudAMQP_client.sendMessage(item)

        print('--------------------')
        print('Fetched %d news sources' % new_count)

        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
コード例 #11
0
def run():
    """Poll the news API forever and enqueue articles that were not seen yet.

    For each article returned for ``NEWS_SOURCES`` the MD5 hex digest of the
    title is looked up in Redis. Unseen articles receive a ``'digest'``
    field, a UTC ``'publishedAt'`` when that field is ``None``, an expiring
    marker in Redis, and are sent with ``cloudAMQP_client.sendMessage``.
    """
    while True:
        news_list = news_api_client.getNewsFromSources(NEWS_SOURCES)

        num_of_new_news = 0

        for news in news_list:
            news_digest = hashlib.md5(news['title'].encode('utf-8')).hexdigest()

            if redis_client.get(news_digest) is None:
                num_of_new_news += 1
                news['digest'] = news_digest

                # A timestamp is added because the later de-duplication step
                # needs recent news, and some articles arrive without one.
                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

                # The value is irrelevant; only the key's presence matters.
                # A string is stored instead of the bool ``True`` because
                # redis-py 3.0+ raises DataError for bool values.
                redis_client.set(news_digest, 'True')
                redis_client.expire(news_digest, NEW_TIME_OUT_IN_SECOND)

                # Send the news to the queue.
                cloudAMQP_client.sendMessage(news)

        logger.info("Fetched %d news", num_of_new_news)
        # Pause between passes; the original note says every 10 seconds, which
        # depends on the value of SLEEP_TIME_IN_SECONDS.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
コード例 #12
0
def run():
    """Fetch news in an endless loop and enqueue articles with unseen titles.

    Articles without a title are printed and skipped. The MD5 hex digest of
    the title is the de-duplication key checked against Redis. Between
    passes the function blocks in ``time.sleep``.
    """
    while True:
        fetched = news_api_client.getNewsFromSources(NEWS_SOURCES)
        new_count = 0

        for article in fetched:
            title = article['title']
            if title is None:
                # Without a title no digest can be computed; report and skip.
                print("The news does not have title")
                print(article)
                continue

            digest = hashlib.md5(title.encode('utf-8')).hexdigest()
            if redis_client.get(digest) is not None:
                continue

            new_count += 1
            article['digest'] = digest

            if article['publishedAt'] is None:
                current_utc = datetime.datetime.utcnow()
                article['publishedAt'] = current_utc.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")

            redis_client.set(digest, 'True')
            redis_client.expire(digest, NEWS_TIME_OUT_IN_SECONDS)

            cloudAMQP_client.sendMessage(article)

        print("Fetched %d news." % new_count)

        # Wait before the next request. The original note says one request
        # per hour, which depends on NEWS_REQUEST_TIME_IN_SECONDS.
        time.sleep(NEWS_REQUEST_TIME_IN_SECONDS)
コード例 #13
0
def run():
    """Loop forever: fetch news, drop known titles, send the rest to the queue.

    A title is known when the MD5 hex digest of its UTF-8 bytes already has
    an entry in Redis. Each unseen article is tagged with the digest, given
    a UTC ``'publishedAt'`` if that field is ``None``, recorded in Redis with
    an expiry, and passed to ``cloudAMQP_client.send_message``.
    """
    iso_utc_format = "%Y-%m-%dT%H:%M:%SZ"

    while True:
        batch = news_api_client.getNewsFromSources(NEWS_SOURCES)
        sent = 0

        for entry in batch:
            key = hashlib.md5(entry['title'].encode('utf-8')).hexdigest()

            # Already recorded in Redis: nothing to do for this entry.
            if redis_client.get(key) is not None:
                continue

            sent += 1
            entry['digest'] = key

            if entry['publishedAt'] is None:
                entry['publishedAt'] = datetime.datetime.utcnow().strftime(
                    iso_utc_format)

            redis_client.set(key, "True")
            redis_client.expire(key, NEWS_TIME_OUT_IN_SECONDS)

            cloudAMQP_client.send_message(entry)

        print("Fetched %d news." % sent)

        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def run(news_api_client):
    """Fetch news, filter out repeats via Redis, and enqueue the remainder.

    Runs forever. Each pass asks ``news_api_client`` for the articles of
    ``NEWS_SOURCES``; an article is a repeat when the MD5 hex digest of its
    title already has an entry in Redis. Non-repeated articles are sent with
    ``cloundAMQP_client.sendMessage``. Both the total and the new article
    counts are logged after every pass.
    """
    while True:
        articles = news_api_client.getNewsFromSources(NEWS_SOURCES)

        seen_total = 0
        fresh_total = 0

        for article in articles:
            # The title is hashed here (another field such as the description
            # could serve as well) and rendered as a hex string.
            raw_title = article['title'].encode('utf-8')
            digest = hashlib.md5(raw_title).hexdigest()
            seen_total += 1

            if redis_client.get(digest) is not None:
                continue

            fresh_total += 1
            article['digest'] = digest

            if article['publishedAt'] is None:
                utc_now = datetime.datetime.utcnow()
                article['publishedAt'] = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")

            redis_client.set(digest, '1')
            redis_client.expire(digest, NEWS_TIME_OUT_IN_SECONDS)

            cloundAMQP_client.sendMessage(article)

        logger.info("Fetched total %d news, including %d new news.",
                    seen_total, fresh_total)

        cloundAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
コード例 #15
0
ファイル: news_monitor.py プロジェクト: zeeeengxin/iNews
def run():
    """Monitor the news API indefinitely and queue each unseen article once.

    Articles are identified by the MD5 hex digest of their title. Redis
    holds one expiring key per digest; an article whose digest is absent is
    tagged, timestamped if ``'publishedAt'`` is ``None``, recorded in Redis,
    and sent with ``cloudAMQP_client.send_message``.
    """
    while True:
        newly_seen = 0

        for story in news_api_client.getNewsFromSources(NEWS_SOURCES):
            # Hash the title and keep the hex string form. The original note
            # observes that a description may be absent and that title plus
            # description could be hashed instead.
            title_bytes = story['title'].encode('utf-8')
            story_digest = hashlib.md5(title_bytes).hexdigest()

            if redis_client.get(story_digest) is not None:
                continue

            newly_seen += 1
            # Keep the digest on the article itself.
            story['digest'] = story_digest

            # The timestamp is needed later on, so supply one when missing.
            if story['publishedAt'] is None:
                now_utc = datetime.datetime.utcnow()
                story['publishedAt'] = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")

            # Redis works as a set with expiry here; the value is unused.
            redis_client.set(story_digest, "True")
            redis_client.expire(story_digest, NEWS_TIME_OUT_IN_SECONDS)

            cloudAMQP_client.send_message(story)

        print("Fetched %d news." % newly_seen)

        # Sleep via the AMQP client; per the original note this keeps the
        # queue heartbeat alive while the thread is paused.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
コード例 #16
0
ファイル: news_monitor.py プロジェクト: LBsh/news_capstone
    db_config = yaml.load(dbCfg)

# Load the CloudAMQP settings from its YAML file.
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; consider yaml.safe_load if these files are not fully trusted.
with open(CLOUDAMQP_CONFIG_FILE, 'r') as amqpCfg:
    cloudAMQP_config = yaml.load(amqpCfg)

# Load the news settings (the polling loop reads 'news_sources' from it).
with open(NEWS_CONFIG_FILE, 'r') as newsCfg:
    news_config = yaml.load(newsCfg)

# Build the Redis and CloudAMQP clients from the loaded configuration.
redis_client = redis.StrictRedis(db_config['redis']['host'],
                                 db_config['redis']['port'])
cloudAMQP_client = CloudAMQPClient(cloudAMQP_config['url'],
                                   cloudAMQP_config['scrape_queue_name'])

# Main polling loop: fetch news for the configured sources and pick out the
# articles whose title digest has no entry in Redis yet.
# NOTE(review): this excerpt ends before any Redis write, queue send, or
# sleep; those steps are presumably further down - confirm in the full file.
while True:
    news_list = news_api_client.getNewsFromSources(news_config['news_sources'])
    num_of_new_news = 0

    for news in news_list:
        # Digest of the title, base64-encoded, used as the Redis lookup key.
        # NOTE(review): the 'base64' codec on .encode() is Python 2 only; on
        # Python 3 bytes has no .encode() - confirm the target interpreter.
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        # No Redis entry for this digest means the article is new.
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            # If 'publishedAt' is None, set it to current UTC time
            if news['publishedAt'] is None:
                # Make the time in format YYYY-MM-DDTHH:MM:SSZ in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
コード例 #17
0
# Connection settings for the scrape-task queue.
# TODO: use your own queue. The URL is left empty here and must be filled in.
SCRAPE_NEWS_TASK_QUEUE_URL = ""
SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue"

# Source identifiers passed to news_api_client.getNewsFromSources.
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

# Clients shared by the polling loop: Redis for digest lookups and the
# CloudAMQP client bound to the scrape-task queue.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

# Main polling loop: fetch news for NEWS_SOURCES and pick out the articles
# whose title digest has no entry in Redis yet.
# NOTE(review): this excerpt ends before any Redis write, queue send, or
# sleep; those steps are presumably further down - confirm in the full file.
while True:
    news_list = news_api_client.getNewsFromSources(NEWS_SOURCES)

    num_of_new_news = 0

    for news in news_list:
        # Digest of the title, base64-encoded, used as the Redis lookup key.
        # NOTE(review): the 'base64' codec on .encode() is Python 2 only; on
        # Python 3 bytes has no .encode() - confirm the target interpreter.
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        # No Redis entry for this digest means the article is new.
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # Use the current UTC time, formatted as YYYY-MM-DDTHH:MM:SSZ.
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
コード例 #18
0
import news_api_client as client 

def test_basic():
    """Smoke-test ``client.getNewsFromSources`` with and without arguments.

    Each call's result is printed and asserted to be non-empty.
    """
    # test pass with no argument
    news = client.getNewsFromSources()
    print(news)
    assert len(news) > 0
    # test pass with argument
    news = client.getNewsFromSources(sources=['ign'], sort_by='top')
    print(news)
    assert len(news) > 0
    print('test_basic passed!')

# Run the smoke test when this file is executed directly as a script.
if __name__ == "__main__":
    test_basic()
コード例 #19
0
def test_basic():
    news = client.getNewsFromSources()
    print(news)
    assert len(news) > 0
    test pass with argument