Ejemplo n.º 1
0
def test_basic():
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)

    sentMsg = {'test':'demo'}
    client.sendMessage(sentMsg)
    client.sleep(10)
    receivedMsg = client.getMessage()
    assert sentMsg == receivedMsg
    print 'test_basic passed!'
Ejemplo n.º 2
0
    'techcrunch',
    'the-new-york-times',
    'the-wall-street-journal',
    'the-washington-post'
]

# Module-level clients: Redis holds short-lived "already seen" keys,
# CloudAMQP carries scrape tasks to downstream workers.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

# Poll the news API forever; enqueue only articles not seen within the Redis TTL.
while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0
    for news in news_list:
        # A digest of the title identifies an article across repeated fetches.
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')

        # Absent from Redis (or expired) -> treat as new.
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # Stamp missing publish times with "now"; format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')

            # Remember the digest so subsequent polls skip this article;
            # the TTL bounds Redis memory use.
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)

            cloudAMQP_client.sendMessage(news)

    print "Fetched %d new news." % num_of_new_news
    # NOTE(review): constant name is misspelled ("SECOUNDS") — it must match
    # its definition elsewhere in this file, so it is left as-is here.
    cloudAMQP_client.sleep(SLEEP_TIME_IN_SECOUNDS)
Ejemplo n.º 3
0
# Queue clients: consume scrape tasks, publish scraped articles for de-duplication.
dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return
    task = msg
    
    article = Article(task['url'])
    article.download()
    article.parse()

    print article.text

    task['text'] = article.text
    dedupe_news_queue_client.sendMessage(task)
 
# Worker loop: pull one scrape task per iteration, process it, then back off.
while True:
    # fetch msg from queue
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            # Handle message
            try:
                handle_message(msg)
            except Exception as e:
                # Report and keep consuming; one bad article must not kill the worker.
                print e
                pass
        scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
Ejemplo n.º 4
0
        print 'message from news_to_scrape is broken'
        logging.error('news_fetcher: message from news_to_scrape is broken')
        return
    
    # use Newspaper to scrape the text of news
    task = msg
    text = None
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    
    # send this news to mq
    dedupe_news_queue_client.sendMessage(task)
    logging.info('news_fetcher: news text scraped, loaded and sent to news_to_dedupe queue')

# Worker loop: drain scrape tasks, then give each queue client its sleep interval.
while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            logging.info('news_fetcher: news task aquired from news_to_scrape queue')
            try:
                handle_message(msg)
            except Exception as e:
                # Swallow per-message failures so the worker stays up.
                print 'news_fetcher exception: %s' % e
                logging.warning('news_fetcher: exception: %s' % e)
                pass
        scrape_news_queue_client.sleep(scrape_sleeptime_seconds)

    # NOTE(review): presumably sleep() doubles as a connection keep-alive for
    # the dedupe client between sends — confirm CloudAMQPClient.sleep semantics.
    if dedupe_news_queue_client is not None:
        dedupe_news_queue_client.sleep(dedupe_sleeptime_seconds)
Ejemplo n.º 5
0
# Clients: Redis for de-duplication keys, CloudAMQP for scrape tasks.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

# Poll the news API forever; only previously unseen articles are queued.
while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCE)

    num_of_new_news = 0

    for news in news_list:
        # A digest of the title identifies an article across polls.
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        # Absent from Redis (or expired) -> new article.
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # Stamp missing publish times with "now";
                # format: YYYY-MM-DDTHH:MM:SSZ in UTC (e.g. 2017-04-07T16:09:35Z)
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')

            # Mark seen; the expiry bounds Redis memory and re-allows old titles.
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECOND)

            cloudAMQP_client.sendMessage(news)

    print "Fetched %d new news" % num_of_new_news
    cloudAMQP_client.sleep(SLEEP_IN_SECOND)
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Scrape the article text for one news task and send it to the dedupe queue.

    msg is expected to be a task dict containing at least 'url'; anything else
    (including None) is rejected with a diagnostic print and an early return.
    """
    # isinstance(None, dict) is False, so this single check also rejects None
    # (the original's separate `msg is None` test was redundant).
    if not isinstance(msg, dict):
        print('message is broken')
        return

    task = msg

    # Download and parse the page with newspaper's Article.
    # (Removed the original's dead local `text = None` — it was never used.)
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text

    dedupe_news_queue_client.sendMessage(task)


# Worker loop: pull scrape tasks one at a time and back off between polls.
while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            # Parse and process the task.
            try:
                handle_message(msg)
            except Exception as e:
                # Keep the worker alive; a single bad task is only reported.
                print(e)
                pass
        scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
Ejemplo n.º 7
0
    elif description is not None:
        topic = news_topic_modeling_service_client.classify(description)
        task['class'] = topic

    db[config['news_deduper']['NEWS_TABLE_NAME']].replace_one(
        {'digest': task['digest']}, task, upsert=True)

    # logging.basicConfig(level=logging.INFO,
    #             format='%(asctime)s %(filename)s%(message)s',
    #             datefmt='%a %d %b %Y %H:%M:%S' + ',',
    #             filename='../logging/news_pipeline.log',
    #             filemode='a')
    # logging.info(', ' +
    #              'event_name : ' + 'news_dedupe' + ', ' +
    #              'queue_name : ' + str(config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_NAME']) + ', ' +
    #              'news_id : ' + str(task['digest']))


# Deduper loop: consume article tasks, persist via handle_message, sleep per config.
while True:
    if cloudAMQP_client is not None:
        msg = cloudAMQP_client.getMessage()
        if msg is not None:
            # Parse and process the task
            try:
                handle_message(msg)
            except Exception as e:
                # Report and continue; the consumer must not die on one bad task.
                print e
                pass

        cloudAMQP_client.sleep(config['news_deduper']['SLEEP_TIME_IN_SECONDS'])
                # Duplicated news. Ignore.
                print "Duplicated news. Ignore."
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic
    print "add one news"

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)


# Deduper loop: consume parsed-article tasks and store them via handle_message.
while True:
    if cloudAMQP_client is not None:
        msg = cloudAMQP_client.getMessage()
        if msg is not None:
            # Parse and process the task
            try:
                handle_message(msg)
            except Exception as e:
                # Report and continue consuming.
                print e
                pass

        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
Ejemplo n.º 9
0
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print 'duplicate news. ignore'
                return
    
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify new news as it's being deduped. 
    if task['description'] is None:
        task['description'] = task['title']
    
    if task['title'] is not None: 
        topic = news_topic_modeling_service_client.classify(task['description'])
        task['class'] = topic

    db[MONGODB_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)

# Deduper loop: pull messages and persist them; errors are reported, not fatal.
while True:
    if dedupe_queue_client is not None:
        msg = dedupe_queue_client.getMessage()
        if msg is not None:
            # parse and proceed with task
            try:
                handle_message(msg)
            except Exception as e:
                print 'error while handling message in deduper: %s' % e
                pass

        dedupe_queue_client.sleep(SLEEP_TIMEOUT_IN_SECONDS)
Ejemplo n.º 10
0
# Queue names and the scraper's sleep interval come from the shared config file.
DEDUPE_TASK_QUEUE_NAME = config['cloudAMQP']['deduperTaskQueue']['name']

SLEEP_IN_SECONDS = config['cloudAMQP']['scraperTaskQueue']['sleep']

# One client per direction: consume scrape tasks, publish dedupe tasks.
scrape_task_mq_client = CloudAMQPClient(SCRAPE_TASK_QUEUE_URL, SCRAPE_TASK_QUEUE_NAME)
dedupe_task_mq_client = CloudAMQPClient(DEDUPE_TASK_QUEUE_URL, DEDUPE_TASK_QUEUE_NAME)

def handle_message(msg):
    """Fetch the article behind one scrape task and forward it for de-duplication."""
    # A usable task must be a dict; isinstance(None, dict) is False, so None
    # is rejected here as well.
    if not isinstance(msg, dict):
        print('message is invalid')
        return

    scrape_task = msg

    # newspaper's Article handles both the download and the parse.
    page = Article(scrape_task['url'])
    page.download()
    page.parse()
    scrape_task['text'] = page.text
    dedupe_task_mq_client.sendMessage(scrape_task)

# Scraper loop: handle one task per iteration, then back off.
while True:
    if scrape_task_mq_client is not None:
        msg = scrape_task_mq_client.getMessage()
        if msg is not None:
            try:
                handle_message(msg)
            except Exception as ex:
                # Report and continue; one bad task must not stop the worker.
                print(ex)
                pass
        scrape_task_mq_client.sleep(SLEEP_IN_SECONDS)
    # if news['source'] != 'cnn':
    #     print "News Source is not CNN, cannot handle!"
    # else:
    #     print 'scrape cnn news'
    #     text = CNN.extract_news_text(news['url'])

    # scraper news via newspaper API
    article = Article(news['url'])
    article.download()  # == request.get
    article.parse() 

    text = article.text.encode('utf-8')
    news_task['text'] = text

    DEDEUPER_MQ_CLIENT.send_message(news_task)
    print "[x] Sent msg to %s : %s" % (DEDUP_QUEUE_NAME, text)
    print news['url']


# Consumer loop: receive news tasks and hand each to the handler.
while True:
    if NEWS_TASK_MQ_CLIENT is not None:
        news = NEWS_TASK_MQ_CLIENT.receive_message()

        if news is not None:
            try:
                newsHanlder(news)
            except Exception as e:
                # Best-effort: report and move on to the next message.
                # (Handler name "newsHanlder" is misspelled but defined elsewhere.)
                print "newsHanlder wrong"
                pass
    # NOTE(review): this sleep sits OUTSIDE the None guard above, so it raises
    # AttributeError if NEWS_TASK_MQ_CLIENT is None. Sibling loops in this file
    # keep the sleep inside the guard — confirm intent before changing.
    NEWS_TASK_MQ_CLIENT.sleep(SLEEP_SECONDS)
Ejemplo n.º 12
0
        print 'scraping CNN news'
        text = cnn_news_scraper.extract_news(task['url'])
    else:
        print 'news source [%s] is not supported' % task['source'] 

    task['text'] = text '''
    # print 'message numbers:' + dedupe_news_queue_client.getMessage_count()
    dedupe_news_queue_client.sendMessage(task)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s%(message)s',
                        datefmt='%a %d %b %Y %H:%M:%S' + ',',
                        filename='../logging/news_pipeline.log',
                        filemode='a')
    logging.info(', ' + 'event_name : ' + 'get_news_text_from_source' + ', ' +
                 'queue_name : ' +
                 str(config['news_fecher']['SCRAPE_NEWS_TASK_QUEUE_NAME']) +
                 ', ' + 'news_id : ' + str(task['digest']))


while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            try:
                handle_message(msg)
            except Exception as e:
                print
                pass
        scrape_news_queue_client.sleep(
            config['news_fecher']['SLEEP_TIME_IN_SECONDS'])
Ejemplo n.º 13
0
# Monitor configuration: sources to poll, Redis TTL, and the poll interval.
news_sources = news_monitor_config['news_sources']
news_timeout_seconds = int(news_monitor_config['news_timeout_seconds'])
sleeptime_seconds = int(news_monitor_config['scrape_queue_client_sleeptime_seconds'])

while True:
    # such a step takes a list of latest news task, but most of them could be old duplicates
    news_list = news_api_client.getNewsFromSource(news_sources)
    num_of_new_news = 0

    for news in news_list:
        # A digest of the title identifies an article across polls.
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # Stamp missing publish times with "now" in UTC (YYYY-MM-DDTHH:MM:SSZ).
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')

            # first level anti-duplicate by redis: only report new news tasks
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, news_timeout_seconds)

            cloudAMQP_client.sendMessage(news)

    print 'Fetched %d news.' % num_of_new_news
    if num_of_new_news != 0:
        logging.info('news_monitor: Fetched %d news and sent to news_to_scrape queue.' % num_of_new_news)

    cloudAMQP_client.sleep(sleeptime_seconds)
        print "~~~~~~~~~~~~~~ text -> documents success"

        #calculate the tfidf values
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim.A

        rows, cols = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print 'Warning~~~~~~ duplicated news'
                return

    msg_task['publishedAt'] = parser.parse(msg_task['publishedAt'])
    db[NEWS_TABLE_NAME].replace_one({'digest': msg_task['digest']},
                                    msg_task,
                                    upsert=True)


# Consumer loop: receive scraped-article messages and hand each to msgHandler.
while True:
    if DEDEUPER_MQ_CLIENT is not None:
        msg = DEDEUPER_MQ_CLIENT.receive_message()

        if msg is not None:
            try:
                msgHandler(msg)
            except Exception as e:
                # Best-effort: report and keep consuming.
                print "msgHanlder wrong"
                pass
    # NOTE(review): this sleep sits OUTSIDE the None guard above, so it raises
    # AttributeError if DEDEUPER_MQ_CLIENT is None. Sibling loops keep the
    # sleep inside the guard — confirm intent before changing.
    DEDEUPER_MQ_CLIENT.sleep(SLEEP_SECONDS)
Ejemplo n.º 15
0
        config['news_monitor']['NEWS_SOURCES'])

    num_of_news_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_news_news = num_of_news_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")

            redis_client.set(news_digest, "True")
            redis_client.expire(
                news_digest,
                config['news_monitor']['NEWS_TIME_OUT_IN_SECONDS'])

            cloudAMQP_client.sendMessage(news)

    print "Fetched %d news." % num_of_news_news
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        filename='NumbersOfNews.log')
    logging.info('We have %d news monitored' % num_of_news_news)

    cloudAMQP_client.sleep(config['news_monitor']['SLEEP_TIME_IN_SECONDS'])