Ejemplo n.º 1
0
def test_basic():
    """Round-trip one message through the test queue.

    Publishes a small dict with ``sendMessage``, pauses via the client's
    own ``sleep`` and then reads it back with ``getMessage``; the test
    passes when the received payload equals the one that was sent.
    """
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)

    sent_msg = {'test': 'demo'}
    client.sendMessage(sent_msg)
    # Give the broker time to deliver before polling (argument is
    # presumably in seconds -- confirm against CloudAMQPClient.sleep).
    client.sleep(10)
    received_msg = client.getMessage()
    assert sent_msg == received_msg
    # Function-call form prints the same text on Python 2 and Python 3.
    print('test_basic passed!')
Ejemplo n.º 2
0
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim.A

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print "Duplicated news. Ignore."
                return
    task['publishedAt'] = parser.parse(task['publishedAt'])
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)


# Poll the queue forever: handle each message, then sleep between polls.
while True:
    if cloudAMQP_client is None:
        # No client: nothing to poll (the loop spins, as before).
        continue

    msg = cloudAMQP_client.getMessage()
    if msg is not None:
        # Parse and process the task; one failing task must not stop
        # the worker, so the error is reported and the loop continues.
        try:
            handle_message(msg)
        except Exception as err:
            # Function-call form works on both Python 2 and Python 3.
            print(err)

    cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
Ejemplo n.º 3
0
        print 'message from news_to_scrape is broken'
        logging.error('news_fetcher: message from news_to_scrape is broken')
        return
    
    # use Newspaper to scrape the text of news
    task = msg
    text = None
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    
    # send this news to mq
    dedupe_news_queue_client.sendMessage(task)
    logging.info('news_fetcher: news text scraped, loaded and sent to news_to_dedupe queue')

# Poll the scrape queue forever; each task is handed to handle_message().
while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            logging.info('news_fetcher: news task aquired from news_to_scrape queue')
            # A failing task is reported on stdout and in the log, then
            # the worker moves on to the next poll.
            try:
                handle_message(msg)
            except Exception as err:
                # Function-call form works on both Python 2 and Python 3.
                print('news_fetcher exception: %s' % err)
                # Lazy %-args: formatting is left to the logging module.
                logging.warning('news_fetcher: exception: %s', err)
        scrape_news_queue_client.sleep(scrape_sleeptime_seconds)

    if dedupe_news_queue_client is not None:
        # The dedupe client is only slept here, never read from in this
        # loop (presumably to keep its connection serviced -- confirm
        # against CloudAMQPClient.sleep).
        dedupe_news_queue_client.sleep(dedupe_sleeptime_seconds)
Ejemplo n.º 4
0
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print("Duplicated news. Ignore.")
                return
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)

# Worker loop: fetch a task, process it, sleep, repeat.
while True:
    if cloudAMQP_client is None:
        # Without a client there is nothing to fetch or sleep on.
        continue

    msg = cloudAMQP_client.getMessage()
    if msg is not None:
        try:
            # Parse and process the task.
            handle_message(msg)
        except Exception as err:
            # Report the failure and keep the worker running.
            print(err)

    cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
Ejemplo n.º 5
0
    # print article.cleaned_text

    task['text'] = article.cleaned_text

    # # Scraping CNN news
    # text = None
    # if task['source']['id'] == 'cnn':
    #     print "Scraping CNN news"
    #     text = cnn_news_scraper.extractNews(task['url'])
    # else:
    #     print "News source [%s] is not supported." % task['source']['name']
    #
    # task['text'] = text

    dedupe_news_queue_client.sendMessage(task)


# Poll the scrape queue forever, handing every message to handle_message().
while True:
    if scrape_news_queue_client is None:
        # No client: nothing to poll (the loop spins, as before).
        continue

    # Fetch msg from queue.
    msg = scrape_news_queue_client.getMessage()
    if msg is not None:
        # A failing message is reported and skipped so the loop survives.
        try:
            handle_message(msg)
        except Exception as err:
            # Function-call form works on both Python 2 and Python 3.
            print(err)
    scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
Ejemplo n.º 6
0
        documents = [str(news['text']) for news in recent_news_list]
        documents.insert(0, text)

        # cal tf-idf similarity
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim.A
        rows = pairwise_sim.shape[0]

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # duplicate news, ignore
                print 'Duplicate news. Ignore'
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])
    # replace if exist, else insert
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)

# Poll the dedupe queue forever, handing every message to handle_message().
while True:
    if dedupe_news_queue_client is None:
        # No client: nothing to poll (the loop spins, as before).
        continue

    msg = dedupe_news_queue_client.getMessage()
    if msg is not None:
        # Parse and process the task; errors are reported, not fatal.
        try:
            handle_message(msg)
        except Exception as err:
            # Single pre-formatted string: prints "handle_message error: <err>"
            # on both Python 2 and Python 3 (a multi-argument print() call
            # would print a tuple on Python 2).
            print('handle_message error: %s' % (err,))
    dedupe_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
Ejemplo n.º 7
0
# Name of the dedupe task queue, read from the 'cloudAMQP' config section.
DEDUPE_TASK_QUEUE_NAME = config['cloudAMQP']['deduperTaskQueue']['name']

# Pause between polls of the scrape queue, taken from the scraper queue's
# config entry (units presumably seconds, going by the constant's name).
SLEEP_IN_SECONDS = config['cloudAMQP']['scraperTaskQueue']['sleep']

# One client per queue: the polling loop reads tasks from the scrape client,
# and handle_message() publishes results through the dedupe client.
scrape_task_mq_client = CloudAMQPClient(SCRAPE_TASK_QUEUE_URL, SCRAPE_TASK_QUEUE_NAME)
dedupe_task_mq_client = CloudAMQPClient(DEDUPE_TASK_QUEUE_URL, DEDUPE_TASK_QUEUE_NAME)

def handle_message(msg):
    """Scrape the article a task points at and forward it for deduplication.

    Args:
        msg: The task. Anything that is not a ``dict`` (including ``None``)
            is rejected with a printed notice. A valid task must carry a
            ``'url'`` key.

    Returns:
        None. On success the task dict is updated in place with the
        article body under ``'text'`` and sent through
        ``dedupe_task_mq_client``.
    """
    # None is not a dict, so this single check covers both invalid cases.
    if not isinstance(msg, dict):
        print('message is invalid')
        return

    # Download and parse the page, then attach its extracted text.
    page = Article(msg['url'])
    page.download()
    page.parse()
    msg['text'] = page.text

    dedupe_task_mq_client.sendMessage(msg)

# Worker loop: pull a scrape task, process it, pause, repeat.
while True:
    if scrape_task_mq_client is None:
        # Without a client there is nothing to fetch or sleep on.
        continue

    msg = scrape_task_mq_client.getMessage()
    if msg is not None:
        try:
            handle_message(msg)
        except Exception as err:
            # Report the failure and keep the worker running.
            print(err)
    scrape_task_mq_client.sleep(SLEEP_IN_SECONDS)
# Sleep interval read from the 'operations' config section.
# NOTE(review): the polling loop visible in this file does not currently
# call sleep() with this value -- confirm whether it is still needed.
SLEEP_TIME_IN_SECONDS = config['operations']['SLEEP_TIME_IN_SECONDS']

# Queue client that the polling loop reads messages from.
graphitelog_cloudAMQP_client = CloudAMQPClient(LOG_GRAPHITE_TASK_QUEUE_URL,
                                               LOG_GRAPHITE_TASK_QUEUE_NAME)


def handle_message(msg):
    """Bump the statsd counter named by ``msg`` by one.

    Args:
        msg: Value passed to ``statsd.Counter`` as the counter name.
            ``None`` is logged as a broken message and ignored.

    Returns:
        None.
    """
    if msg is None:
        log_client.logger.info('message is broken')
        return

    # NOTE(review): presumably the statsd Counter sends the increment when
    # added to -- confirm against the statsd client library in use.
    hits = statsd.Counter(msg)
    hits += 1


# Worker loop: fetch a log message from the queue and hand it to
# handle_message(); failures are logged and the loop carries on.
while True:
    if graphitelog_cloudAMQP_client is None:
        # Without a client there is nothing to fetch.
        continue

    msg = graphitelog_cloudAMQP_client.getMessage()
    if msg is not None:
        try:
            handle_message(msg)
        except Exception as err:
            log_client.logger.error(str(err))
    # NOTE(review): there is no pause between polls -- a
    # graphitelog_cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS) call was
    # commented out here. Confirm the tight polling is intentional.