Ejemplo n.º 1
0
def clearQueue(queue_url, queue_name):
    """Receive messages from a queue until none is left and report the count.

    Args:
        queue_url: Connection URL passed to ``CloudAMQPClient``.
        queue_name: Name of the queue passed to ``CloudAMQPClient``.

    Returns:
        None. The number of messages received is printed once
        ``receive_message()`` returns ``None``.
    """
    mq_client = CloudAMQPClient(queue_url, queue_name)
    num_of_msg = 0

    # A None result from receive_message() is treated as "nothing left".
    while mq_client.receive_message() is not None:
        num_of_msg += 1

    # Parenthesised single-argument print behaves the same on Python 2 and
    # is also valid syntax on Python 3.
    print("%s messages have beed popped up" % num_of_msg)
Ejemplo n.º 2
0
def test_basic():
    """Check that the next message on the dedup queue equals the expected payload.

    NOTE(review): the send step below is disabled, so this check only passes
    when an identical message is already waiting on the queue - confirm that
    this is intended before relying on the result.
    """
    client = CloudAMQPClient(DEDUP_CLOUDAMQP_URL, DEDUP_QUEUE_NAME)

    sentMsg = {'test': 'test'}
    # Sending is currently disabled; re-enable to make the test self-contained:
    # try:
    #     client.send_message(sentMsg)
    # except Exception as e:
    #     print("send message wrong")
    receivedMSG = client.receive_message()

    # Parenthesised single-argument print behaves the same on Python 2 and
    # is also valid syntax on Python 3.
    print(receivedMSG)

    assert sentMsg == receivedMSG
    print("test_basic passed")
    # The XPath-based scraper (kept below for reference) was replaced by the newspaper package, which works across multiple websites.
    # if news['source'] != 'cnn':
    #     print "News Source is not CNN, cannot handle!"
    # else:
    #     print 'scrape cnn news'
    #     text = CNN.extract_news_text(news['url'])

    # Scrape the news article via the newspaper API
    article = Article(news['url'])
    article.download()  # == request.get
    article.parse() 

    text = article.text.encode('utf-8')
    news_task['text'] = text

    DEDEUPER_MQ_CLIENT.send_message(news_task)
    print "[x] Sent msg to %s : %s" % (DEDUP_QUEUE_NAME, text)
    print news['url']


# Worker loop: poll the news-task queue, pass each received message to
# newsHanlder, then call the client's sleep() for SLEEP_SECONDS before the
# next poll.
while True:
    if NEWS_TASK_MQ_CLIENT is not None:
        news = NEWS_TASK_MQ_CLIENT.receive_message()

        if news is not None:
            try:
                newsHanlder(news)
            except Exception as e:
                # Keep the worker running after a bad message, but say what
                # went wrong. %r is used so that exceptions carrying
                # non-ASCII text cannot raise while being formatted.
                print("newsHanlder wrong: %r" % (e,))
    # NOTE(review): this call sits outside the None guard above, so a None
    # client still fails here - confirm whether the guard is needed at all.
    NEWS_TASK_MQ_CLIENT.sleep(SLEEP_SECONDS)
            0, text)  # add current news into the 1st item in the list
        print "~~~~~~~~~~~~~~ text -> documents success"

        # Calculate the TF-IDF values
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim.A

        rows, cols = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print 'Warning~~~~~~ duplicated news'
                return

    msg_task['publishedAt'] = parser.parse(msg_task['publishedAt'])
    db[NEWS_TABLE_NAME].replace_one({'digest': msg_task['digest']},
                                    msg_task,
                                    upsert=True)


# Worker loop: poll the dedup queue, pass each received message to
# msgHandler, then call the client's sleep() for SLEEP_SECONDS before the
# next poll.
while True:
    if DEDEUPER_MQ_CLIENT is not None:
        msg = DEDEUPER_MQ_CLIENT.receive_message()

        if msg is not None:
            try:
                msgHandler(msg)
            except Exception as e:
                # Keep the worker running after a bad message, but say what
                # went wrong. %r is used so that exceptions carrying
                # non-ASCII text cannot raise while being formatted.
                print("msgHanlder wrong: %r" % (e,))
    # NOTE(review): this call sits outside the None guard above, so a None
    # client still fails here - confirm whether the guard is needed at all.
    DEDEUPER_MQ_CLIENT.sleep(SLEEP_SECONDS)