def clearQueue(queue_url, queue_name):
    """Pop messages off a queue until none are left, then print the count.

    A ``CloudAMQPClient`` is created for ``queue_url`` / ``queue_name`` and
    ``receive_message()`` is called repeatedly.  Every non-``None`` result is
    counted and discarded.  The first ``None`` result (presumably meaning the
    queue is empty -- confirm against the client implementation) ends the
    loop.

    Args:
        queue_url: Connection URL handed to ``CloudAMQPClient``.
        queue_name: Name of the queue handed to ``CloudAMQPClient``.

    Returns:
        None.  The number of popped messages is only printed.
    """
    client = CloudAMQPClient(queue_url, queue_name)

    popped = 0
    # Keep pulling until the client reports that nothing was received.
    while client.receive_message() is not None:
        popped += 1

    # A parenthesised single-argument print produces the same output as the
    # print statement on Python 2 and also parses on Python 3.
    print("%s messages have beed popped up" % popped)
def test_basic():
    """Receive one message from the dedup queue and compare it to a fixture.

    Connects with ``DEDUP_CLOUDAMQP_URL`` / ``DEDUP_QUEUE_NAME``, receives a
    single message, prints it, and asserts that it equals
    ``{'test': 'test'}``.

    NOTE(review): the send step is commented out, so this check only passes
    when the queue already holds that exact payload at its head.  Confirm
    whether the send was disabled on purpose.

    Raises:
        AssertionError: If the received message differs from the fixture.
    """
    expected = {'test': 'test'}
    mq_client = CloudAMQPClient(DEDUP_CLOUDAMQP_URL, DEDUP_QUEUE_NAME)

    # Disabled send step, kept for reference:
    # try:
    #     mq_client.send_message(expected)
    # except Exception as e:
    #     print "send message wrong"

    received = mq_client.receive_message()
    # Parenthesised single-argument print: same output as the print statement
    # on Python 2, and also parses on Python 3.
    print(received)

    assert expected == received
    print("test_basic passed")
# Replace XPATH based scraper as newspaper package (which is suitable for multiple website) # if news['source'] != 'cnn': # print "News Source is not CNN, cannot handle!" # else: # print 'scrape cnn news' # text = CNN.extract_news_text(news['url']) # scraper news via newspaper API article = Article(news['url']) article.download() # == request.get article.parse() text = article.text.encode('utf-8') news_task['text'] = text DEDEUPER_MQ_CLIENT.send_message(news_task) print "[x] Sent msg to %s : %s" % (DEDUP_QUEUE_NAME, text) print news['url'] while True: if NEWS_TASK_MQ_CLIENT is not None: news = NEWS_TASK_MQ_CLIENT.receive_message() if news is not None: try: newsHanlder(news) except Exception as e: print "newsHanlder wrong"#coding=utf-8 pass NEWS_TASK_MQ_CLIENT.sleep(SLEEP_SECONDS)
0, text) # add current news into the 1st item in the list print "~~~~~~~~~~~~~~ text -> documents success" #calculate the tfidf values tfidf = TfidfVectorizer().fit_transform(documents) pairwise_sim = tfidf * tfidf.T print pairwise_sim.A rows, cols = pairwise_sim.shape for row in range(1, rows): if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD: print 'Warning~~~~~~ duplicated news' return msg_task['publishedAt'] = parser.parse(msg_task['publishedAt']) db[NEWS_TABLE_NAME].replace_one({'digest': msg_task['digest']}, msg_task, upsert=True) while True: if DEDEUPER_MQ_CLIENT is not None: msg = DEDEUPER_MQ_CLIENT.receive_message() if msg is not None: try: msgHandler(msg) except Exception as e: print "msgHanlder wrong" #coding=utf-8 pass DEDEUPER_MQ_CLIENT.sleep(SLEEP_SECONDS)