Ejemplo n.º 1
0
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        rows, cols = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print('duplicate news.Ignore')
                return

        msg['publishedAt'] = published_at
        db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']},
                                        msg,
                                        upsert=True)
    else:
        msg['publishedAt'] = published_at
        db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']},
                                        msg,
                                        upsert=True)


# Consumer loop: poll the queue forever and pass each received message to the
# handler.
while True:
    if not cloudAMQP_client:
        # No client object: go straight to the next iteration (this path
        # does not sleep).
        continue

    msg = cloudAMQP_client.receiveMessage()
    if msg:
        try:
            # NOTE(review): the handler is spelled `handle_mesage` here; its
            # definition is not visible in this chunk -- confirm the names match.
            handle_mesage(msg)
        except Exception as e:
            # Print the error and keep polling, so one failing message does
            # not terminate the loop.
            print(e)

    # Wait before the next poll.
    cloudAMQP_client.sleep(SLEEP_TIME_IN_SECOND)
Ejemplo n.º 2
0
def handle_message(msg):
    """Download the article a message points to and forward it with its text.

    The page at ``msg['url']`` is downloaded and parsed with ``Article``; the
    extracted text is stored under ``msg['text']`` (the dict is modified in
    place) and the message is then sent through ``fecth_news_queue_client``.

    Args:
        msg: Message dict; it must contain a ``'url'`` entry.

    Returns:
        None. A falsy or non-dict ``msg`` is reported on stdout and dropped
        without being forwarded.

    Raises:
        KeyError: If ``msg`` has no ``'url'`` entry.
        Exception: Errors raised by the article download/parse or by
            ``sendMessage`` are not caught here and propagate to the caller.
    """
    if not msg or not isinstance(msg, dict):
        print('msg in broken')
        return

    # Fetch and parse the article referenced by the message.
    article = Article(msg['url'])
    article.download()
    article.parse()
    msg['text'] = article.text

    # Hand the enriched message on to the next queue.
    fecth_news_queue_client.sendMessage(msg)


# Scraper loop: poll the scrape-task queue forever and process every message
# that arrives.
while True:
    if not scraper_news_queue_client:
        # No client object: go straight to the next iteration (this path
        # does not sleep).
        continue

    msg = scraper_news_queue_client.receiveMessage()
    if msg:
        try:
            handle_message(msg)
        except Exception as e:
            # Print the error and keep polling, so one failing message does
            # not terminate the loop.
            print(e)

    # The wait between polls is issued on the outgoing queue client.
    fecth_news_queue_client.sleep(SLEEP_TIME_IN_SECOND)
Ejemplo n.º 3
0
]
# Redis connection settings and the client built from them.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)

# AMQP queue settings and the client built from them.
# NOTE(review): the queue URL carries credentials inline (masked in this
# copy); consider loading it from configuration or the environment rather
# than keeping it in source.
QUEUE_URL = "amqp://*****:*****@termite.rmq.cloudamqp.com/svowqrcq"
QUEUE_NAME = "news-test"
cloudAMQP_client = CloudAMQPClient(QUEUE_URL, QUEUE_NAME)

# Monitoring loop: fetch the latest news, forward every item not seen before
# to the queue, report how many were forwarded, then sleep and repeat.
while True:
    news_list = news_api_client.getNews(NEWS_SOURCES)
    number_of_news = 0
    for news in news_list:
        # The MD5 hex digest of the title is the de-duplication key.
        news_digest = hashlib.md5(news['title'].encode('utf-8')).hexdigest()
        # SET with nx=True and ex=... claims the digest and attaches its
        # expiry in a single Redis command; the call returns a falsy value
        # when the key already exists. This replaces the former
        # get -> set -> expire sequence, which could leave a key without a
        # TTL if interrupted between set and expire, and let two producers
        # both pass the existence check.
        if redis_client.set(news_digest, 'hh',
                            ex=NEWS_TIME_OUT_IN_SECONDS, nx=True):
            number_of_news = number_of_news + 1
            news['digest'] = news_digest
            # Send the newly seen item to the queue.
            cloudAMQP_client.sendMessage(news)
    print('%s number of news' % number_of_news)
    # Wait before fetching again.
    cloudAMQP_client.sleep(SLEEP_TIME_TASK_SECONDS)