# --- Example 1 ---
class NewsFetcher():
    def __init__(self):
        with open(CONFIG_FILE, 'r') as f:
            data = json.load(f)
            self.dedupe_news_task_queue_url = data['queue'][
                'dedupeNewsTaskQueueUrl']
            self.dedupe_news_task_queue_name = data['queue'][
                'dedupeNewsTaskQueueName']
            self.scrape_news_task_queue_url = data['queue'][
                'scrapeNewsTaskQueueUrl']
            self.scrape_news_task_queue_name = data['queue'][
                'scrapeNewsTaskQueueName']
            self.sleep_time_in_seconds = int(
                data['queue']['fetchNewsTaskSleepTime'])

    def handle_message(self, msg):
        if msg is None or not isinstance(msg, dict):
            print "message is broken"
            return

        task = msg
        article = Article(task['url'])
        article.download()
        article.parse()
        #      print article.text

        task['text'] = article.text
        self.dedupe_news_queue_client.sendMessage(task)

    def __call__(self):
        self.dedupe_news_queue_client = CloudAMQPClient(
            self.dedupe_news_task_queue_url, self.dedupe_news_task_queue_name)
        self.scrape_news_queue_client = CloudAMQPClient(
            self.scrape_news_task_queue_url, self.scrape_news_task_queue_name)

        #fetch msg from queue
        if self.scrape_news_queue_client is not None:
            while True:
                msg = self.scrape_news_queue_client.getMessage()
                if msg is not None:
                    #handle message
                    try:
                        self.handle_message(msg)
                    except Exception as e:
                        print e
                        pass
                    self.scrape_news_queue_client.sleep(
                        self.sleep_time_in_seconds)
                else:
                    self.scrape_news_queue_client.close()
                    self.dedupe_news_queue_client.close()
                    break
# --- Example 2 ---
class NewsMonitor:
    def __init__(self):
        with open(CONFIG_FILE, 'r') as f:
            data = json.load(f)
            self.scrape_news_task_queue_url = data['queue'][
                'scrapeNewsTaskQueueUrl']
            self.scrape_news_task_queue_name = data['queue'][
                'scrapeNewsTaskQueueName']
            self.redis_server_host = data['redis']['redisServerHost']
            self.redis_server_port = int(data['redis']['redisServerPort'])
            self.news_timeout_redis_in_seconds = int(
                data['redis']['newsMonitorExpireInSeconds'])
            self.news_sources = list(data['newsApi']['source'])

    def __call__(self):
        self.redis_client = redis.StrictRedis(self.redis_server_host,
                                              self.redis_server_port)
        self.cloudAMQP_client = CloudAMQPClient(
            self.scrape_news_task_queue_url, self.scrape_news_task_queue_name)
        news_list = news_api_client.getNewsFromSource(self.news_sources)
        print "call news monitor"
        num_of_new_news = 0
        num_of_old_news = 0

        for news in news_list:
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).digest().encode('base-64')

            if self.redis_client.get(news_digest) is None:
                num_of_new_news = num_of_new_news + 1
                news['digest'] = news_digest

                if news['publishedAt'] is None:
                    news['publishedAt'] = datetime.datetime.utcnow().strftime(
                        '%Y-%m-%dT%H:%M:%SZZ')

                self.redis_client.set(news_digest, news)
                self.redis_client.expire(news_digest,
                                         self.news_timeout_redis_in_seconds)

                self.cloudAMQP_client.sendMessage(news)
            else:
                num_of_old_news = num_of_old_news + 1

        print "Fetched %d new news. %d old news in redis" % (num_of_new_news,
                                                             num_of_old_news)
        self.cloudAMQP_client.close()
# --- Example 3 ---
class NewsDeduper:
    def __init__(self):
        with open(CONFIG_FILE, 'r') as f:
            data = json.load(f)
            self.dedupe_news_task_queue_url = data['queue']['dedupeNewsTaskQueueUrl']
            self.dedupe_news_task_queue_name = data['queue']['dedupeNewsTaskQueueName']
            self.sleep_time_in_seconds = int(data['queue']['dedupeNewsTaskSleepTime'])
            self.collection = data['mongoDb']['newsMongoDbCollection']
            self.sameNewsThreshold = float(data['newsDedupe']['sameNewsThreshold'])

    def handle_messages(self, msg):
        print "handle message from dedupe queue"
        if msg is None or not isinstance(msg, dict):
            print "message is broken"
            return False
        
        task = msg
        text = str(task['text'].encode('utf-8'))
        if text is None:
            return False

        #Get all recent news 
        published_at = parser.parse(task['publishedAt'])
        published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
        published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
        recent_news_list = list(self.db[self.collection].find({'publishedAt':{'$gte':published_at_day_begin, '$lt': published_at_day_end}}))
        print "get recent news list"
        if recent_news_list is not None and len(recent_news_list) > 0:
            documents = [str(news['text'].encode('ascii', 'ignore')) for news in recent_news_list]
            documents.insert(0, text)

            #caculate similarity matrix
            tfidf = TfidfVectorizer().fit_transform(documents)
            pairwise_sim = tfidf * tfidf.T
            print pairwise_sim.A
            rows, _ = pairwise_sim.shape

            for row in range(1, rows):
                if pairwise_sim[row, 0] > self.sameNewsThreshold:
                    # Duplicated news. Ignore.
                    print "Duplicated news. Ignore."
                    return False
        task['publishedAt'] = parser.parse(task['publishedAt'])
        title = task['title'].encode('ascii', 'ignore')
        source = task['source'].encode('ascii')
        url = task['url'].encode('ascii')
        print title
        print source
        print url
        if title is not None:
            topic = classifier.classify(source, url)
            if topic is not None:
                print "Get topic %s by url" % topic
            else:
                topic = news_topic_modeling_service_client.classify(title)
                print "Learn topic %s by ml" % topic
            task['class'] = topic

        self.db[self.collection].replace_one({'digest': task['digest']}, task, upsert=True)
        return True

    def __call__(self):      
        self.cloudAMQP_client = CloudAMQPClient(self.dedupe_news_task_queue_url, self.dedupe_news_task_queue_name)
        self.db = mongodb_client.get_db()
        num_unique_news = 0
        while True:
            if self.cloudAMQP_client is not None:
                msg = self.cloudAMQP_client.getMessage()
                if msg is not None:
                    # Parse and process the task
                    try:
                        if self.handle_messages(msg):
                            num_unique_news += 1
                        else:
                            print "invalid msg"
                    except Exception as e:
                        print e
                        pass
                    self.cloudAMQP_client.sleep(self.sleep_time_in_seconds)
                else:
                    print "Store %d unique news in mongoDb" % num_unique_news
                    self.cloudAMQP_client.close()
                    break