コード例 #1
0
    def start(self):
        """Consume the processed-resource queue: store, index and announce items.

        Each message taken from ``self._processed_resource_queue`` is decoded
        into a ``Resource``, given an ``_id`` derived from its URI and attached
        to the story returned by the story provider. The resource is then
        inserted into the resource collection and indexed, a ``StoryUpdate``
        record is stored, and the story update client is notified. If a
        ``DuplicateKeyError`` is raised during that step, a warning is logged
        instead. In both cases the queue message is deleted afterwards.

        On an empty queue the loop sleeps ``EMPTY_QUEUE_TIMEOUT`` seconds and
        polls again. The loop has no exit condition; it only stops when an
        unhandled exception propagates.
        """
        while True:
            # Fetch and decode the next processed resource, or back off.
            try:
                msg = self._mq_client.get_message(self._processed_resource_queue)
                doc = self._mq_codec.decode(msg.body, Resource)
            except EmptyQueueException:
                self._logger.info('Empty queue. Sleeping...')
                time.sleep(self.EMPTY_QUEUE_TIMEOUT)
                continue

            # Identify the resource and link it to its story.
            doc._id = makeIdFromURI(doc.uri)
            owner = self._story_provider.provide_for_resource(doc)
            doc.story_id = owner._id

            try:
                # Persist first, then make the resource searchable.
                self._resource_collection.insert_model(doc)
                self._index_helper.index_resource(doc, self._index)
                self._logger.debug('Resource %s: Inserted and indexed.' % doc._id)

                # Record the insertion as a pending story update.
                pending = StoryUpdate()
                pending.story_id = owner._id
                pending.type = STORY_UPDATE_RESOURCE_INSERTED
                pending.value = doc._id
                self._update_collection.insert_model(pending)

                self._su_client.update(owner._id)

            except DuplicateKeyError:
                self._logger.warning('Resource %s: Already inserted.' % doc._id)

            # The message is acknowledged whether or not it was a duplicate.
            self._mq_client.delete_message(self._processed_resource_queue, msg.id)
コード例 #2
0
    def _enqueue(self, resource):
        """Mark a resource's URI as discovered and queue the resource.

        The URI is inserted into ``feed_reading_discovery.discovered_uri``
        (described by the original author as a capped collection), keyed by
        ``makeIdFromURI(resource.uri)``. If that insert raises
        ``DuplicateKeyError`` the URI is already known and nothing is queued.
        Otherwise the resource is encoded and put on the
        ``'discovered_resources'`` queue, unless its content is longer than the
        configured ``max_content_length``, in which case it is only logged as
        skipped. Finally the resource is removed from
        ``self._resources_collection``.

        Fixes relative to the previous version:
          * an over-long resource was logged as "Skipped" but still enqueued;
          * the Mongo connection and the message-queue connection opened on
            every call were never released.

        :param resource: object exposing ``uri``, ``content`` and ``_id``.
        """
        # settings (read before opening any connection so a bad config
        # cannot leave a connection behind)
        max_content_length = config['discovery']['resource_enqueueing']['max_content_length']
        mq_settings = config['message_queue']['client']
        queue = 'discovered_resources'
        mq_codec = JSONCodec()

        # Discovered URI - capped collection --db.discovered_uri
        mongo_connection = Connection()
        try:
            db = mongo_connection.feed_reading_discovery

            # message queue
            mq_client = message_queue_client_from_config(mq_settings)
            mq_client.connect()
            try:
                try:
                    # insert into capped
                    db.discovered_uri.insert(
                        {'_id': makeIdFromURI(resource.uri), 'uri': resource.uri})

                    # insert into queue, but only if the content is small enough
                    content_length = len(resource.content)
                    if content_length > max_content_length:
                        self._logger.warning('Skipped %s: Content length is %s.' %
                            (resource, content_length))
                    else:
                        msg_body = mq_codec.encode(resource)
                        mq_client.put_message(queue, msg_body)
                        self._logger.debug("Enqueued: %s" % resource._id)
                except DuplicateKeyError:
                    # URI was already recorded as discovered: deliberately
                    # do not enqueue it a second time.
                    pass
            finally:
                mq_client.disconnect()
        finally:
            mongo_connection.disconnect()

        # remove resource from collection
        self._resources_collection.remove_model(resource)
コード例 #3
0
    def discovered_uri_process(self, entry):
        """Build a ``DiscoveredURI`` record for a feed entry.

        The record's ``_id`` is ``makeIdFromURI(entry.link)`` and its ``uri``
        is ``entry.link`` itself.
        """
        link_id = makeIdFromURI(entry.link)
        return DiscoveredURI(_id=link_id, uri=entry.link)
コード例 #4
0
    def resources_process(self, source, entry, config):
        """Build a ``DiscoveredResource`` of type ``'article'`` from a feed entry.

        Side effects: stores ``config`` on ``self._config``, applies
        ``config['logging']`` through ``logging.config.dictConfig`` and sets
        ``self._logger`` to the root logger.

        Field mapping:
          * ``_id`` / ``uri``    -- derived from ``entry.link``
          * ``discovered``       -- current time as integer epoch seconds
          * ``title``            -- ``entry.title``
          * ``published``        -- ``calendar.timegm`` of ``updated_parsed``,
                                    else of ``date_parsed``, else ``None``
          * ``author``           -- only set when the entry has one
          * ``feed_content``     -- the longer of ``entry.summary`` and the
                                    value of the first ``entry.content`` item
          * ``content``          -- always ``''``
          * ``content_extracted``-- always ``1``

        Compared with the previous version, a parsed-date field that is
        present but ``None`` no longer crashes ``calendar.timegm`` (the next
        candidate is tried instead), and an empty ``entry.content`` list no
        longer raises ``IndexError``.

        :param source: value stored unchanged in the ``source`` field.
        :param entry: feed entry supporting ``in`` tests and attribute access.
        :param config: mapping with at least a ``'logging'`` dictConfig section.
        :return: the constructed ``DiscoveredResource``.
        """
        self._config = config

        # configure logging
        logging.config.dictConfig(config['logging'])
        self._logger = logging.getLogger()

        # Resource fields
        kwargs = {}

        kwargs['_id'] = makeIdFromURI(entry.link)
        kwargs['uri'] = entry.link
        kwargs['type'] = 'article'
        kwargs['discovered'] = int(time.time())
        kwargs['title'] = entry.title

        # Publication time: first candidate that is present AND actually
        # parsed. A key can be present with a None value when the feed's date
        # string could not be parsed.
        kwargs['published'] = None
        for date_key in ('updated_parsed', 'date_parsed'):
            if date_key in entry:
                parsed = getattr(entry, date_key)
                if parsed is not None:
                    kwargs['published'] = calendar.timegm(parsed)
                    break

        if 'author' in entry:
            kwargs['author'] = entry.author

        # content: keep whichever of summary / first content item is longer
        content = ''
        if 'summary' in entry:
            content = entry.summary
        if 'content' in entry and entry.content:
            candidate = entry.content[0].value
            if len(candidate) > len(content):
                content = candidate
        kwargs['feed_content'] = content
        kwargs['content'] = ''
        kwargs['source'] = source
        kwargs['content_extracted'] = 1

        return DiscoveredResource(**kwargs)
コード例 #5
0
        }
    mq_client = message_queue_client_from_config(mq_config)
    mq_codec = JSONCodec()
    processed_resource_queue = 'processed_resources'

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    es_index = 'topic_tracking'

    # dequeue one resource
    mq_client.connect()
    message = mq_client.get_message(processed_resource_queue)
    resource = mq_codec.decode(message.body, Resource)
    mq_client.delete_message(processed_resource_queue, message.id)
    mq_client.disconnect()

    # save the resource to mongo
    resource._id = makeIdFromURI(resource.uri)
    resource_collection.insert_model(resource)

    # index the resource
    for boost in [1, 1000]:
        es_doc = {}
        es_doc['content'] = resource.content
        es_doc['title'] = resource.title
        es_doc['entities'] = build_payload_string(resource.entities, boost)
        es_doc['terms'] = build_payload_string(resource.terms, boost)
        id = '%s_%d' % (resource._id, boost)
        r = es.index(es_doc, es_index, 'resource', id)
        pprint(r)