Example #1
0
def queue_start(request_id):
    """Start the queue-manager container for *request_id* and announce it.

    Launches the QUEUE_MANAGER docker container, then publishes a
    REQUEST:QUEUED event on the request's broadcast queue so listeners
    know the request has been accepted.

    Returns a JSON response ``{'container': <container id>}`` on success;
    aborts with HTTP 500 on any failure.
    """
    try:
        container = UnpackHelpers.start_docker_container(
            container_name=UnpackHelpers.
            DOCKER_CONTAINER_NAMES['QUEUE_MANAGER'],
            request_id=request_id,
        )

        connection = pika.BlockingConnection(
            pika.ConnectionParameters(os.environ['UNPACK_HOST']))
        try:
            channel = connection.channel()
            broadcaster_queue_name = UnpackHelpers.get_queue_name(
                queue_type='broadcast', request_id=request_id)
            channel.queue_declare(queue=broadcaster_queue_name)
            channel.basic_publish(
                exchange='',
                routing_key=broadcaster_queue_name,
                body=json.dumps(
                    {
                        'event_name': UnpackHelpers.EVENT_NAME['REQUEST:QUEUED'],
                        'data': {},
                    },
                    default=str),
                properties=pika.BasicProperties(
                    delivery_mode=2,  # make message persistent
                ))
        finally:
            # Always release the AMQP connection, even if publishing fails.
            connection.close()

        return jsonify({'container': container.id})
    except Exception:
        logger.exception(f'Error starting queue: {request_id}')
        abort(500)
Example #2
0
def main():
    """Warm the SQL connection pool and start the Socket.IO web server."""
    UnpackHelpers.get_sql_pool()
    is_debug = os.environ['UNPACK_DEV_ENV'] == 'TRUE'
    # Port must be an int: a string '5000' fails at socket.bind() time.
    socketio.run(app,
                 host='0.0.0.0',
                 port=5000,
                 debug=is_debug,
                 use_reloader=False)
Example #3
0
    def get_node_and_links_from_db(cls, node_uuid, min_update_date=None):
        """Load a node row and, when the node exists, its outbound links.

        Returns a ``(node_details, links)`` tuple; ``links`` is an empty
        list when the node is missing (or too stale per *min_update_date*).
        """
        node_details = UnpackHelpers.fetch_node(
            node_uuid, min_update_date=min_update_date)

        if node_details is None:
            return node_details, []

        return node_details, UnpackHelpers.fetch_links_by_source(node_uuid)
Example #4
0
    def __init__(self, ch, method, properties, body):
        """Decode a broadcast message and relay it to the request's socket room."""
        request_id = UnpackHelpers.get_request_id_from_name(method.routing_key)
        event_keys = UnpackHelpers.get_queue_event_keys(request_id)

        payload = json.loads(body)

        # Map the internal event name to its public socket event key and emit
        # into the per-request namespace.
        socketio.emit(
            event_keys[payload['event_name']],
            payload['data'],
            namespace=f'/{request_id}',
        )
Example #5
0
def queue_stop(request_id):
    """Tear down the fetch and broadcast queues for *request_id*.

    Deleting the queues causes their consumers to stop. Returns a JSON
    ``{'success': True}`` response; aborts with HTTP 500 on failure.
    """
    try:
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(os.environ['UNPACK_HOST']))
        try:
            channel = connection.channel()
            fetcher_queue_name = UnpackHelpers.get_queue_name(
                queue_type='fetch', request_id=request_id)
            broadcaster_queue_name = UnpackHelpers.get_queue_name(
                queue_type='broadcast', request_id=request_id)
            channel.queue_delete(queue=fetcher_queue_name)
            channel.queue_delete(queue=broadcaster_queue_name)
        finally:
            # Always release the AMQP connection, even if a delete fails.
            connection.close()

        return jsonify({'success': True})
    except Exception:
        logger.exception(f'Error stopping queue: {request_id}')
        abort(500)
Example #6
0
    def process_links(self, raw_links, type_cls):
        """Store and enqueue each outbound link, honoring depth limits.

        At max depth, non-twitter node types stop entirely, twitter stops
        only when its rule opts in, and 'media'/'link' links are skipped.
        Links flagged nofollow are queued but not stored.
        """
        at_max_depth = (self.state['level'] + 1) > self.rules['max_link_depth']

        if at_max_depth:
            if type_cls.TYPE != 'twitter':
                return
            if self.rules['twitter_use_max_link_depth']:
                return

        for link in raw_links:
            if at_max_depth and link.get('link_type') in ('media', 'link'):
                continue

            target_url = link.get('target_node_url')
            target_uuid = link.get('target_node_uuid')

            # Backfill whichever identifier is missing from the other one.
            if target_url and not target_uuid:
                target_uuid = UnpackHelpers.fetch_node_uuid(target_url)
            if target_uuid and not target_url:
                target_url = UnpackHelpers.fetch_node_url(target_uuid)

            # Still incomplete after backfill: nothing to link to.
            if not target_uuid or not target_url:
                continue

            if not link.get('nofollow', False):
                self.store_link(
                    source_node_url=self.node_url,
                    source_node_uuid=self.node_uuid,
                    target_node_url=target_url,
                    target_node_uuid=target_uuid,
                    raw_link=link,
                )

            self.queue_next_node(
                source_node_url=self.node_url,
                source_node_uuid=self.node_uuid,
                target_node_url=target_url,
                target_node_uuid=target_uuid,
                raw_link=link,
            )
Example #7
0
 def publish_child(self, **kwargs):
     """Queue a follow-up fetch job for this request on the fetch queue."""
     routing_key = UnpackHelpers.get_queue_name('fetch', self.request_id)
     persistent = pika.BasicProperties(delivery_mode=2)  # survive broker restart
     self.channel.basic_publish(
         exchange='',
         routing_key=routing_key,
         body=json.dumps(kwargs),
         properties=persistent,
     )
Example #8
0
    def store_link(self, source_node_url, source_node_uuid, target_node_url, target_node_uuid, raw_link):
        """Persist one source→target link, then broadcast LINK:COMPLETED."""
        link_type = raw_link.get('link_type')
        weight = raw_link.get('weight')

        UnpackHelpers.store_link(
            source_node_uuid,
            target_node_uuid=target_node_uuid,
            link_type=link_type,
            weight=weight,
        )

        # Tell listeners the link exists, tagged with the depth it was found at.
        self.publish_broadcast(
            event_name=UnpackHelpers.EVENT_NAME['LINK:COMPLETED'],
            source_node_url=source_node_url,
            source_node_uuid=source_node_uuid,
            target_node_url=target_node_url,
            target_node_uuid=target_node_uuid,
            level=self.state['level'] + 1,
            link_type=link_type,
            weight=weight,
        )
Example #9
0
    def walk_node_tree(self):
        """Fetch this node's details, persist/broadcast them, then recurse into links.

        No-ops when the node has no URL. Stores the node only when the
        fetch did not come from the DB, always broadcasts NODE:COMPLETED,
        and stops before link processing for nofollow nodes. A node with
        no links gets a bare store_link call (presumably a terminator row
        — confirm against UnpackHelpers.store_link).
        """
        if self.node_url is None:
            return

        type_cls, node_url_match = Fetcher.get_node_type_class_by_url(self.node_url)
        node_details, raw_links = type_cls.fetch(
            self.node_uuid,
            self.node_url,
            url_matches=node_url_match,
            rules=self.rules
        )

        node_details['was_no_follow'] = self.nofollow_link

        has_links = len(raw_links) > 0

        # Defaults to True: if the fetcher doesn't say, assume DB-backed and skip the write.
        if not node_details.get('is_from_db', True):
            UnpackHelpers.store_node(
                self.node_uuid,
                node_type=node_details.get('node_type'),
                data=node_details.get('data'),
                is_error=node_details.get('is_error'),
            )

        # Broadcast happens regardless of whether the node was (re)stored.
        self.publish_broadcast(
            event_name=UnpackHelpers.EVENT_NAME['NODE:COMPLETED'],
            node_uuid=self.node_uuid,
            node_url=self.node_url,
            node_details=node_details,
        )

        if self.nofollow_link:
            return

        if not has_links:
            UnpackHelpers.store_link(self.node_uuid)
            return

        self.process_links(
            raw_links=raw_links,
            type_cls=type_cls,
        )
Example #10
0
 def publish_broadcast(self, event_name, **kwargs):
     """Publish an event envelope onto this request's broadcast queue.

     Extra keyword arguments become the event's ``data`` payload;
     ``default=str`` stringifies anything json can't serialize natively.
     """
     envelope = json.dumps({
         'event_name': event_name,
         'data': kwargs,
     }, default=str)
     self.channel.basic_publish(
         exchange='',
         routing_key=UnpackHelpers.get_queue_name('broadcast', self.request_id),
         body=envelope,
         properties=pika.BasicProperties(delivery_mode=2),  # persistent message
     )
Example #11
0
    def __init__(self, ch, method, properties, body):
        """Handle one fetch-queue message: hydrate state, walk the node, ack.

        The message body is JSON; either node_uuid or node_url may be
        missing and is backfilled from the other. The message is acked
        only after walk_node_tree returns, so a crash mid-walk leaves it
        unacked for redelivery.
        """
        self.channel = ch
        self.request_id = UnpackHelpers.get_request_id_from_name(method.routing_key)

        body = json.loads(body)

        # Backfill whichever node identifier the message omitted.
        if body.get('node_uuid') is None:
            body['node_uuid'] = UnpackHelpers.fetch_node_uuid(body.get('node_url'))

        if body.get('node_url') is None:
            body['node_url'] = UnpackHelpers.fetch_node_url(body.get('node_uuid'))

        # Message values override class defaults.
        state = body.get('state', {})
        self.state = {**self.DEFAULT_STATE, **state}

        # 'rules' may be present but explicitly null — normalize before merging.
        rules = body.get('rules', {})
        rules = {} if rules is None else rules
        self.rules = {**self.DEFAULT_RULES, **rules}

        self.node_url = body['node_url']
        self.node_uuid = body['node_uuid']
        self.node_url_hash = UnpackHelpers.get_url_hash(body['node_url'])

        # An absent source_node_uuid marks this as the crawl's origin node.
        self.source_node_uuid = body.get('source_node_uuid')
        self.origin_source_node_url = body.get('origin_source_node_url', self.node_url)
        self.origin_source_uuid = UnpackHelpers.fetch_node_uuid(self.origin_source_node_url)
        self.is_origin_node = self.source_node_uuid is None

        self.nofollow_link = body.get('nofollow_link', False)

        # Announce work has started before the (potentially slow) walk.
        self.publish_broadcast(
            event_name=UnpackHelpers.EVENT_NAME['NODE:IN_PROGRESS'],
            node_uuid=self.node_uuid,
            node_url=self.node_url,
        )

        self.walk_node_tree()
        ch.basic_ack(delivery_tag=method.delivery_tag)
Example #12
0
def queue_create():
    """Create a fetch queue for the requested URL and seed the first job.

    Reads ``url`` and ``rules`` from the request JSON, resolves/creates the
    node UUID and a request id, declares the request's fetch queue, and
    publishes the initial fetch message. Returns the identifiers and event
    keys the client needs to subscribe; aborts with HTTP 500 on failure.
    """
    try:
        node_url = request.json.get('url')
        rules = request.json.get('rules')

        node_uuid = UnpackHelpers.fetch_node_uuid(node_url)
        request_id = UnpackHelpers.get_request_id(node_uuid=node_uuid)
        event_keys = UnpackHelpers.get_queue_event_keys(request_id)

        connection = pika.BlockingConnection(
            pika.ConnectionParameters(os.environ['UNPACK_HOST']))
        try:
            channel = connection.channel()
            fetcher_queue_name = UnpackHelpers.get_queue_name(
                queue_type='fetch', request_id=request_id)
            channel.queue_declare(queue=fetcher_queue_name)
            channel.basic_publish(
                exchange='',
                routing_key=fetcher_queue_name,
                body=json.dumps({
                    'node_url': node_url,
                    'rules': rules,
                }),
                properties=pika.BasicProperties(
                    delivery_mode=2,  # make message persistent
                ))
        finally:
            # Always release the AMQP connection, even if publishing fails.
            connection.close()

        return jsonify({
            'node_url': node_url,
            'node_uuid': node_uuid,
            'request_id': request_id,
            'event_keys': event_keys,
        })
    except Exception:
        logger.exception('Error GET queue_create')
        abort(500)
Example #13
0
def main(request_id):
    """Consume the request's broadcast queue until interrupted or deleted.

    Blocks in start_consuming; messages are auto-acked and dispatched to
    handle_message_callback. Ctrl-C stops consuming and closes the
    connection; a broker-side close (e.g. the queue being deleted) is
    treated as a normal shutdown.
    """
    queue_name = UnpackHelpers.get_queue_name(queue_type='broadcast',
                                              request_id=request_id)
    # Heartbeat/timeout keep the broker from dropping us during long idle spells.
    connection_params = pika.ConnectionParameters(
        os.environ['UNPACK_HOST'],
        heartbeat=4 * 60,
        blocked_connection_timeout=2 * 60)
    connection = pika.BlockingConnection(connection_params)
    channel = connection.channel()

    channel.basic_consume(queue=queue_name,
                          on_message_callback=handle_message_callback,
                          auto_ack=True)

    try:
        logger.info(
            ' [*] Waiting for Broadcaster messages. To exit press CTRL+C or delete the queue'
        )
        channel.start_consuming()
    except KeyboardInterrupt:
        channel.stop_consuming()
        connection.close()
    except pika.exceptions.ConnectionClosedByBroker:
        # Broker closed the connection (queue deleted) — nothing left to clean up.
        pass
Example #14
0
def main(request_id):
    """Orchestrate worker containers for one request until its queues drain.

    Declares the fetch/broadcast queues, announces REQUEST:IN_PROGRESS,
    starts one broadcast worker and ten fetcher workers, then polls the
    queues (via tick) until they have been empty for queue_ttl seconds,
    at which point handleEmptyQueue tears everything down.
    """
    empty_since = None        # timestamp of when the queues were first seen empty
    queue_ttl = 1 * 30        # seconds of sustained emptiness before teardown
    check_queue_rate = 5      # polling cadence passed through to tick()
    containers = []

    connection = pika.BlockingConnection(
        pika.ConnectionParameters(os.environ['UNPACK_HOST'])
    )

    channel = connection.channel()

    fetcher_queue_name = UnpackHelpers.get_queue_name(
        queue_type='fetch',
        request_id=request_id
    )

    broadcaster_queue_name = UnpackHelpers.get_queue_name(
        queue_type='broadcast',
        request_id=request_id
    )

    fetcher_q = channel.queue_declare(queue=fetcher_queue_name)
    broadcaster_q = channel.queue_declare(queue=broadcaster_queue_name)

    publish_broadcast(
        channel=channel,
        queue_name=broadcaster_queue_name,
        event_name=UnpackHelpers.EVENT_NAME['REQUEST:IN_PROGRESS']
    )

    logger.info(f'Creating queues for id: {request_id}')

    # Workers to create
    broadcast_container = UnpackHelpers.start_docker_container(
        container_name=UnpackHelpers.DOCKER_CONTAINER_NAMES['QUEUE_BROADCAST_WORKER'],
        request_id=request_id,
    )
    containers.append(broadcast_container)

    # Ten fetcher workers consume the fetch queue in parallel.
    for i in range(10):
        fetcher_container = UnpackHelpers.start_docker_container(
            container_name=UnpackHelpers.DOCKER_CONTAINER_NAMES['QUEUE_FETCHER_WORKER'],
            request_id=request_id,
        )
        containers.append(fetcher_container)

    logger.info(f'Created containers {containers} for id {request_id}')

    # As long as queue has data in it and the TTL has not been reached
    while True:
        if empty_since is not None and (time.time() - empty_since) >= queue_ttl:
            handleEmptyQueue(
                channel=channel,
                request_id=request_id,
                containers=containers,
                fetcher_queue_name=fetcher_queue_name,
                broadcaster_queue_name=broadcaster_queue_name,
            )
            break

        # tick() returns the (possibly updated) first-empty timestamp,
        # or presumably None while the queues still hold messages — confirm in tick.
        empty_since = tick(
            channel=channel,
            fetcher_queue_name=fetcher_queue_name,
            broadcaster_queue_name=broadcaster_queue_name,
            empty_since=empty_since,
            check_queue_rate=check_queue_rate
        )