Ejemplo n.º 1
0
    def _create_data_stream(self, data):
        """
        Creates the data stream in case of object processing
        """
        extra_get_args = {}

        if 'url' in data:
            url = data['url']
            logger.info('Getting dataset from {}'.format(url.path))
            if url.data_byte_range is not None:
                range_str = 'bytes={}-{}'.format(*url.data_byte_range)
                extra_get_args['Range'] = range_str
                logger.info('Chunk: {} - Range: {}'.format(url.part, extra_get_args['Range']))
            resp = requests.get(url.path, headers=extra_get_args, stream=True)
            url.data_stream = resp.raw

        if 'obj' in data:
            obj = data['obj']
            obj.storage_backend
            storage_handler = Storage(self.pywren_config, obj.storage_backend).get_storage_handler()
            logger.info('Getting dataset from {}://{}/{}'.format(obj.storage_backend, obj.bucket, obj.key))
            if obj.data_byte_range is not None:
                extra_get_args['Range'] = 'bytes={}-{}'.format(*obj.data_byte_range)
                logger.info('Chunk: {} - Range: {}'.format(obj.part, extra_get_args['Range']))
                sb = storage_handler.get_object(obj.bucket, obj.key, stream=True, extra_get_args=extra_get_args)
                obj.data_stream = WrappedStreamingBodyPartition(sb, obj.chunk_size, obj.data_byte_range)
            else:
                obj.data_stream = storage_handler.get_object(obj.bucket, obj.key, stream=True)
Ejemplo n.º 2
0
    def _fill_optional_args(self, function, data):
        """
        Fills in those reserved, optional parameters that might be write to the function signature
        """
        func_sig = inspect.signature(function)

        if 'ibm_cos' in func_sig.parameters:
            if 'ibm_cos' in self.pywren_config:
                if self.internal_storage.backend == 'ibm_cos':
                    ibm_boto3_client = self.internal_storage.get_client()
                else:
                    ibm_boto3_client = Storage(self.pywren_config, 'ibm_cos').get_client()
                data['ibm_cos'] = ibm_boto3_client
            else:
                raise Exception('Cannot create the ibm_cos client: missing configuration')

        if 'storage' in func_sig.parameters:
            data['storage'] = self.internal_storage.get_client()

        if 'rabbitmq' in func_sig.parameters:
            if 'rabbitmq' in self.pywren_config:
                rabbit_amqp_url = self.pywren_config['rabbitmq'].get('amqp_url')
                params = pika.URLParameters(rabbit_amqp_url)
                connection = pika.BlockingConnection(params)
                data['rabbitmq'] = connection
            else:
                raise Exception('Cannot create the rabbitmq client: missing configuration')

        if 'id' in func_sig.parameters:
            data['id'] = int(self.call_id)
Ejemplo n.º 3
0
    def _load_object(self, data):
        """
        Loads the object in /tmp in case of object processing
        """
        extra_get_args = {}

        if 'url' in data:
            url = data['url']
            logger.info('Getting dataset from {}'.format(url.path))
            if url.data_byte_range is not None:
                range_str = 'bytes={}-{}'.format(*url.data_byte_range)
                extra_get_args['Range'] = range_str
                logger.info('Chunk: {} - Range: {}'.format(
                    url.part, extra_get_args['Range']))
            resp = requests.get(url.path, headers=extra_get_args, stream=True)
            url.data_stream = resp.raw

        if 'obj' in data:
            obj = data['obj']
            logger.info('Getting dataset from {}://{}/{}'.format(
                obj.backend, obj.bucket, obj.key))

            if obj.backend == self.internal_storage.backend:
                storage = self.internal_storage.storage
            else:
                storage = Storage(pywren_config=self.pywren_config,
                                  storage_backend=obj.backend)

            if obj.data_byte_range is not None:
                extra_get_args['Range'] = 'bytes={}-{}'.format(
                    *obj.data_byte_range)
                logger.info('Chunk: {} - Range: {}'.format(
                    obj.part, extra_get_args['Range']))
                sb = storage.get_object(obj.bucket,
                                        obj.key,
                                        stream=True,
                                        extra_get_args=extra_get_args)
                wsb = WrappedStreamingBodyPartition(sb, obj.chunk_size,
                                                    obj.data_byte_range)
                obj.data_stream = wsb
            else:
                sb = storage.get_object(obj.bucket,
                                        obj.key,
                                        stream=True,
                                        extra_get_args=extra_get_args)
                obj.data_stream = sb
Ejemplo n.º 4
0
    def _fill_optional_args(self, function, data):
        """
        Fills in those reserved, optional parameters that might be write to the function signature
        """
        func_sig = inspect.signature(function)

        if 'ibm_cos' in func_sig.parameters:
            if 'ibm_cos' in self.pywren_config:
                try:
                    ibm_boto3_client = Storage(self.storage_config,
                                               'ibm_cos').get_client()
                    data['ibm_cos'] = ibm_boto3_client
                except Exception as e:
                    logger.error('Cannot create the ibm_cos connection: {}',
                                 str(e))
                    data['ibm_cos'] = None
            else:
                logger.error(
                    'Cannot create the ibm_cos connection: Configuration not provided'
                )
                data['ibm_cos'] = None

        if 'internal_storage' in func_sig.parameters:
            data['internal_storage'] = self.internal_storage

        if 'rabbitmq' in func_sig.parameters:
            if 'rabbitmq' in self.pywren_config:
                try:
                    rabbit_amqp_url = self.pywren_config['rabbitmq'].get(
                        'amqp_url')
                    params = pika.URLParameters(rabbit_amqp_url)
                    connection = pika.BlockingConnection(params)
                    data['rabbitmq'] = connection
                except Exception as e:
                    logger.error('Cannot create the rabbitmq connection: {}',
                                 str(e))
                    data['rabbitmq'] = None
            else:
                logger.error(
                    'Cannot create the rabbitmq connection: Configuration not provided'
                )
                data['rabbitmq'] = None

        if 'id' in func_sig.parameters:
            data['id'] = int(self.call_id)
Ejemplo n.º 5
0
def create_partitions(pywren_config, map_iterdata, chunk_size, chunk_number):
    """
    Method that returns the function that will create the partitions of the objects in the Cloud
    """
    logger.debug('Starting partitioner')

    parts_per_object = None

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()
    urls = set()

    logger.debug("Parsing input data")
    for elem in map_iterdata:
        if 'url' in elem:
            urls.add(elem['url'])
        elif 'obj' in elem:
            sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
            if obj_name:
                obj_names.add((bucket, prefix))
            elif prefix:
                prefixes.add((bucket, prefix))
            else:
                buckets.add(bucket)
            sbs.add(sb)

    if len(sbs) > 1:
        raise Exception(
            'Currently we only support to process one storage backend at a time. '
            'Current storage backends: {}'.format(sbs))

    if [prefixes, obj_names, urls, buckets].count(True) > 1:
        raise Exception(
            'You must provide as an input data a list of bucktes, '
            'a list of buckets with object prefix, a list of keys '
            'or a list of urls. Intermingled types are not allowed.')

    if not urls:
        # process objects from an object store. No url
        sb = sbs.pop()
        storage_handler = Storage(pywren_config, sb).get_storage_handler()
        objects = {}
        if obj_names:
            for bucket, prefix in obj_names:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                objects[bucket].extend(
                    storage_handler.list_objects(bucket, prefix))
        elif prefixes:
            for bucket, prefix in prefixes:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                objects[bucket].extend(
                    storage_handler.list_objects(bucket, prefix))
        elif buckets:
            for bucket in buckets:
                logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
                objects[bucket] = storage_handler.list_objects(bucket)

        keys_dict = {}
        for bucket in objects:
            keys_dict[bucket] = {}
            for obj in objects[bucket]:
                keys_dict[bucket][obj['Key']] = obj['Size']

    if buckets or prefixes:
        partitions, parts_per_object = _split_objects_from_buckets(
            map_iterdata, keys_dict, chunk_size, chunk_number)

    elif obj_names:
        partitions, parts_per_object = _split_objects_from_keys(
            map_iterdata, keys_dict, chunk_size, chunk_number)

    elif urls:
        partitions, parts_per_object = _split_objects_from_urls(
            map_iterdata, chunk_size, chunk_number)

    else:
        raise ValueError('You did not provide any bucket or object key/url')

    return partitions, parts_per_object