Example No. 1
    def run(self, shards):
        result = []

        # Regroup (transpose) the shard outputs so the i-th file of every shard
        # lands in the same bucket: all 0th files together, all 1st files, etc.
        # Each bucket is then sorted by its own child pipeline.
        shuffled_shards = [[] for _ in shards[0]]
        for shard in shards:
            for i, filename in enumerate(shard):
                shuffled_shards[i].append(filename)

        for filenames in shuffled_shards:
            sorted_files = yield shuffler.SortPipeline(filenames)
            result.append(sorted_files)
        yield pipeline_common.Append(*result)
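The regrouping loop above is just a transpose: shards is a list of per-shard file lists, and the loop collects the i-th file of every shard into one bucket so that bucket can be sorted as a group. A small standalone sketch with invented file names (they are not from the library) shows the effect:

# Standalone illustration of the regrouping step; the file names are made up.
shards = [
    ["shard0-file0", "shard0-file1"],   # output files of shard 0
    ["shard1-file0", "shard1-file1"],   # output files of shard 1
    ["shard2-file0", "shard2-file1"],   # output files of shard 2
]

shuffled_shards = [[] for _ in shards[0]]
for shard in shards:
    for i, filename in enumerate(shard):
        shuffled_shards[i].append(filename)

print(shuffled_shards)
# [['shard0-file0', 'shard1-file0', 'shard2-file0'],
#  ['shard0-file1', 'shard1-file1', 'shard2-file1']]

Each inner list would then be handed to its own shuffler.SortPipeline, and pipeline_common.Append gathers the sorted results into a single output list.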
Example No. 2
    def run(self, job_name, filenames):
        sort_mappers = []
        # Fan out one single-shard sort job per input file.
        for i, filename in enumerate(filenames):
            sort_mapper = yield mapper_pipeline.MapperPipeline(
                "%s-shuffle-sort-%s" % (job_name, str(i)),
                __name__ + "._sort_records_map",
                __name__ + "._BatchRecordsReader",
                None, {
                    "files": [filename],
                    "processing_rate": 1000000,
                },
                shards=1)
            sort_mappers.append(sort_mapper)
        # Only after every sort job has finished: collect their output files,
        # schedule cleanup of the intermediates, and return the collected list.
        with pipeline.After(*sort_mappers):
            job_ids = yield pipeline_common.Append(
                *[mapper.job_id for mapper in sort_mappers])
            result = yield _CollectOutputFiles(job_ids)
            with pipeline.After(result):
                yield _CleanupOutputFiles(job_ids)
            yield pipeline_common.Return(result)
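The with pipeline.After(...) blocks are what enforce ordering here: anything yielded inside such a block is only started after all of the listed futures have completed, while the sort mappers themselves are yielded outside any block and therefore run in parallel. A minimal sketch of the same pattern, assuming the appengine-pipeline package is importable as pipeline; the ProduceFiles, ConsumeFiles, and OrderedFanOut classes are invented for illustration:

import pipeline
from pipeline import common as pipeline_common


class ProduceFiles(pipeline.Pipeline):
    """Hypothetical stage that emits a list of file names."""

    def run(self, prefix):
        return ["%s-0" % prefix, "%s-1" % prefix]


class ConsumeFiles(pipeline.Pipeline):
    """Hypothetical stage that post-processes the produced file lists."""

    def run(self, file_lists):
        return sum(len(files) for files in file_lists)


class OrderedFanOut(pipeline.Pipeline):

    def run(self, prefixes):
        producers = []
        for prefix in prefixes:             # fan out: these run in parallel
            p = yield ProduceFiles(prefix)
            producers.append(p)
        # Nothing yielded inside this block starts until every producer is done.
        with pipeline.After(*producers):
            merged = yield pipeline_common.Append(*producers)
            yield ConsumeFiles(merged)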
Example No. 3
    def run(self, config):
        """Runs the stage.

    Args:
      config: Specifies the source object(s) and sinks.
    Yields:
      If necessary, a pipeline future.
    """
        storage = s3.S3(config=config.get('s3Credentials'))

        s3_objects = []
        if 'object' in config:
            s3_objects.append(config['object'])

        if 'objects' in config:
            objects = config['objects']
            for s3_obj in storage.ListBucket(objects['bucket'],
                                             objects.get('prefix')):
                s3_objects.append(s3.S3.MakeUrl(objects['bucket'], s3_obj))

        s3_objects = zip(s3_objects, config['sinks'])
        # Handle one (source, sink) pair in this stage; the remaining pairs are
        # fanned out to child stages below.
        (s3_obj, gcs_obj) = s3_objects.pop()

        # fan out any others
        sub_stages = []
        if 'objects' in config:
            config.pop('objects')
        for (next_s3_obj, next_gcs_obj) in s3_objects:
            cfg = copy.deepcopy(config)
            cfg['object'] = next_s3_obj
            cfg['sinks'] = [next_gcs_obj]
            s = yield S3Input(cfg)
            sub_stages.append(s)

        # Fill in defaults: start reading at byte 0, read the full object, and
        # shard in REQUEST_CHUNK_SIZE pieces, unless the config says otherwise.
        start = config.get('start')
        if not start:
            start = 0
            config['start'] = 0

        length = config.get('length')
        if not length:
            length = storage.StatObject(s3_obj)['size']
            config['length'] = length

        if 'shardSize' not in config:
            config['shardSize'] = self.REQUEST_CHUNK_SIZE

        (shards, compositors) = self.ShardStage(config)
        if shards and compositors:
            # ShardStage split the work up: wait for every shard stage to
            # finish, then run the compositor stages.
            with pipeline.After(*[(yield shard) for shard in shards]):
                _ = [(yield compositor) for compositor in compositors]
        else:
            # Small enough to handle here: read the object directly and hand
            # each chunk to the buffer handler.
            handler = _S3ReadBufferHandler(s3_obj, gcs_obj,
                                           config.get('shardPrefix'))

            storage.ReadObject(url=s3_obj,
                               handler=handler.Handle,
                               start=start,
                               length=length)

            comp_stage = common.Ignore()
            if handler.chunk_urls:
                comp_config = {
                    'contentType': handler.content_type,
                    'sources': handler.chunk_urls,
                    'sinks': [gcs_obj]
                }
                comp_stage = yield gcscompositor.GcsCompositor(comp_config)
                sub_stages.append(comp_stage)

            yield common.Append(*sub_stages)
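Everything this stage needs arrives through the config dictionary. The sketch below lists the keys the code above actually reads; the concrete values (credential field names, URL formats, sizes) are illustrative assumptions, not a documented schema:

config = {
    's3Credentials': {              # passed straight through to s3.S3(config=...)
        'accessKey': 'AKIA...',     # assumed field names
        'accessSecret': '...',
    },
    # Either a single source object ...
    'object': 's3://example-bucket/data/file-000.csv',
    # ... or a bucket listing to fan out over:
    # 'objects': {'bucket': 'example-bucket', 'prefix': 'data/'},
    'sinks': ['gs://example-dest-bucket/data/file-000.csv'],
    # Optional byte range and sharding hints; the stage fills in defaults
    # (start=0, length=object size, shardSize=REQUEST_CHUNK_SIZE) when absent.
    # 'start': 0,
    # 'length': 10485760,
    # 'shardSize': 33554432,
    # 'shardPrefix': 'chunk-',
}

When 'objects' is given, the stage lists the bucket, pairs each listed object with a sink, handles one pair itself, and yields a copy of itself (S3Input) for every remaining pair.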
Example No. 4
    def run(self, defn):
        # Fan out one child stage per definition; they all run in parallel,
        # and their outputs are combined into a single list.
        all_stages = []
        for d in defn:
            s = yield GetStage(d)
            all_stages.append(s)
        yield common.Append(*all_stages)
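GetStage acts as a factory: each definition dictionary is turned into a concrete pipeline instance, and because no pipeline.After block is used, the children are independent and common.Append simply collects their outputs. A minimal sketch of such a factory, with an assumed 'type' key and placeholder stage classes (none of these names come from the original project):

import pipeline


class StageA(pipeline.Pipeline):
    """Placeholder stage used only for this sketch."""

    def run(self, defn):
        return 'A:%s' % defn.get('name')


class StageB(pipeline.Pipeline):
    """Another placeholder stage."""

    def run(self, defn):
        return 'B:%s' % defn.get('name')


# Hypothetical dispatch table keyed on an assumed 'type' field.
_STAGE_CLASSES = {'stageA': StageA, 'stageB': StageB}


def GetStage(defn):
    return _STAGE_CLASSES[defn['type']](defn)

With a definition list such as [{'type': 'stageA', 'name': 'x'}, {'type': 'stageB', 'name': 'y'}], the run method above would yield both stages, and common.Append would resolve to ['A:x', 'B:y'].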