    def run(self, config):
        """Runs the stage.

    Args:
      config: Specifies the source objects and sinks.
    """
        storage = gcs.Gcs()

        # Adds a snapshot id to all the records so they can be joined if necessary.
        snapshot_id = int(time.mktime(time.localtime()))

        for (source, sink) in zip(config['sources'], config['sinks']):
            logging.debug('Transforming %s to %s', source, sink)

            with contextlib.closing(storage.OpenObject(source)) as source_file:
                with contextlib.closing(storage.OpenObject(
                        sink, mode='w')) as sink_file:
                    for line in source_file:
                        if not line:
                            continue
                        json_obj = json.loads(line)
                        json_obj['snapshotId'] = snapshot_id

                        # Transforms the resource.
                        try:
                            GceDataTransformer._transform_func[
                                json_obj['kind']](json_obj)
                        except KeyError:
                            logging.warning('Unrecognized resource %r', line)

                        sink_file.write('%s\n' % json.dumps(json_obj))
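For reference, a minimal sketch of the kind-keyed dispatch table that GceDataTransformer._transform_func appears to be; the actual transform functions are not part of this snippet, so the kind string and field handling below are illustrative assumptions only.

def _transform_instance(record):
    # Illustrative transform: shortens the zone URL to its final component.
    # The real transformers in GceDataTransformer are not shown here.
    record['zone'] = record.get('zone', '').split('/')[-1]
    return record

# Hypothetical shape of the dispatch table used by the stage above:
# a mapping from a record's 'kind' field to a transform callable.
_example_transform_func = {
    'compute#instance': _transform_instance,
}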
Example 2
    def run(self, config):
        """Runs the stage.

    Args:
      config: Specifies the source object(s) and sinks.
    """
        storage = gcs.Gcs()
        objs = []
        if 'object' in config:
            objs.append(config['object'])
        if 'objects' in config:
            objects = config['objects']
            for o in storage.ListBucket(objects['bucket'],
                                        objects.get('prefix'),
                                        objects.get('glob')):
                objs.append(gcs.Gcs.MakeUrl(objects['bucket'], o))

        diff = len(objs) - len(config['sinks'])
        if diff < 0:
            logging.warning('More sinks than objects available. Ignoring: %s',
                            config['sinks'][diff:])
        elif diff > 0:
            logging.info('Found more objects than available sinks.')

        # Copies any objects to sinks if the URLs differ.
        to_copy = zip(objs, config['sinks'])
        res = [storage.CopyObject(o[0], o[1]) for o in to_copy if o[0] != o[1]]
        for r in res:
            logging.info('Copied to %s', r['selfLink'])
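A minimal, illustrative config for this copy stage, based only on the keys the code above reads ('object'/'objects', 'sinks'); the bucket, prefix, glob, and URLs are placeholders, not values from the project.

example_copy_config = {
    'objects': {
        'bucket': 'example-bucket',   # bucket whose contents are listed
        'prefix': 'exports/',         # optional prefix filter
        'glob': '*.json',             # optional glob filter
    },
    'sinks': ['gs://example-bucket/copies/export-0.json'],
}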
Example 3
  def run(self, config):
    """Runs the stage.

    Object composition requires that all source objects reside in the same
    bucket as the destination object. This stage strips any sources whose
    bucket does not match the destination bucket, which is taken from the
    first sink URL.
    Optionally, sources can be deleted after composition.

    Args:
      config: Specifies the source object(s) and sink.

    Yields:
      Possible deleter stage future.
    """
    (dest_bucket, dest_obj) = gcs.Gcs.UrlToBucketAndName(config['sinks'][0])

    sources = config['sources']
    src_objects = [gcs.Gcs.UrlToBucketAndName(s)[1] for s in sources]

    storage = gcs.Gcs()
    storage.ComposeObjects(dest_bucket,
                           src_objects,
                           dest_obj,
                           config['contentType'])

    if config.get('deleteSources', False):
      yield gcsdeleter.GcsDeleter({'sources': sources})
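An illustrative config for the compositor stage above, again derived only from the keys it reads; the URLs and content type are placeholders.

example_compose_config = {
    'sources': ['gs://example-bucket/shard-0', 'gs://example-bucket/shard-1'],
    'sinks': ['gs://example-bucket/combined.json'],  # destination object
    'contentType': 'application/json',
    'deleteSources': True,  # optional; yields a GcsDeleter stage afterwards
}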
Example 4
  def run(self, config):
    """Runs the stage.

    Args:
      config: Specifies the projectID and required zones.
    """
    logging.info('GceInstancesInput.Pipeline start\n%s',
                 json.dumps(config, indent=4, separators=(',', ': ')))

    storage = gcs.Gcs()
    gce = computeengine.ComputeEngine(config['apiInput']['projectId'])

    replies = []

    start_time = int(time.mktime(time.localtime()))
    for zone in config['zones']:
      replies.append(gce.ListInstances(zone=zone, fields=config['fields']))
    end_time = int(time.mktime(time.localtime()))

    with contextlib.closing(StringIO.StringIO()) as buf:
      for reply in replies:
        for instance in reply:
          # Inserts start and end snapshot timestamps.
          instance['snapshotStartTime'] = start_time
          instance['snapshotEndTime'] = end_time
          json.dump(instance, buf)
          buf.write('\n')
      buf.seek(0)
      storage.InsertObject(buf, url=config['sinks'][0])
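An illustrative config for the instances input stage above; the project id, zones, fields selector, and sink URL are placeholders, not values from the project.

example_instances_config = {
    'apiInput': {'projectId': 'example-project'},
    'zones': ['us-central1-a', 'us-central1-b'],
    # Partial-response selector passed through to ListInstances; the exact
    # format accepted by the wrapper is assumed here.
    'fields': 'items(name,status,machineType)',
    'sinks': ['gs://example-bucket/gce/instances.json'],
}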
Example 5
    def run(self, config):
        """Runs the stage.

    Args:
      config: Specifies the projectID and required zones.
    """
        logging.info('GceZoneOperationsInput.Pipeline start\n%s',
                     json.dumps(config, indent=4, separators=(',', ': ')))

        # To obtain new operations that are not added to BigQuery, all
        # operations in BigQuery over the past 3 days are obtained.
        # The Compute Engine Operations API is called for the past 2 days.
        # All operations from the API are checked against the BigQuery
        # data to verify they are already recorded. If they are new, they
        # are added to a list which will be ingested to BigQuery.
        # Two days of operations are obtained from the Operations API to
        # guarantee that operations which occurred at the end of the day are
        # captured. Three days of BigQuery data are obtained to handle the
        # timezone difference. (An illustrative sketch of this date window
        # follows this stage.)

        today_date = datetime.date.today()
        filter_str = GceZoneOperationsInput._ListZoneOperationsFilter(
            today_date, num_days=2)
        logging.info('Filter string for operations: %s', filter_str)

        gce = computeengine.ComputeEngine(
            config['destinationTable']['projectId'])
        new_operations = []
        start_time = int(time.mktime(time.localtime()))
        for zone in config['zones']:
            stored_operations = GceZoneOperationsInput._GetStoredOperations(
                zone=zone, config=config, date=today_date, number_days=3)
            logging.debug('Found %d operation ids already stored for zone %s.',
                          len(stored_operations), zone)

            next_page_token = None
            while True:
                # The previous page's token is passed back so later pages are
                # fetched; the 'page_token' keyword name is an assumption
                # about the ComputeEngine wrapper's signature.
                operations, next_page_token = gce.ListZoneOperations(
                    zone, filter_expression=filter_str,
                    page_token=next_page_token)
                for operation in operations:
                    if operation['id'] not in stored_operations:
                        new_operations.append(operation)
                if not next_page_token:
                    break
        end_time = int(time.mktime(time.localtime()))
        logging.info('All pages of operation reviewed, adding %d operations.',
                     len(new_operations))

        storage = gcs.Gcs()
        with contextlib.closing(StringIO.StringIO()) as buf:
            for operation in new_operations:
                #  Inserts start and end snapshot timestamps.
                operation['snapshotStartTime'] = start_time
                operation['snapshotEndTime'] = end_time
                # Indicates that this is a zone resource.
                operation['resourceType'] = 'zone'
                json.dump(operation, buf)
                buf.write('\n')
            buf.seek(0)
            storage.InsertObject(buf, url=config['sinks'][0])
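A standalone sketch of the date-window arithmetic described in the comment at the top of this stage (fetch two days of operations, compare against three days of stored data). The filter format below is a placeholder; the real string comes from GceZoneOperationsInput._ListZoneOperationsFilter, which is not included in this snippet.

import datetime

def example_operations_window(today, num_days=2):
    # Window start: num_days before 'today', inclusive.
    window_start = today - datetime.timedelta(days=num_days)
    # Placeholder filter expression; the project's actual filter syntax is
    # produced by _ListZoneOperationsFilter.
    return 'insertTime >= %s' % window_start.isoformat()

# e.g. example_operations_window(datetime.date.today()) covers the last two
# days, while the BigQuery lookup above covers three.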
Example 6
  def run(self, config):
    """Runs the stage.

    Args:
      config: Specifies the source object(s).
    """
    storage = gcs.Gcs()
    for s in config['sources']:
      storage.DeleteObject(*gcs.Gcs.UrlToBucketAndName(s))
Example 7
 def __init__(self, s3_obj, gcs_obj, shard_prefix):
     self.s3_obj = s3_obj
     self.gcs_obj = gcs_obj
     self.gcs_storage = gcs.Gcs()
     (gcs_bucket, _) = gcs.Gcs.UrlToBucketAndName(gcs_obj)
     self.url_gen = gcs.Gcs.UrlCreator(gcs_bucket, shard_prefix)
     self.chunk_urls = []
     self.content_type = None
     self.once = True
Example 8
    def run(self, config):
        """Runs the stage.

    Args:
      config: Specifies the source object(s) and sinks.
    Yields:
      If necessary, a pipeline future for a GcsCompositor stage
    """
        start = config.get('start')
        if not start:
            start = 0
            config['start'] = 0

        if 'length' not in config:
            # Hits the resource with a one-byte range GET to find out the
            # length. This is necessary because App Engine strips the
            # Content-Length header from a HEAD request. (A standalone sketch
            # of this probe follows this stage.)
            req = urllib2.Request(config['url'])
            req.add_header('Range', 'bytes=0-0')
            meta_inf = None
            with contextlib.closing(urllib2.urlopen(req)) as resp:
                meta_inf = resp.info()

            range_len = meta_inf.getheaders('Content-Range')
            if range_len:
                range_len = long(range_len[0].split('/')[1])
                config['length'] = range_len - start
            else:
                logging.warning('Cannot determine resource length.')

        if 'shardSize' not in config:
            config['shardSize'] = self.REQUEST_CHUNK_SIZE

        (shards, compositors) = self.ShardStage(config)
        if shards and compositors:
            with pipeline.After(*[(yield shard) for shard in shards]):
                _ = [(yield compositor) for compositor in compositors]
        else:
            gcs_obj = config['sinks'][0]
            gcs_storage = gcs.Gcs()

            start = config.get('start', 0)
            length = config.get('length')

            req = urllib2.Request(config['url'])
            range_bytes = 'bytes=%s-%s'
            if length:
                range_bytes %= (start, start + length - 1)
            else:
                range_bytes %= (start, '')
            req.add_header('Range', range_bytes)
            with contextlib.closing(urllib2.urlopen(req, timeout=300)) as resp:
                with contextlib.closing(StringIO.StringIO(
                        resp.read())) as resp_buf:
                    gcs_storage.InsertObject(resp_buf, url=gcs_obj)
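A standalone version of the one-byte range probe used above to discover the resource length (App Engine strips Content-Length from HEAD responses). Python 2 style to match the surrounding code; the URL argument is whatever config['url'] would be.

import contextlib
import urllib2

def probe_resource_length(url):
    # Requests a single byte so the response carries a Content-Range header
    # of the form 'bytes 0-0/<total>'.
    req = urllib2.Request(url)
    req.add_header('Range', 'bytes=0-0')
    with contextlib.closing(urllib2.urlopen(req)) as resp:
        content_range = resp.info().getheaders('Content-Range')
    if not content_range:
        return None  # server ignored the Range header
    return long(content_range[0].split('/')[1])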
Example 9
    def run(self, config):
        """Runs the stage.

    Args:
      config: Specifies either an inline GQL query or a GCS object that
        contains a GQL query.
    """
        storage = gcs.Gcs()
        gql = config.get('gql')
        if not gql:
            with contextlib.closing(
                    storage.OpenObject(url=config['object'])) as stream:
                with contextlib.closing(StringIO.StringIO()) as gql_buf:
                    while True:
                        buf = stream.read(gcs.Gcs.READ_CHUNK_SIZE)
                        if not buf:
                            break
                        gql_buf.write(buf)
                    gql = gql_buf.getvalue()

        qkwargs = {}
        consistency = None
        keys_only = False
        projection = None
        if 'params' in config:
            params = config['params']
            qkwargs = params.get('values', {})
            # Uses equality (not identity) to compare the string and maps it
            # to the ndb constant expected by query.iter's read_policy.
            if params.get('consistency') == 'eventual':
                consistency = ndb.EVENTUAL_CONSISTENCY
            keys_only = params.get('keysOnly', False)
            projection = params.get('projection')

        # for now just emit a dumb CSV
        # TODO(user): better way to decide output type.
        # TODO(user): make robust - don't always want to spool into memory...
        writer = None
        with contextlib.closing(StringIO.StringIO()) as buf:
            query = ndb.gql(gql, **qkwargs)
            for entity in query.iter(read_policy=consistency,
                                     keys_only=keys_only,
                                     projection=projection):
                if not projection:
                    # pylint: disable=protected-access
                    projection = entity._properties.keys()
                if not writer:
                    writer = csv.DictWriter(buf, projection)
                    headers = dict((p, p) for p in projection)
                    writer.writerow(headers)
                writer.writerow(entity.to_dict())

            # TODO(user): what to do with multiple sinks?
            buf.seek(0)
            storage.InsertObject(buf, url=config['sinks'][0])
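An illustrative config for the Datastore-to-CSV stage above, using an inline GQL query; the kind, property names, and sink URL are placeholders.

example_gql_config = {
    'gql': 'SELECT * FROM ExampleKind WHERE status = :state',
    'params': {
        'values': {'state': 'ACTIVE'},   # bound into ndb.gql as kwargs
        'consistency': 'eventual',       # mapped to ndb.EVENTUAL_CONSISTENCY
        'keysOnly': False,
        'projection': ['name', 'status'],
    },
    'sinks': ['gs://example-bucket/exports/example.csv'],
}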
Example 10
  def run(self, config):
    """Runs the stage.

    Args:
      config: Specifies the destination object and source.
    """
    storage = gcs.Gcs()
    src = config['sources'][0]
    dest = config['object']
    if src != dest:
      res = storage.CopyObject(src, dest)
      logging.info('Copied %s to %s', src, res['selfLink'])
Example 11
 def testList(self):
     objs = ['/bucket/booo', '/bucket/foozle', '/bucket/dir/a/b']
     objs_in_gs = ['gs:/' + o for o in objs]
     storage = gcs.Gcs()
     mock_list_resp = mock.MagicMock()
     mock_list_resp.return_value = [
         mock.MagicMock(filename=o) for o in objs
     ]
     with mock.patch.object(cloudstorage, 'listbucket', mock_list_resp):
         res = storage.ListBucket('bucket')
         for o in objs_in_gs:
             self.assertIn(o, res)
Example 12
    def testInsert(self):
        mock_service = mock.MagicMock()
        mock_buckets = mock.MagicMock()
        mock_service.buckets.return_value = mock_buckets

        storage = gcs.Gcs()
        storage._service = mock_service
        storage.InsertBucket('project', 'bucket')

        call = mock.call.insert(project='project',
                                body={
                                    'name': 'bucket',
                                    'location': 'US'
                                })
        mock_buckets.assert_has_calls(call)
Example 13
    def testComposeNoRec(self):
        src = ['0', '1', '2', '3', '4', '5', '6', '7']
        mock_service = mock.MagicMock()
        mock_objects = mock.MagicMock()
        mock_service.objects.return_value = mock_objects

        storage = gcs.Gcs()
        storage._service = mock_service
        storage.ComposeObjects('bucket', src, 'dest', 'text/plain')

        call = mock.call.compose(destinationBucket='bucket',
                                 destinationObject='dest',
                                 body={
                                     'sourceObjects': [{
                                         'name': s
                                     } for s in src],
                                     'destination': {
                                         'contentType': 'text/plain'
                                     }
                                 })
        mock_objects.assert_has_calls(call)
Example 14
  def __init__(self, config):
    """Constructor.

    Args:
      config: Transform configuration in Python dictionary.
    Raises:
      HadoopError: When no Hadoop cluster is available.
    """
    self.config = config
    self.boundary = 'Hadoop_MapReduce_Request_Http_Boundary'
    self.cloud_storage_client = gcs.Gcs()

    cluster_query = datastore.ClusterInfo.query()
    if not cluster_query.count():
      raise HadoopError('No Hadoop cluster available')
    # For now, always use the first Hadoop cluster.
    # TODO(user): Make configurable which Hadoop cluster to use.
    hadoop_cluster = cluster_query.fetch(1)[0]
    logging.info('Starting Hadoop MapReduce on cluster "%s"',
                 hadoop_cluster.name)
    self.master_ip = hadoop_cluster.GetMasterIpAddress()
Example 15
    def testStat(self):
        class MockStat(object):
            def __init__(self):
                self.st_size = 100
                self.etag = '35cb8ce70d66aac33163db67180fb6d3'
                self.content_type = 'text/plain'
                self.metadata = {}

        expect = {
            'size': 100,
            'md5Hash': '35cb8ce70d66aac33163db67180fb6d3',
            'contentType': 'text/plain',
            'metadata': {}
        }

        # test using gae cloudstorage api
        storage = gcs.Gcs()
        mock_stat_resp = mock.MagicMock()
        mock_stat_resp.return_value = MockStat()
        with mock.patch.object(cloudstorage, 'stat', mock_stat_resp):
            stat = storage.StatObject(bucket='bucket', obj='obj')
            self.assertSameStructure(stat, expect)
Example 16
    def run(self, config):
        """Transform data according to some search/replace patterns from config.

    Args:
      config: Specifies what to replace and which columns are wanted.
    Yields:
      possibly yields some sharded stages.
    """

        # quick check to skip the leading rows
        skip_leading_rows = config.get('skipLeadingRows', 0)
        start = config.get('start', 0)
        source_url = config['sources'][0]

        if 'length' not in config:
            config['length'] = gcs.Gcs().StatObject(
                url=source_url)['size'] - start

        if skip_leading_rows > 0 and start == 0:
            # We're skipping these rows by using the start parameter.
            config['skipLeadingRows'] = 0
            bytes_to_skip = FindStartAfterSkippingRows(skip_leading_rows,
                                                       source_url)
            # We start one byte back because processing begins at the start
            # of the next line; backing up avoids skipping a line when the
            # computed start falls exactly on a line boundary.
            config['start'] = max(0, bytes_to_skip - 1)
            if 'length' in config:
                config['length'] -= bytes_to_skip

        if 'shardSize' not in config:
            config['shardSize'] = self.SHARD_CHUNK_SIZE

        (shards, compositors) = self.ShardStage(config)
        if shards and compositors:
            with pipeline.After(*[(yield shard) for shard in shards]):
                _ = [(yield compositor) for compositor in compositors]
        else:
            # TODO(user) handle this task dying halfway through and resuming.
            # TODO(user) handle some way to update progress (memcache!?)

            # TODO(user) if the input file/blob is over 10M split it into
            # chunks and run over those and then merge it. Should be easy with
            # Pipelines.

            logging.info('Transformer start\n%s',
                         json.dumps(config, indent=4, separators=(',', ': ')))

            logging.info('CsvMatchReplace called for = %s', source_url)

            sink_url = config['sinks'][0]
            if len(config['sinks']) > 1:
                badrows_url = config['sinks'][1]
            else:
                badrows_url = None

            finished = ReadTransformWrite(config, source_url, sink_url,
                                          badrows_url)
            if not finished:
                logging.error('Unable to CsvMatchReplace')
                return
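An illustrative config for the CSV match/replace stage above, limited to the keys this run method reads; the match/replace rules themselves are consumed by ReadTransformWrite, which is not shown in this snippet.

example_csv_config = {
    'sources': ['gs://example-bucket/raw/data.csv'],
    'sinks': [
        'gs://example-bucket/clean/data.csv',     # transformed rows
        'gs://example-bucket/clean/badrows.csv',  # optional: rejected rows
    ],
    'skipLeadingRows': 1,  # header row is skipped via the 'start' offset
}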
Example 17
    def testComposeOneRec(self):
        src = ['0', '1', '2', '3', '4', '5', '6', '7']
        mock_service = mock.MagicMock()
        mock_objects = mock.MagicMock()
        mock_service.objects.return_value = mock_objects

        storage = gcs.Gcs()
        with mock.patch.object(storage,
                               'UrlCreator',
                               return_value=lambda: 'gs://bucket/X',
                               autospec=True):
            storage.MAX_COMPOSABLE_OBJECTS = 3
            storage._service = mock_service
            storage.ComposeObjects('bucket', src, 'dest', 'text/plain')

            call_a = mock.call.compose(destinationBucket='bucket',
                                       destinationObject='X',
                                       body={
                                           'sourceObjects': [{
                                               'name': '0'
                                           }, {
                                               'name': '1'
                                           }, {
                                               'name': '2'
                                           }],
                                           'destination': {
                                               'contentType': 'text/plain'
                                           }
                                       })
            call_b = mock.call.compose(destinationBucket='bucket',
                                       destinationObject='X',
                                       body={
                                           'sourceObjects': [{
                                               'name': '3'
                                           }, {
                                               'name': '4'
                                           }, {
                                               'name': '5'
                                           }],
                                           'destination': {
                                               'contentType': 'text/plain'
                                           }
                                       })
            call_c = mock.call.compose(destinationBucket='bucket',
                                       destinationObject='X',
                                       body={
                                           'sourceObjects': [{
                                               'name': '6'
                                           }, {
                                               'name': '7'
                                           }],
                                           'destination': {
                                               'contentType': 'text/plain'
                                           }
                                       })
            call_d = mock.call.compose(destinationBucket='bucket',
                                       destinationObject='dest',
                                       body={
                                           'sourceObjects': [{
                                               'name': 'X'
                                           }, {
                                               'name': 'X'
                                           }, {
                                               'name': 'X'
                                           }],
                                           'destination': {
                                               'contentType': 'text/plain'
                                           }
                                       })

            calls = [call_a, call_b, call_c, call_d]
            mock_objects.assert_has_calls(calls, any_order=True)