Example #1
    def test_retain_copy(self, session_mock):
        settings = {
            'aws_bucket': self.aws_bucket,
            'aws_identity': 'identity',
            'aws_secret': 'credential',
            'account': 'account',
            'container': 'container',
            'retain_local': False
        }

        sync = SyncContainer(self.scratch_space, settings)
        sync.provider = mock.Mock()
        swift_client = mock.Mock()
        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        sync.handle(row, swift_client)

        _, _, swift_ts = decode_timestamps(row['created_at'])
        swift_ts.offset += 1

        sync.provider.upload_object.assert_called_once_with(
            row['name'], 99, swift_client)
        swift_client.delete_object.assert_called_once_with(
            settings['account'],
            settings['container'],
            row['name'],
            headers={'X-Timestamp': Timestamp(swift_ts).internal})
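
The test above decodes a bare str(time.time() - 5) value directly. A minimal sketch of why that works, assuming Swift's swift.common.utils is importable: when the 'created_at' value carries no encoded content-type or metadata offsets, decode_timestamps returns the same Timestamp three times, so the third element can be used as the metadata timestamp.

# Minimal sketch (assumes Swift's swift.common.utils is importable): a plain
# 'created_at' string with no '+' offsets decodes to three equal Timestamps.
import time

from swift.common.utils import Timestamp, decode_timestamps

created_at = str(time.time() - 5)
ts_data, ts_ctype, ts_meta = decode_timestamps(created_at)
assert ts_data == ts_ctype == ts_meta == Timestamp(created_at)
print(ts_meta.internal)  # e.g. '1500000000.00000'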
Example #2
def parse_raw_obj(obj_info):
    """
    Translate a reconciler container listing entry to a dictionary
    containing the parts of the misplaced object queue entry.

    :param obj_info: an entry in a container listing with the
                     required keys: name, content_type, and hash

    :returns: a queue entry dict with the keys: q_policy_index, account,
              container, obj, q_op, q_ts, q_record, and path
    """
    raw_obj_name = obj_info['name'].encode('utf-8')

    policy_index, obj_name = raw_obj_name.split(':', 1)
    q_policy_index = int(policy_index)
    account, container, obj = split_path(obj_name, 3, 3, rest_with_last=True)
    try:
        q_op = {
            'application/x-put': 'PUT',
            'application/x-delete': 'DELETE',
        }[obj_info['content_type']]
    except KeyError:
        raise ValueError('invalid operation type %r' %
                         obj_info.get('content_type', None))
    return {
        'q_policy_index': q_policy_index,
        'account': account,
        'container': container,
        'obj': obj,
        'q_op': q_op,
        'q_ts': decode_timestamps((obj_info['hash']))[0],
        'q_record': last_modified_date_to_timestamp(obj_info['last_modified']),
        'path': '/%s/%s/%s' % (account, container, obj)
    }
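
A usage sketch for the function above. As written it targets Python 2 era Swift (it byte-encodes the name before splitting) and assumes split_path, decode_timestamps and last_modified_date_to_timestamp are imported from swift.common.utils; the listing entry values below are made up for illustration. The name packs the queue policy index and the object path, the hash field carries the object's timestamp, and the content type selects the operation.

# Hypothetical reconciler listing entry -- all values are illustrative.
entry = {
    'name': '1:/AUTH_test/container/obj',
    'content_type': 'application/x-put',
    'hash': '1500000000.00000',
    'last_modified': '2017-06-13T22:40:00.000000',
}
queue_entry = parse_raw_obj(entry)
print('%s %s' % (queue_entry['q_op'], queue_entry['path']))
# PUT /AUTH_test/container/obj
print(queue_entry['q_ts'].internal)  # 1500000000.00000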
Example #3
    def test_retain_copy(self, session_mock):
        settings = {
            'aws_bucket': self.aws_bucket,
            'aws_identity': 'identity',
            'aws_secret': 'credential',
            'account': 'account',
            'container': 'container',
            'retain_local': False
        }

        sync = SyncContainer(self.scratch_space, settings, self.stats_factory)
        sync.provider = mock.Mock()
        sync.provider.upload_object.return_value = SyncS3.UploadStatus.PUT
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {}
        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        sync.handle(row, swift_client)

        _, _, swift_ts = decode_timestamps(row['created_at'])

        sync.provider.upload_object.assert_called_once_with(
            row, swift_client, mock.ANY)
        sync.provider.delete_local_object.assert_called_once_with(
            swift_client, row, swift_ts, False)
        sync.stats_reporter.increment.assert_called_once_with(
            'copied_objects', 1)
Example #4
    def handle(self, row, swift_client):
        if row['deleted']:
            if self.propagate_delete:
                self.provider.delete_object(row['name'], swift_client)
        else:
            # The metadata timestamp should always be the latest timestamp
            _, _, meta_ts = decode_timestamps(row['created_at'])
            if time.time() <= self.copy_after + meta_ts.timestamp:
                raise RetryError('Object is not yet eligible for archive')
            self.provider.upload_object(row['name'],
                                        row['storage_policy_index'],
                                        swift_client)

            if not self.retain_local:
                # NOTE: We rely on the DELETE object X-Timestamp header to
                # mitigate races where the object may be overwritten. We
                # increment the offset to ensure that we never remove new
                # customer data.
                self.logger.debug("Creating a new TS: %f %f" %
                                  (meta_ts.offset, meta_ts.timestamp))
                delete_ts = Timestamp(meta_ts, offset=meta_ts.offset + 1)
                try:
                    swift_client.delete_object(
                        self._account,
                        self._container,
                        row['name'],
                        headers={'X-Timestamp': delete_ts.internal})
                except UnexpectedResponse as e:
                    if '409 Conflict' in e.message:
                        pass
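
A small sketch of the offset bump used in the DELETE above, assuming Swift's swift.common.utils: incrementing the Timestamp offset produces a value that sorts after the original without advancing the wall-clock part, so the archive DELETE can never out-rank a later client write that arrives with a newer X-Timestamp.

from swift.common.utils import Timestamp

meta_ts = Timestamp(1500000000.0)  # illustrative value
delete_ts = Timestamp(meta_ts, offset=meta_ts.offset + 1)
print(meta_ts.internal)    # 1500000000.00000
print(delete_ts.internal)  # 1500000000.00000_0000000000000001
assert delete_ts > meta_ts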
Example #5
    def handle(self, row, swift_client):
        if self.exclude_regex.match(row['name']) is not None:
            self.logger.debug('Skipping excluded object: %s/%s' %
                              (self._container, row['name'].decode('utf-8')))
            return

        if row['deleted']:
            if self.propagate_delete:
                self.provider.delete_object(row['name'])
                self.stats_reporter.increment('deleted_objects', 1)
        else:
            # The metadata timestamp should always be the latest timestamp
            _, _, meta_ts = decode_timestamps(row['created_at'])
            if time.time() <= self.copy_after + meta_ts.timestamp:
                raise RetryError('Object is not yet eligible for archive')
            status = self.provider.upload_object(
                row, swift_client,
                lambda bytes_uploaded: self.stats_reporter.increment(
                    'bytes', bytes_uploaded))
            if status == BaseSync.UploadStatus.PUT:
                self.stats_reporter.increment('copied_objects', 1)

            uploaded_statuses = [
                BaseSync.UploadStatus.PUT,
                BaseSync.UploadStatus.POST,
                # NOOP means the object already exists
                BaseSync.UploadStatus.NOOP
            ]
            if not self.retain_local and status in uploaded_statuses:
                self.provider.delete_local_object(swift_client, row, meta_ts,
                                                  self.retain_local_segments)
Example #6
def parse_raw_obj(obj_info):
    """
    Translate a reconciler container listing entry to a dictionary
    containing the parts of the misplaced object queue entry.

    :param obj_info: an entry in a container listing with the
                     required keys: name, content_type, and hash

    :returns: a queue entry dict with the keys: q_policy_index, account,
              container, obj, q_op, q_ts, q_record, and path
    """
    raw_obj_name = obj_info['name'].encode('utf-8')

    policy_index, obj_name = raw_obj_name.split(':', 1)
    q_policy_index = int(policy_index)
    account, container, obj = split_path(obj_name, 3, 3, rest_with_last=True)
    try:
        q_op = {
            'application/x-put': 'PUT',
            'application/x-delete': 'DELETE',
        }[obj_info['content_type']]
    except KeyError:
        raise ValueError('invalid operation type %r' %
                         obj_info.get('content_type', None))
    return {
        'q_policy_index': q_policy_index,
        'account': account,
        'container': container,
        'obj': obj,
        'q_op': q_op,
        'q_ts': decode_timestamps((obj_info['hash']))[0],
        'q_record': last_modified_date_to_timestamp(
            obj_info['last_modified']),
        'path': '/%s/%s/%s' % (account, container, obj)
    }
Example #7
 def _transform_record(self, record):
     """
     Decode the created_at timestamp into separate data, content-type and
     meta timestamps and replace the created_at timestamp with the
     metadata timestamp i.e. the last-modified time.
     """
     t_data, t_ctype, t_meta = decode_timestamps(record[1])
     return (record[0], t_meta.internal) + record[2:]
Example #8
 def _transform_record(self, record):
     """
     Decode the created_at timestamp into separate data, content-type and
     meta timestamps and replace the created_at timestamp with the
     metadata timestamp i.e. the last-modified time.
     """
     t_data, t_ctype, t_meta = decode_timestamps(record[1])
     return (record[0], t_meta.internal) + record[2:]
Example #9
    def test_fail_upload_segment(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        base.logger = mock.Mock()
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {
            'x-static-large-object': 'true'
        }

        def _get_object(account, container, key, **kwargs):
            manifest = [{
                'name': '/container_segments/part1',
                'hash': 'deadbeef'
            }, {
                'name': '/container_segments/part2',
                'hash': 'deadbeef2'
            }]
            body = json.dumps(manifest)
            headers = {'etag': hashlib.md5(body).hexdigest()}
            return 200, headers, StringIO.StringIO(body)

        def _delete_object(acc, cont, obj, acceptable_statuses):
            if obj == 'part1':
                raise UnexpectedResponse('foo', None)

        swift_client.get_object.side_effect = _get_object
        swift_client.delete_object.side_effect = _delete_object

        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }

        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)

        # manifest should not be deleted
        swift_client.delete_object.assert_has_calls([
            mock.call(self.settings['account'],
                      'container_segments',
                      'part1',
                      acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'],
                      'container_segments',
                      'part2',
                      acceptable_statuses=(2, 404, 409))
        ])

        base.logger.warning.assert_called_once_with(
            'Failed to delete segment %s/%s/%s: %s', 'account',
            'container_segments', 'part1', 'foo')
        base.logger.error.assert_called_once_with(
            'Failed to delete %s segments of %s/%s', 1, 'container', 'foo')
Example #10
 def _get_new_rows(self, broker, start_row, nodes, node_id, verifying):
     rows = []
     if verifying:
         cutoff = time.time() - self._verification_slack
     for row in broker.get_items_since(start_row, self.items_chunk):
         hnum = num_from_row(row)
         if not verifying and hnum % nodes != node_id:
             continue
         ts = decode_timestamps(row['created_at'])[2].timestamp
         if verifying and ts > cutoff:
             break
         rows.append(row)
     return rows
Example #11
def get_reconciler_container_name(obj_timestamp):
    """
    Get the name of a container into which a misplaced object should be
    enqueued. The name is the object's last modified time rounded down to the
    nearest hour.

    :param obj_timestamp: a string representation of the object's 'created_at'
                          time from its container db row.
    :return: a container name
    """
    # Use last modified time of object to determine reconciler container name
    _junk, _junk, ts_meta = decode_timestamps(obj_timestamp)
    return str(
        int(ts_meta) // MISPLACED_OBJECTS_CONTAINER_DIVISOR *
        MISPLACED_OBJECTS_CONTAINER_DIVISOR)
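
A worked example of the rounding above. The divisor value of 3600 seconds is an assumption here, taken from the docstring's "nearest hour" wording, and the timestamp is made up.

MISPLACED_OBJECTS_CONTAINER_DIVISOR = 3600  # assumed: one hour

ts_meta = 1500000123  # whole seconds of the decoded metadata timestamp
container_name = str(ts_meta // MISPLACED_OBJECTS_CONTAINER_DIVISOR *
                     MISPLACED_OBJECTS_CONTAINER_DIVISOR)
print(container_name)  # '1499997600', the start of the enclosing hour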
Example #12
def get_reconciler_container_name(obj_timestamp):
    """
    Get the name of a container into which a misplaced object should be
    enqueued. The name is the object's last modified time rounded down to the
    nearest hour.

    :param obj_timestamp: a string representation of the object's 'created_at'
                          time from its container db row.
    :return: a container name
    """
    # Use last modified time of object to determine reconciler container name
    _junk, _junk, ts_meta = decode_timestamps(obj_timestamp)
    return str(int(ts_meta) //
               MISPLACED_OBJECTS_CONTAINER_DIVISOR *
               MISPLACED_OBJECTS_CONTAINER_DIVISOR)
Example #13
    def test_retain_copy_slo(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {
            'x-static-large-object': 'true'
        }

        def _get_object(account, container, key, **kwargs):
            manifest = [{
                'name': '/container_segments/part1',
                'hash': 'deadbeef'
            }, {
                'name': '/container_segments/part2',
                'hash': 'deadbeef2'
            }]
            body = json.dumps(manifest)
            headers = {'etag': hashlib.md5(body).hexdigest()}
            return 200, headers, StringIO.StringIO(body)

        swift_client.get_object.side_effect = _get_object
        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }

        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)
        swift_ts.offset += 1

        swift_client.delete_object.assert_has_calls([
            mock.call(self.settings['account'],
                      'container_segments',
                      'part1',
                      acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'],
                      'container_segments',
                      'part2',
                      acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'],
                      self.settings['container'],
                      row['name'],
                      acceptable_statuses=(2, 404, 409),
                      headers={'X-Timestamp': Timestamp(swift_ts).internal})
        ])
Example #14
    def test_retain_copy_dlo(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        swift_client = mock.NonCallableMock()
        swift_client.get_object_metadata.return_value = {
            'x-object-manifest': 'container_segments/segment_'
        }

        swift_client.make_request.side_effect = (
            mock.Mock(
                body=json.dumps([{'name': 'segments_%d' % (i + 1),
                                  'hash': 'deadbeef'} for i in range(2)]),
                status_int=200),
            mock.Mock(body='[]', status_int=200))

        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }

        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)
        swift_ts.offset += 1

        swift_client.delete_object.assert_has_calls([
            mock.call(self.settings['account'],
                      'container_segments',
                      'segments_1',
                      acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'],
                      'container_segments',
                      'segments_2',
                      acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'],
                      self.settings['container'],
                      row['name'],
                      acceptable_statuses=(2, 404, 409),
                      headers={'X-Timestamp': Timestamp(swift_ts).internal})
        ])
Example #15
    def test_retain_copy(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {}

        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }

        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)
        swift_ts.offset += 1

        swift_client.delete_object.assert_called_once_with(
            self.settings['account'],
            self.settings['container'],
            row['name'],
            acceptable_statuses=(2, 404, 409),
            headers={'X-Timestamp': Timestamp(swift_ts).internal})
Example #16
    def container_sync_row(self, row, sync_to, user_key, broker, info, realm,
                           realm_key):
        """
        Sends the update the row indicates to the sync_to container.
        Update can be either delete or put.

        :param row: The updated row in the local database triggering the sync
                    update.
        :param sync_to: The URL to the remote container.
        :param user_key: The X-Container-Sync-Key to use when sending requests
                         to the other container.
        :param broker: The local container database broker.
        :param info: The get_info result from the local container database
                     broker.
        :param realm: The realm from self.realms_conf, if there is one.
            If None, fallback to using the older allowed_sync_hosts
            way of syncing.
        :param realm_key: The realm key from self.realms_conf, if there
            is one. If None, fallback to using the older
            allowed_sync_hosts way of syncing.
        :returns: True on success
        """
        try:
            start_time = time()
            # extract last modified time from the created_at value
            ts_data, ts_ctype, ts_meta = decode_timestamps(row['created_at'])
            if row['deleted']:
                # when sync'ing a deleted object, use ts_data - this is the
                # timestamp of the source tombstone
                try:
                    headers = {'x-timestamp': ts_data.internal}
                    self._update_sync_to_headers(row['name'], sync_to,
                                                 user_key, realm, realm_key,
                                                 'DELETE', headers)
                    delete_object(sync_to,
                                  name=row['name'],
                                  headers=headers,
                                  proxy=self.select_http_proxy(),
                                  logger=self.logger,
                                  timeout=self.conn_timeout)
                except ClientException as err:
                    if err.http_status != HTTP_NOT_FOUND:
                        raise
                self.container_deletes += 1
                self.container_stats['deletes'] += 1
                self.logger.increment('deletes')
                self.logger.timing_since('deletes.timing', start_time)
            else:
                # when sync'ing a live object, use ts_meta - this is the time
                # at which the source object was last modified by a PUT or POST
                if self._object_in_remote_container(row['name'], sync_to,
                                                    user_key, realm, realm_key,
                                                    ts_meta):
                    return True
                exc = None
                # look for the newest one; the symlink=get query-string has
                # no effect unless symlinks are enabled in the internal client
                # in which case it ensures that symlink objects retain their
                # symlink property when sync'd.
                headers_out = {
                    'X-Newest': True,
                    'X-Backend-Storage-Policy-Index':
                        str(info['storage_policy_index'])
                }
                try:
                    source_obj_status, headers, body = \
                        self.swift.get_object(info['account'],
                                              info['container'], row['name'],
                                              headers=headers_out,
                                              acceptable_statuses=(2, 4),
                                              params={'symlink': 'get'})

                except (Exception, UnexpectedResponse, Timeout) as err:
                    headers = {}
                    body = None
                    exc = err
                timestamp = Timestamp(headers.get('x-timestamp', 0))
                if timestamp < ts_meta:
                    if exc:
                        raise exc
                    raise Exception(
                        _('Unknown exception trying to GET: '
                          '%(account)r %(container)r %(object)r'), {
                              'account': info['account'],
                              'container': info['container'],
                              'object': row['name']
                          })
                for key in ('date', 'last-modified'):
                    if key in headers:
                        del headers[key]
                if 'etag' in headers:
                    headers['etag'] = normalize_etag(headers['etag'])
                if 'content-type' in headers:
                    headers['content-type'] = clean_content_type(
                        headers['content-type'])
                self._update_sync_to_headers(row['name'], sync_to, user_key,
                                             realm, realm_key, 'PUT', headers)
                put_object(sync_to,
                           name=row['name'],
                           headers=headers,
                           contents=FileLikeIter(body),
                           proxy=self.select_http_proxy(),
                           logger=self.logger,
                           timeout=self.conn_timeout)
                self.container_puts += 1
                self.container_stats['puts'] += 1
                self.container_stats['bytes'] += row['size']
                self.logger.increment('puts')
                self.logger.timing_since('puts.timing', start_time)
        except ClientException as err:
            if err.http_status == HTTP_UNAUTHORIZED:
                self.logger.info(
                    _('Unauth %(sync_from)r => %(sync_to)r'), {
                        'sync_from':
                        '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                        'sync_to':
                        sync_to
                    })
            elif err.http_status == HTTP_NOT_FOUND:
                self.logger.info(
                    _('Not found %(sync_from)r => %(sync_to)r \
                      - object %(obj_name)r'), {
                        'sync_from':
                        '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                        'sync_to':
                        sync_to,
                        'obj_name':
                        row['name']
                    })
            else:
                self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'), {
                    'db_file': str(broker),
                    'row': row
                })
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        except (Exception, Timeout) as err:
            self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'), {
                'db_file': str(broker),
                'row': row
            })
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        return True
Example #17
    def _upload_object(self, src_container, dst_container, key,
                       internal_client, segment=False, policy_index=None,
                       timestamp=None, stats_cb=None):
        req_hdrs = {}
        if policy_index is not None:
            req_hdrs['X-Backend-Storage-Policy-Index'] = policy_index
        try:
            with self.client_pool.get_client() as swift_client:
                remote_meta = swift_client.head_object(
                    dst_container, key, headers=self._client_headers())
        except swiftclient.exceptions.ClientException as e:
            if e.http_status == 404:
                remote_meta = None
            else:
                raise

        try:
            metadata = internal_client.get_object_metadata(
                self.account, src_container, key,
                headers=req_hdrs)
        except UnexpectedResponse as e:
            if '404 Not Found' in e.message:
                return self.UploadStatus.NOT_FOUND
            raise

        if not segment:
            _, _, internal_timestamp = decode_timestamps(timestamp)
            if float(metadata['x-timestamp']) <\
                    float(internal_timestamp.internal):
                raise RetryError('Stale object %s' % key)

        if not segment and not match_item(metadata, self.selection_criteria):
            self.logger.debug(
                'Not archiving %s as metadata does not match: %s %s' % (
                    key, metadata, self.selection_criteria))
            return self.UploadStatus.SKIPPED_METADATA

        if check_slo(metadata):
            if segment:
                self.logger.warning(
                    'Nested SLOs are not currently supported. Failing to '
                    'upload: %s/%s/%s' % (self.account, src_container, key))
                return self.UploadStatus.SKIPPED_NESTED_SLO

            if remote_meta and self._check_slo_uploaded(
                    key, remote_meta, internal_client, req_hdrs):
                if not self._is_meta_synced(metadata, remote_meta):
                    # TODO: Update segments' X-Delete-At headers if
                    # remote_delete_after is applied/updated/removed.
                    self.update_metadata(key, metadata,
                                         remote_metadata=remote_meta,
                                         bucket=dst_container)
                    return self.UploadStatus.POST
                return self.UploadStatus.NOOP
            return self._upload_slo(key, internal_client, req_hdrs,
                                    stats_cb=stats_cb)

        dlo_prefix = get_dlo_prefix(metadata)
        if not segment and dlo_prefix:
            # TODO: we should be able to consolidate checking of uploaded
            # objects before getting into the specifics of uploading large
            # objects or regular objects.
            if remote_meta and self._check_dlo_uploaded(metadata, remote_meta,
                                                        internal_client):
                if not self._is_meta_synced(metadata, remote_meta):
                    self.update_metadata(key, metadata,
                                         remote_metadata=remote_meta,
                                         bucket=dst_container)
                    # TODO: Update segments' X-Delete-At headers if
                    # remote_delete_after is applied/updated/removed.
                    return self.UploadStatus.POST
                return self.UploadStatus.NOOP
            return self._upload_dlo(key, internal_client, metadata, req_hdrs,
                                    stats_cb=stats_cb)

        if remote_meta and metadata['etag'] == remote_meta['etag']:
            if not self._is_meta_synced(
                    metadata, remote_meta, segment=segment):
                self.update_metadata(key, metadata,
                                     remote_metadata=remote_meta,
                                     bucket=dst_container,
                                     segment=segment)
                return self.UploadStatus.POST
            return self.UploadStatus.NOOP

        body = FileWrapper(
            internal_client, self.account, src_container, key, req_hdrs,
            stats_cb=stats_cb)
        headers = self._get_user_headers(
            body.get_headers(), segment=segment)
        self.logger.debug('Uploading %s with meta: %r' % (
            key, headers))

        try:
            resp = self.put_object(
                key, self._client_headers(headers), body,
                bucket=dst_container,
                etag=body.get_headers()['etag'],
                content_length=len(body))
            if not resp.success:
                resp.reraise()
        finally:
            body.close()
        return self.UploadStatus.PUT
Example #18
def update_new_item_from_existing(new_item, existing):
    """
    Compare the data and meta related timestamps of a new object item with
    the timestamps of an existing object record, and update the new item
    with data and/or meta related attributes from the existing record if
    their timestamps are newer.

    The multiple timestamps are encoded into a single string for storing
    in the 'created_at' column of the objects db table.

    :param new_item: A dict of object update attributes
    :param existing: A dict of existing object attributes
    :return: True if any attributes of the new item dict were found to be
             newer than the existing and therefore not updated, otherwise
             False implying that the updated item is equal to the existing.
    """

    # item[created_at] may be updated so keep a copy of the original
    # value in case we process this item again
    new_item.setdefault('data_timestamp', new_item['created_at'])

    # content-type and metadata timestamps may be encoded in
    # item[created_at], or may be set explicitly.
    item_ts_data, item_ts_ctype, item_ts_meta = decode_timestamps(
        new_item['data_timestamp'])

    if new_item.get('ctype_timestamp'):
        item_ts_ctype = Timestamp(new_item.get('ctype_timestamp'))
        item_ts_meta = item_ts_ctype
    if new_item.get('meta_timestamp'):
        item_ts_meta = Timestamp(new_item.get('meta_timestamp'))

    if not existing:
        # encode new_item timestamps into one string for db record
        new_item['created_at'] = encode_timestamps(item_ts_data, item_ts_ctype,
                                                   item_ts_meta)
        return True

    # decode existing timestamp into separate data, content-type and
    # metadata timestamps
    rec_ts_data, rec_ts_ctype, rec_ts_meta = decode_timestamps(
        existing['created_at'])

    # Extract any swift_bytes values from the content_type values. This is
    # necessary because the swift_bytes value to persist should be that at the
    # most recent data timestamp whereas the content-type value to persist is
    # that at the most recent content-type timestamp. The two values happen to
    # be stored in the same database column for historical reasons.
    for item in (new_item, existing):
        content_type, swift_bytes = extract_swift_bytes(item['content_type'])
        item['content_type'] = content_type
        item['swift_bytes'] = swift_bytes

    newer_than_existing = [True, True, True]
    if rec_ts_data >= item_ts_data:
        # apply data attributes from existing record
        new_item.update([(k, existing[k])
                         for k in ('size', 'etag', 'deleted', 'swift_bytes')])
        item_ts_data = rec_ts_data
        newer_than_existing[0] = False
    if rec_ts_ctype >= item_ts_ctype:
        # apply content-type attribute from existing record
        new_item['content_type'] = existing['content_type']
        item_ts_ctype = rec_ts_ctype
        newer_than_existing[1] = False
    if rec_ts_meta >= item_ts_meta:
        # apply metadata timestamp from existing record
        item_ts_meta = rec_ts_meta
        newer_than_existing[2] = False

    # encode updated timestamps into one string for db record
    new_item['created_at'] = encode_timestamps(item_ts_data, item_ts_ctype,
                                               item_ts_meta)

    # append the most recent swift_bytes onto the most recent content_type in
    # new_item and restore existing to its original state
    for item in (new_item, existing):
        if item['swift_bytes']:
            item['content_type'] += ';swift_bytes=%s' % item['swift_bytes']
        del item['swift_bytes']

    return any(newer_than_existing)
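
A round-trip sketch of the encoding used for the 'created_at' column, assuming Swift's swift.common.utils: encode_timestamps stores the newer content-type and metadata times as hex offsets appended to the data timestamp, and decode_timestamps recovers all three values.

from swift.common.utils import Timestamp, decode_timestamps, encode_timestamps

ts_data = Timestamp(1500000000.0)
ts_ctype = Timestamp(1500000001.0)  # content-type updated one second later
ts_meta = Timestamp(1500000002.0)   # metadata updated one second after that

encoded = encode_timestamps(ts_data, ts_ctype, ts_meta)
print(encoded)  # something like '1500000000.00000+186a0+186a0'

decoded = decode_timestamps(encoded)
assert [t.internal for t in decoded] == [ts_data.internal,
                                         ts_ctype.internal,
                                         ts_meta.internal]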
Example #19
    def container_sync_row(self, row, sync_to, user_key, broker, info, realm,
                           realm_key):
        """
        Sends the update the row indicates to the sync_to container.

        :param row: The updated row in the local database triggering the sync
                    update.
        :param sync_to: The URL to the remote container.
        :param user_key: The X-Container-Sync-Key to use when sending requests
                         to the other container.
        :param broker: The local container database broker.
        :param info: The get_info result from the local container database
                     broker.
        :param realm: The realm from self.realms_conf, if there is one.
            If None, fallback to using the older allowed_sync_hosts
            way of syncing.
        :param realm_key: The realm key from self.realms_conf, if there
            is one. If None, fallback to using the older
            allowed_sync_hosts way of syncing.
        :returns: True on success
        """
        try:
            start_time = time()
            # extract last modified time from the created_at value
            ts_data, ts_ctype, ts_meta = decode_timestamps(row['created_at'])
            if row['deleted']:
                # when sync'ing a deleted object, use ts_data - this is the
                # timestamp of the source tombstone
                try:
                    headers = {'x-timestamp': ts_data.internal}
                    if realm and realm_key:
                        nonce = uuid.uuid4().hex
                        path = urlparse(sync_to).path + '/' + quote(
                            row['name'])
                        sig = self.realms_conf.get_sig('DELETE', path,
                                                       headers['x-timestamp'],
                                                       nonce, realm_key,
                                                       user_key)
                        headers['x-container-sync-auth'] = '%s %s %s' % (
                            realm, nonce, sig)
                    else:
                        headers['x-container-sync-key'] = user_key
                    delete_object(sync_to,
                                  name=row['name'],
                                  headers=headers,
                                  proxy=self.select_http_proxy(),
                                  logger=self.logger,
                                  timeout=self.conn_timeout)
                except ClientException as err:
                    if err.http_status != HTTP_NOT_FOUND:
                        raise
                self.container_deletes += 1
                self.logger.increment('deletes')
                self.logger.timing_since('deletes.timing', start_time)
            else:
                # when sync'ing a live object, use ts_meta - this is the time
                # at which the source object was last modified by a PUT or POST
                part, nodes = \
                    self.get_object_ring(info['storage_policy_index']). \
                    get_nodes(info['account'], info['container'],
                              row['name'])
                shuffle(nodes)
                exc = None
                # look for the newest one
                headers_out = {
                    'X-Newest': True,
                    'X-Backend-Storage-Policy-Index':
                        str(info['storage_policy_index'])
                }
                try:
                    source_obj_status, headers, body = \
                        self.swift.get_object(info['account'],
                                              info['container'], row['name'],
                                              headers=headers_out,
                                              acceptable_statuses=(2, 4))

                except (Exception, UnexpectedResponse, Timeout) as err:
                    headers = {}
                    body = None
                    exc = err
                timestamp = Timestamp(headers.get('x-timestamp', 0))
                if timestamp < ts_meta:
                    if exc:
                        raise exc
                    raise Exception(
                        _('Unknown exception trying to GET: '
                          '%(account)r %(container)r %(object)r'), {
                              'account': info['account'],
                              'container': info['container'],
                              'object': row['name']
                          })
                for key in ('date', 'last-modified'):
                    if key in headers:
                        del headers[key]
                if 'etag' in headers:
                    headers['etag'] = headers['etag'].strip('"')
                if 'content-type' in headers:
                    headers['content-type'] = clean_content_type(
                        headers['content-type'])
                if realm and realm_key:
                    nonce = uuid.uuid4().hex
                    path = urlparse(sync_to).path + '/' + quote(row['name'])
                    sig = self.realms_conf.get_sig('PUT', path,
                                                   headers['x-timestamp'],
                                                   nonce, realm_key, user_key)
                    headers['x-container-sync-auth'] = '%s %s %s' % (
                        realm, nonce, sig)
                else:
                    headers['x-container-sync-key'] = user_key
                put_object(sync_to,
                           name=row['name'],
                           headers=headers,
                           contents=FileLikeIter(body),
                           proxy=self.select_http_proxy(),
                           logger=self.logger,
                           timeout=self.conn_timeout)
                self.container_puts += 1
                self.logger.increment('puts')
                self.logger.timing_since('puts.timing', start_time)
        except ClientException as err:
            if err.http_status == HTTP_UNAUTHORIZED:
                self.logger.info(
                    _('Unauth %(sync_from)r => %(sync_to)r'), {
                        'sync_from':
                        '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                        'sync_to':
                        sync_to
                    })
            elif err.http_status == HTTP_NOT_FOUND:
                self.logger.info(
                    _('Not found %(sync_from)r => %(sync_to)r \
                      - object %(obj_name)r'), {
                        'sync_from':
                        '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                        'sync_to':
                        sync_to,
                        'obj_name':
                        row['name']
                    })
            else:
                self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'), {
                    'db_file': str(broker),
                    'row': row
                })
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        except (Exception, Timeout) as err:
            self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'), {
                'db_file': str(broker),
                'row': row
            })
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        return True
Example #20
    def container_sync_row(self, row, sync_to, user_key, broker, info,
                           realm, realm_key):
        """
        Sends the update the row indicates to the sync_to container.

        :param row: The updated row in the local database triggering the sync
                    update.
        :param sync_to: The URL to the remote container.
        :param user_key: The X-Container-Sync-Key to use when sending requests
                         to the other container.
        :param broker: The local container database broker.
        :param info: The get_info result from the local container database
                     broker.
        :param realm: The realm from self.realms_conf, if there is one.
            If None, fallback to using the older allowed_sync_hosts
            way of syncing.
        :param realm_key: The realm key from self.realms_conf, if there
            is one. If None, fallback to using the older
            allowed_sync_hosts way of syncing.
        :returns: True on success
        """
        try:
            start_time = time()
            # extract last modified time from the created_at value
            ts_data, ts_ctype, ts_meta = decode_timestamps(
                row['created_at'])
            if row['deleted']:
                # when sync'ing a deleted object, use ts_data - this is the
                # timestamp of the source tombstone
                try:
                    headers = {'x-timestamp': ts_data.internal}
                    if realm and realm_key:
                        nonce = uuid.uuid4().hex
                        path = urlparse(sync_to).path + '/' + quote(
                            row['name'])
                        sig = self.realms_conf.get_sig(
                            'DELETE', path, headers['x-timestamp'], nonce,
                            realm_key, user_key)
                        headers['x-container-sync-auth'] = '%s %s %s' % (
                            realm, nonce, sig)
                    else:
                        headers['x-container-sync-key'] = user_key
                    delete_object(sync_to, name=row['name'], headers=headers,
                                  proxy=self.select_http_proxy(),
                                  logger=self.logger,
                                  timeout=self.conn_timeout)
                except ClientException as err:
                    if err.http_status != HTTP_NOT_FOUND:
                        raise
                self.container_deletes += 1
                self.logger.increment('deletes')
                self.logger.timing_since('deletes.timing', start_time)
            else:
                # when sync'ing a live object, use ts_meta - this is the time
                # at which the source object was last modified by a PUT or POST
                exc = None
                # look for the newest one
                headers_out = {'X-Newest': True,
                               'X-Backend-Storage-Policy-Index':
                               str(info['storage_policy_index'])}
                try:
                    source_obj_status, headers, body = \
                        self.swift.get_object(info['account'],
                                              info['container'], row['name'],
                                              headers=headers_out,
                                              acceptable_statuses=(2, 4))

                except (Exception, UnexpectedResponse, Timeout) as err:
                    headers = {}
                    body = None
                    exc = err
                timestamp = Timestamp(headers.get('x-timestamp', 0))
                if timestamp < ts_meta:
                    if exc:
                        raise exc
                    raise Exception(
                        _('Unknown exception trying to GET: '
                          '%(account)r %(container)r %(object)r'),
                        {'account': info['account'],
                         'container': info['container'],
                         'object': row['name']})
                for key in ('date', 'last-modified'):
                    if key in headers:
                        del headers[key]
                if 'etag' in headers:
                    headers['etag'] = headers['etag'].strip('"')
                if 'content-type' in headers:
                    headers['content-type'] = clean_content_type(
                        headers['content-type'])
                if realm and realm_key:
                    nonce = uuid.uuid4().hex
                    path = urlparse(sync_to).path + '/' + quote(row['name'])
                    sig = self.realms_conf.get_sig(
                        'PUT', path, headers['x-timestamp'], nonce, realm_key,
                        user_key)
                    headers['x-container-sync-auth'] = '%s %s %s' % (
                        realm, nonce, sig)
                else:
                    headers['x-container-sync-key'] = user_key
                put_object(sync_to, name=row['name'], headers=headers,
                           contents=FileLikeIter(body),
                           proxy=self.select_http_proxy(), logger=self.logger,
                           timeout=self.conn_timeout)
                self.container_puts += 1
                self.logger.increment('puts')
                self.logger.timing_since('puts.timing', start_time)
        except ClientException as err:
            if err.http_status == HTTP_UNAUTHORIZED:
                self.logger.info(
                    _('Unauth %(sync_from)r => %(sync_to)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to})
            elif err.http_status == HTTP_NOT_FOUND:
                self.logger.info(
                    _('Not found %(sync_from)r => %(sync_to)r \
                      - object %(obj_name)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to, 'obj_name': row['name']})
            else:
                self.logger.exception(
                    _('ERROR Syncing %(db_file)s %(row)s'),
                    {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        except (Exception, Timeout) as err:
            self.logger.exception(
                _('ERROR Syncing %(db_file)s %(row)s'),
                {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        return True
Example #21
    def upload_slo(self, row, s3_meta, internal_client, upload_stats_cb=None):
        # Converts an SLO into a multipart upload. We use the segments as-is
        # for the part sizes.
        # NOTE: If the SLO segment is < 5MB and is not the last segment, the
        # UploadPart call will fail. We need to stitch segments together in
        # that case.
        #
        # For Google Cloud Storage, we will convert the SLO into a single
        # object put, assuming the SLO is < 5TB. If the SLO is > 5TB, we have
        # to fail the upload. With GCS _compose_, we could support larger
        # objects, but defer this work for the time being.
        swift_req_hdrs = {
            'X-Backend-Storage-Policy-Index': row['storage_policy_index']}
        swift_key = row['name']
        status, headers, body = internal_client.get_object(
            self.account, self.container, swift_key, headers=swift_req_hdrs)
        if status != 200:
            body.close()
            raise RuntimeError('Failed to get the manifest')
        manifest = json.loads(''.join(body))
        body.close()
        _, _, metadata_timestamp = decode_timestamps(row['created_at'])
        if float(headers['x-timestamp']) < metadata_timestamp.timestamp:
            raise RetryError('Stale object %s' % row['name'])
        self.logger.debug("JSON manifest: %s" % str(manifest))
        s3_key = self.get_s3_name(swift_key)

        if not self._validate_slo_manifest(manifest):
            # We do not raise an exception here -- we should not retry these
            # errors and they will be logged.
            # TODO: When we report statistics, we need to account for permanent
            # failures.
            self.logger.error('Failed to validate the SLO manifest for %s' %
                              self._full_name(swift_key))
            return self.UploadStatus.INVALID_SLO

        if self._google():
            if s3_meta:
                slo_etag = s3_meta['Metadata'].get(SLO_ETAG_FIELD, None)
                if slo_etag == headers['etag']:
                    if self.is_object_meta_synced(s3_meta, headers):
                        return self.UploadStatus.NOOP
                    self.update_metadata(swift_key, headers)
                    return self.UploadStatus.POST
            self._upload_google_slo(manifest, headers, s3_key, internal_client,
                                    upload_stats_cb)
        else:
            expected_etag = get_slo_etag(manifest)

            if s3_meta and self.check_etag(expected_etag, s3_meta['ETag']):
                if self.is_object_meta_synced(s3_meta, headers):
                    return self.UploadStatus.NOOP
                elif not self.in_glacier(s3_meta):
                    self.update_slo_metadata(headers, manifest, s3_key,
                                             swift_req_hdrs, internal_client)
                    return self.UploadStatus.POST
            self._upload_slo(manifest, headers, s3_key, internal_client,
                             upload_stats_cb)

        with self.client_pool.get_client() as s3_client:
            # We upload the manifest so that we can restore the object in
            # Swift and have it match the S3 multipart ETag. To avoid name
            # length issues, we hash the object name and append the suffix
            params = dict(
                Bucket=self.aws_bucket,
                Key=self.get_manifest_name(s3_key),
                Body=json.dumps(manifest),
                ContentLength=len(json.dumps(manifest)),
                ContentType='application/json')
            if self._is_amazon() and self.encryption:
                params['ServerSideEncryption'] = 'AES256'
            s3_client.put_object(**params)
            return self.UploadStatus.PUT
Example #22
def update_new_item_from_existing(new_item, existing):
    """
    Compare the data and meta related timestamps of a new object item with
    the timestamps of an existing object record, and update the new item
    with data and/or meta related attributes from the existing record if
    their timestamps are newer.

    The multiple timestamps are encoded into a single string for storing
    in the 'created_at' column of the objects db table.

    :param new_item: A dict of object update attributes
    :param existing: A dict of existing object attributes
    :return: True if any attributes of the new item dict were found to be
             newer than the existing and therefore not updated, otherwise
             False implying that the updated item is equal to the existing.
    """

    # item[created_at] may be updated so keep a copy of the original
    # value in case we process this item again
    new_item.setdefault('data_timestamp', new_item['created_at'])

    # content-type and metadata timestamps may be encoded in
    # item[created_at], or may be set explicitly.
    item_ts_data, item_ts_ctype, item_ts_meta = decode_timestamps(
        new_item['data_timestamp'])

    if new_item.get('ctype_timestamp'):
        item_ts_ctype = Timestamp(new_item.get('ctype_timestamp'))
        item_ts_meta = item_ts_ctype
    if new_item.get('meta_timestamp'):
        item_ts_meta = Timestamp(new_item.get('meta_timestamp'))

    if not existing:
        # encode new_item timestamps into one string for db record
        new_item['created_at'] = encode_timestamps(
            item_ts_data, item_ts_ctype, item_ts_meta)
        return True

    # decode existing timestamp into separate data, content-type and
    # metadata timestamps
    rec_ts_data, rec_ts_ctype, rec_ts_meta = decode_timestamps(
        existing['created_at'])

    # Extract any swift_bytes values from the content_type values. This is
    # necessary because the swift_bytes value to persist should be that at the
    # most recent data timestamp whereas the content-type value to persist is
    # that at the most recent content-type timestamp. The two values happen to
    # be stored in the same database column for historical reasons.
    for item in (new_item, existing):
        content_type, swift_bytes = extract_swift_bytes(item['content_type'])
        item['content_type'] = content_type
        item['swift_bytes'] = swift_bytes

    newer_than_existing = [True, True, True]
    if rec_ts_data >= item_ts_data:
        # apply data attributes from existing record
        new_item.update([(k, existing[k])
                         for k in ('size', 'etag', 'deleted', 'swift_bytes')])
        item_ts_data = rec_ts_data
        newer_than_existing[0] = False
    if rec_ts_ctype >= item_ts_ctype:
        # apply content-type attribute from existing record
        new_item['content_type'] = existing['content_type']
        item_ts_ctype = rec_ts_ctype
        newer_than_existing[1] = False
    if rec_ts_meta >= item_ts_meta:
        # apply metadata timestamp from existing record
        item_ts_meta = rec_ts_meta
        newer_than_existing[2] = False

    # encode updated timestamps into one string for db record
    new_item['created_at'] = encode_timestamps(
        item_ts_data, item_ts_ctype, item_ts_meta)

    # append the most recent swift_bytes onto the most recent content_type in
    # new_item and restore existing to its original state
    for item in (new_item, existing):
        if item['swift_bytes']:
            item['content_type'] += ';swift_bytes=%s' % item['swift_bytes']
        del item['swift_bytes']

    return any(newer_than_existing)
Example #23
 def _get_last_modified_date(row):
     ts, content, meta = decode_timestamps(row['created_at'])
     # NOTE: the meta timestamp will always be latest, as it will be updated
     # when content type is updated
     return meta
Example #24
    def upload_object(self, row, internal_client, upload_stats_cb=None):
        swift_key = row['name']
        s3_key = self.get_s3_name(swift_key)
        try:
            with self.client_pool.get_client() as s3_client:
                s3_meta = s3_client.head_object(Bucket=self.aws_bucket,
                                                Key=s3_key)
        except botocore.exceptions.ClientError as e:
            resp_meta = e.response.get('ResponseMetadata', {})
            if resp_meta.get('HTTPStatusCode', 0) == 404:
                s3_meta = None
            else:
                raise e
        swift_req_hdrs = {
            'X-Backend-Storage-Policy-Index': row['storage_policy_index']}

        try:
            metadata = internal_client.get_object_metadata(
                self.account, self.container, swift_key,
                headers=swift_req_hdrs)
        except UnexpectedResponse as e:
            if '404 Not Found' in e.message:
                return self.UploadStatus.NOT_FOUND
            raise
        _, _, metadata_timestamp = decode_timestamps(row['created_at'])
        if float(metadata['x-timestamp']) < metadata_timestamp.timestamp:
            raise RetryError('Stale object %s' % row['name'])

        if not match_item(metadata, self.selection_criteria):
            self.logger.debug(
                'Not archiving %s as metadata does not match: %s %s' % (
                    swift_key, metadata, self.selection_criteria))
            return self.UploadStatus.SKIPPED_METADATA

        self.logger.debug("Metadata: %s" % str(metadata))
        if check_slo(metadata):
            return self.upload_slo(row, s3_meta, internal_client,
                                   upload_stats_cb)

        if s3_meta and self.check_etag(metadata['etag'], s3_meta['ETag']):
            if self.is_object_meta_synced(s3_meta, metadata):
                return self.UploadStatus.NOOP
            elif not self.in_glacier(s3_meta):
                self.update_metadata(swift_key, metadata)
                return self.UploadStatus.POST

        with self.client_pool.get_client() as s3_client:
            wrapper_stream = FileWrapper(internal_client,
                                         self.account,
                                         self.container,
                                         swift_key,
                                         swift_req_hdrs,
                                         stats_cb=upload_stats_cb)
            self.logger.debug('Uploading %s with meta: %r' % (
                s3_key, wrapper_stream.get_s3_headers()))

            params = dict(
                Bucket=self.aws_bucket,
                Key=s3_key,
                Body=wrapper_stream,
                Metadata=wrapper_stream.get_s3_headers(),
                ContentLength=len(wrapper_stream),
                ContentMD5=base64.b64encode(
                    wrapper_stream.get_headers()['etag'].decode('hex')),
                ContentType=metadata['content-type']
            )
            if self._is_amazon() and self.encryption:
                params['ServerSideEncryption'] = 'AES256'
            try:
                s3_client.put_object(**params)
            finally:
                wrapper_stream.close()
        return self.UploadStatus.PUT
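
A sketch of the staleness guard used in the uploaders above, assuming Swift's swift.common.utils; the values are made up. If the object server still reports an older x-timestamp than the container row's decoded metadata timestamp, the row describes state that has not reached the object yet, so the callers raise RetryError and try the row again later instead of uploading stale data.

from swift.common.utils import Timestamp, decode_timestamps, encode_timestamps

# Made-up row: the object was written at t0 and POSTed one second later.
t0 = Timestamp(1500000000.0)
row_created_at = encode_timestamps(t0, t0, Timestamp(1500000001.0))
object_x_timestamp = t0.internal  # what GET/HEAD currently returns

_, _, meta_ts = decode_timestamps(row_created_at)
if float(object_x_timestamp) < meta_ts.timestamp:
    # the row is ahead of the object server; the providers above raise
    # RetryError('Stale object ...') here so the row is retried later
    print('stale -- would raise RetryError and retry the row')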