def prepare(self):
    try:
        self.sections = client.get_metadata_sections(self.src_conn)
    except HttpError as e:
        log.error('Error listing metadata sections: %s', e)
        raise

    # grab the latest shard markers and timestamps before we sync
    self.shard_info = {}
    self.init_num_shards()
    for shard_num in xrange(self.num_shards):
        info = client.get_log_info(self.src_conn, 'metadata', shard_num)
        # setting an empty marker returns an error
        if info['marker']:
            self.shard_info[shard_num] = info['marker']
        else:
            self.shard_info[shard_num] = ' '

    # bucket the metadata keys by the log shard they map to
    self.metadata_by_shard = {}
    for section in self.sections:
        try:
            for key in client.list_metadata_keys(self.src_conn, section):
                shard = self.shard_num_for_key(section + ':' + key)
                self.metadata_by_shard.setdefault(shard, [])
                self.metadata_by_shard[shard].append((section, key))
        except NotFound:
            # no keys of this type exist
            continue
        except HttpError as e:
            log.error('Error listing metadata for section %s: %s',
                      section, e)
            raise
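# init_num_shards() is called by both prepare() methods but is not shown
# in this section. A minimal sketch, assuming it just caches the shard
# count that sync_full() below fetches via client.num_log_shards, keyed
# on the syncer's type (hypothetical body):
def init_num_shards(self):
    if not hasattr(self, 'num_shards'):
        self.num_shards = client.num_log_shards(self.src_conn, self._type)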
def prepare(self):
    log.info('preparing to do a full data sync')
    self.init_num_shards()

    # save data log markers for each shard
    self.shard_info = {}
    for shard in xrange(self.num_shards):
        info = client.get_log_info(self.src_conn, 'data', shard)
        # setting an empty marker returns an error
        if info['marker']:
            self.shard_info[shard] = info['marker']
        else:
            self.shard_info[shard] = ' '

    # get list of buckets after getting any markers to avoid skipping
    # entries added before we got the marker info
    log.debug('getting bucket list')
    buckets = client.get_bucket_list(self.src_conn)
    self.prepared_at = time.time()

    # bucket names are distributed across shards the same way as keys
    self.buckets_by_shard = {}
    for bucket in buckets:
        shard = self.shard_num_for_key(bucket)
        self.buckets_by_shard.setdefault(shard, [])
        self.buckets_by_shard[shard].append(bucket)
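# shard_num_for_key() maps a metadata key or bucket name to a log shard,
# but its definition is not shown in this section. A minimal sketch,
# assuming a stable hash reduced modulo self.num_shards; note the real
# agent must use the same hash function as the gateway for the shard
# bounds recorded here to line up with the gateway's logs:
def shard_num_for_key(self, key):
    key = key.encode('utf8')
    return hash(key) % self.num_shards  # placeholder hash, see note above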
def full_sync_bucket(self, bucket):
    try:
        instance = self.get_bucket_instance(bucket)
        try:
            marker = client.get_log_info(self.src_conn, 'bucket-index',
                                         instance)['max_marker']
        except client.NotFound:
            marker = ''
        log.debug('bucket instance is "%s" with marker %s', instance, marker)

        # nothing to do for this bucket
        if not marker:
            return True

        objects = client.list_objects_in_bucket(self.src_conn, bucket)
        if not objects:
            return True
    except Exception as e:
        log.error('error preparing for full sync of bucket "%s": %s',
                  bucket, e)
        return False

    retries = self.sync_bucket(bucket, objects)
    result = self.set_bound(instance, marker, retries, 'bucket-index')
    return not retries and result == RESULT_SUCCESS
def full_sync_bucket(self, bucket):
    try:
        instance = self.get_bucket_instance(bucket)
        try:
            marker = client.get_log_info(self.src_conn, 'bucket-index',
                                         instance)['max_marker']
        except NotFound:
            marker = ' '
        log.debug('bucket instance is "%s" with marker %s', instance, marker)
        objects = client.list_objects_in_bucket(self.src_conn, bucket)
        retries = self.sync_bucket(bucket, objects)
        result = self.set_bound(instance, marker, retries, 'bucket-index')
        return not retries and result == RESULT_SUCCESS
    except BucketEmpty:
        log.debug('no objects in bucket %s', bucket)
        return True
    except Exception:
        log.exception('error preparing for full sync of bucket "%s"', bucket)
        return False
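# Hypothetical usage sketch (not part of the agent as shown): draining
# the per-shard bucket lists built by prepare() through
# full_sync_bucket(), collecting buckets that need a retry.
def _sync_all_buckets_sketch(self):
    failed = []
    for shard, buckets in self.buckets_by_shard.iteritems():
        for bucket in buckets:
            if not self.full_sync_bucket(bucket):
                failed.append(bucket)
    return failed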
def sync_full(self, num_workers, log_lock_time):
    try:
        sections = client.get_metadata_sections(self.src_conn)
    except client.HttpError as e:
        log.error('Error listing metadata sections: %s', e)
        raise

    # grab the latest shard markers and timestamps before we sync
    shard_info = []
    num_shards = client.num_log_shards(self.src_conn, 'metadata')
    for shard_num in xrange(num_shards):
        info = client.get_log_info(self.src_conn, 'metadata', shard_num)
        # setting an empty marker returns an error
        if info['marker']:
            shard_info.append((shard_num, info['marker'],
                               info['last_update']))

    meta_keys = []
    for section in sections:
        try:
            meta_keys += [(section, key) for key in
                          client.list_metadata_keys(self.src_conn, section)]
        except client.NotFound:
            # no keys of this type exist
            continue
        except client.HttpError as e:
            log.error('Error listing metadata for section %s: %s',
                      section, e)
            raise

    # create the work and results queues
    workQueue = multiprocessing.Queue()
    resultQueue = multiprocessing.Queue()

    # create the worker processes
    if self._type == 'data':
        worker_cls = worker.DataWorkerFull
    else:
        worker_cls = worker.MetadataWorkerFull
    processes = [worker_cls(workQueue, resultQueue, log_lock_time,
                            self.src, self.dest)
                 for i in xrange(num_workers)]
    for process in processes:
        process.daemon = True
        process.start()

    start_time = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%SZ")
    log.info('Starting full sync at %s', start_time)

    # enqueue the items to be synced
    for meta in meta_keys:
        workQueue.put(meta)

    # add a poison pill for each worker
    for i in xrange(num_workers):
        workQueue.put(None)

    # pull the results out as they are produced
    errors = []
    for i in xrange(len(meta_keys)):
        log.info('%d/%d items synced', i, len(meta_keys))
        result, section, name = resultQueue.get()
        if result != worker.RESULT_SUCCESS:
            log.error('error syncing %s %r', section, name)
            errors.append((section, name))
        else:
            log.debug('synced %s %r', section, name)

    for process in processes:
        process.join()

    if errors:
        log.error('Encountered errors syncing these %d entries: %s',
                  len(errors), errors)
    else:
        # only advance the persisted per-shard bounds if everything synced
        for shard_num, marker, timestamp in shard_info:
            client.set_worker_bound(self.src_conn, 'metadata', shard_num,
                                    marker, timestamp, self.daemon_id)
            client.del_worker_bound(self.src_conn, 'metadata', shard_num,
                                    self.daemon_id)
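# Consumer side of the poison-pill pattern used by sync_full() above, as
# a minimal sketch. The real DataWorkerFull/MetadataWorkerFull classes
# live in the worker module and take more arguments; names below are
# hypothetical.
import multiprocessing

class _WorkerSketch(multiprocessing.Process):
    def __init__(self, work_queue, result_queue):
        super(_WorkerSketch, self).__init__()
        self.work_queue = work_queue
        self.result_queue = result_queue

    def run(self):
        while True:
            item = self.work_queue.get()
            if item is None:
                # poison pill: one is enqueued per worker, so every
                # worker exits exactly once and join() can't hang
                break
            section, name = item
            # ... sync the (section, name) item here ...
            self.result_queue.put((worker.RESULT_SUCCESS, section, name))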