import urllib.parse

import numpy as np

# S3 helpers such as list_bucket_objects, get_object, get_object_or_wait,
# put_object, delete_objects and delete_expired_merged_batch are assumed to be
# provided elsewhere in the project's S3 utility module.


def clear_bucket(bucket_name):
    # Delete every object currently stored in the bucket.
    objects = list_bucket_objects(bucket_name)
    if objects is not None:
        file_names = []
        for obj in objects:
            file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
            file_names.append(file_key)
        if len(file_names) >= 1:
            # print("delete files {} in bucket {}".format(file_names, bucket_name))
            delete_objects(bucket_name, file_names)
    return True
def delete_expired_merged_epoch(bucket_name, cur_epoch):
    # Delete merged files from earlier epochs; keys are expected to end with "_<epoch>".
    objects = list_bucket_objects(bucket_name)
    if objects is not None:
        file_names = []
        for obj in objects:
            file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
            key_splits = file_key.split("_")
            key_epoch = int(key_splits[-1])
            if key_epoch < cur_epoch:
                file_names.append(file_key)
        if len(file_names) >= 1:
            # print("delete files {} in bucket {}".format(file_names, bucket_name))
            delete_objects(bucket_name, file_names)
def reduce_batch(vector, tmp_bucket, merged_bucket, num_workers, worker_index, postfix):
    # vector is expected to be a 1-d numpy array; postfix has the format "epoch_batch".
    vec_shape = vector.shape
    vec_dtype = vector.dtype
    merged_vec = np.zeros(vec_shape, dtype=vec_dtype)

    postfix_splits = postfix.split("_")
    curr_epoch = int(postfix_splits[0])
    curr_batch = int(postfix_splits[1])

    # put object to s3, format of key: workerID_epoch_batch
    key = "{}_{}".format(worker_index, postfix)
    put_object(tmp_bucket, key, vector.tobytes())

    # the first worker reads and aggregates the vectors uploaded for this batch
    if worker_index == 0:
        num_files = 0
        while num_files < num_workers:
            objects = list_bucket_objects(tmp_bucket)
            if objects is not None:
                delete_list = []
                for obj in objects:
                    file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                    key_splits = file_key.split("_")
                    key_epoch = key_splits[1]
                    key_batch = key_splits[2]
                    if key_epoch == str(curr_epoch) and key_batch == str(curr_batch):
                        data = get_object(tmp_bucket, file_key).read()
                        bytes_data = np.frombuffer(data, dtype=vec_dtype)
                        tmp_vec = bytes_data.reshape(vec_shape)
                        merged_vec += tmp_vec
                        num_files += 1
                        delete_list.append(file_key)
                if len(delete_list) >= 1:
                    delete_objects(tmp_bucket, delete_list)
        # write the merged data back to s3 and clean up merged files of earlier batches
        merged_file_name = 'merged_' + postfix
        put_object(merged_bucket, merged_file_name, merged_vec.tobytes())
        delete_expired_merged_batch(merged_bucket, curr_epoch, curr_batch)
    else:
        # other workers poll the merged bucket until the merged vector appears
        merged_file_name = 'merged_' + postfix
        merged_data = get_object_or_wait(merged_bucket, merged_file_name, 0.1).read()
        merged_vec = np.frombuffer(merged_data, dtype=vec_dtype).reshape(vec_shape)

    return merged_vec
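# A minimal usage sketch (an illustration, not part of the original module): how a
# worker might call reduce_batch once per mini-batch to average a local gradient
# across workers. The bucket names, worker count and gradient values below are
# hypothetical placeholders.
def example_reduce_batch_usage():
    num_workers = 2
    worker_index = 0                              # differs per worker in a real run
    epoch, batch = 0, 3
    postfix = "{}_{}".format(epoch, batch)        # key postfix expected by reduce_batch
    local_grad = np.ones(4, dtype=np.float32)     # stand-in for a flattened gradient
    summed = reduce_batch(local_grad, "tmp-bucket", "merged-bucket",
                          num_workers, worker_index, postfix)
    return summed / num_workers                   # reduce_batch returns the element-wise sum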