import pickle
import urllib.parse

import numpy as np

# NOTE: list_bucket_objects, get_object, put_object, and delete_object are assumed
# to be provided elsewhere in the project (e.g., an S3/storage helper module).


def merge_all_workers(bucket_name, num_workers, prefix):
    num_files = 0
    # merged_value = np.zeros(dshape, dtype=dtype)
    merged_value = []

    while num_files < num_workers:
        objects = list_bucket_objects(bucket_name)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                data_bytes = get_object(bucket_name, file_key).read()
                data = pickle.loads(data_bytes)

                for i in range(len(data)):
                    if num_files == 0:
                        merged_value.append(np.zeros(data[i].shape, dtype=data[i].dtype))
                    merged_value[i] = merged_value[i] + data[i]

                num_files = num_files + 1
                delete_object(bucket_name, file_key)

    # average weights
    # if prefix == 'w_':
    merged_value = [value / float(num_workers) for value in merged_value]

    return merged_value

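# Usage sketch (illustrative, not part of the original module): a merger would call
# merge_all_workers once every worker has uploaded its pickled list of arrays.
# The bucket name and worker count below are hypothetical.
#
#   merged_weights = merge_all_workers("tmp-weights-bucket", num_workers=4, prefix="w_")
#   # merged_weights is a list of numpy arrays (one per tensor), averaged over workers
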
def delete_expired_merged(bucket_name, cur_epoch, cur_batch):
    objects = list_bucket_objects(bucket_name)
    if objects is not None:
        for obj in objects:
            file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
            key_splits = file_key.split("_")
            key_batch = int(key_splits[-1])
            key_epoch = int(key_splits[-2])
            if key_epoch < cur_epoch or (key_epoch == cur_epoch and key_batch < cur_batch):
                # print("delete object {} in bucket {}".format(file_key, bucket_name))
                delete_object(bucket_name, file_key)

def merge_w_b(bucket_name, num_workers, dtype, w_shape, b_shape,
              w_prefix="tmp_w_", b_prefix="tmp_b_"):
    num_w_files = 0
    num_b_files = 0
    w_files = []
    b_files = []
    w_sum = np.zeros(w_shape, dtype=dtype)
    b_sum = np.zeros(b_shape, dtype=dtype)

    while num_w_files < num_workers or num_b_files < num_workers:
        objects = list_bucket_objects(bucket_name)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                # print("found file {} in bucket {}".format(obj, bucket_name))
                if file_key.startswith(w_prefix):
                    data = get_object(bucket_name, file_key).read()
                    bytes_data = np.frombuffer(data, dtype=dtype)
                    w_files.append(file_key)
                    w_grad = bytes_data.reshape(w_shape)
                    # print("merge the {}-th weight grad {} in bucket {} = {}".format(
                    #     num_w_files, file_key, bucket_name, w_grad[0][:5]))
                    w_sum = w_sum + w_grad
                    num_w_files = num_w_files + 1
                    # keep the delete inside this branch, so keys with other prefixes are not removed
                    delete_object(bucket_name, file_key)
                elif file_key.startswith(b_prefix):
                    data = get_object(bucket_name, file_key).read()
                    bytes_data = np.frombuffer(data, dtype=dtype)
                    b_files.append(file_key)
                    b_grad = bytes_data.reshape(b_shape)
                    # print("merge the {}-th bias grad {} in bucket {} = {}".format(
                    #     num_b_files, file_key, bucket_name, b_grad))
                    b_sum = b_sum + b_grad
                    num_b_files = num_b_files + 1
                    # keep the delete inside this branch, so keys with other prefixes are not removed
                    delete_object(bucket_name, file_key)

    # print("found {} w files: {}".format(len(w_files), w_files))
    # print("found {} b files: {}".format(len(b_files), b_files))

    return w_sum / float(num_workers), b_sum / float(num_workers)

def delete_expired_w_b_by_epoch(bucket_name, cur_epoch, w_prefix="w_", b_prefix="b_"):
    objects = list_bucket_objects(bucket_name)
    if objects is not None:
        for obj in objects:
            file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
            if file_key.startswith(w_prefix) or file_key.startswith(b_prefix):
                key_splits = file_key.split("_")
                key_epoch = int(key_splits[-1])
                if key_epoch < cur_epoch:
                    print("delete object {} in bucket {}".format(file_key, bucket_name))
                    delete_object(bucket_name, file_key)

def merge_w_b_grads(bucket_name, num_workers, dtype, w_shape, b_shape,
                    w_grad_prefix="w_grad_", b_grad_prefix="b_grad"):
    num_w_files = 0
    num_b_files = 0
    w_grad_sum = np.zeros(w_shape, dtype=dtype)
    b_grad_sum = np.zeros(b_shape, dtype=dtype)

    while num_w_files < num_workers or num_b_files < num_workers:
        objects = list_bucket_objects(bucket_name)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                data = get_object(bucket_name, file_key).read()
                bytes_data = np.frombuffer(data, dtype=dtype)
                if file_key.startswith(w_grad_prefix):
                    w_grad = bytes_data.reshape(w_shape)
                    print("merge the {}-th weight grad {} in bucket {} = {}".format(
                        num_w_files, file_key, bucket_name, w_grad))
                    w_grad_sum = w_grad_sum + w_grad
                    num_w_files = num_w_files + 1
                elif file_key.startswith(b_grad_prefix):
                    b_grad = bytes_data.reshape(b_shape)
                    print("merge the {}-th bias grad {} in bucket {} = {}".format(
                        num_b_files, file_key, bucket_name, b_grad))
                    b_grad_sum = b_grad_sum + b_grad
                    num_b_files = num_b_files + 1
                delete_object(bucket_name, file_key)

    return w_grad_sum / float(num_workers), b_grad_sum / float(num_workers)

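# Usage sketch (illustrative, not part of the original module): a merger could average
# the weight/bias gradients that workers uploaded as raw numpy bytes. The bucket name,
# dtype, and shapes below are assumptions for a small linear model.
#
#   w_avg, b_avg = merge_w_b_grads("tmp-grads-bucket", num_workers=4, dtype=np.float32,
#                                  w_shape=(10, 784), b_shape=(10,))
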
def merge_np_bytes(bucket_name, num_workers, dtype, shape):
    num_files = 0
    sum_arr = np.zeros(shape, dtype=dtype)

    while num_files < num_workers:
        objects = list_bucket_objects(bucket_name)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                print('file in bucket {} = {}'.format(bucket_name, file_key))
                data = get_object(bucket_name, file_key).read()
                tmp_arr = np.frombuffer(data, dtype=dtype).reshape(shape)
                print("the {}-th numpy array".format(num_files))
                print(tmp_arr)
                sum_arr = sum_arr + tmp_arr
                num_files = num_files + 1
                delete_object(bucket_name, file_key)
        else:
            # Didn't get any keys
            print('No objects in {}'.format(bucket_name))

    return sum_arr

def scatter_reduce(vector, tmp_bucket, merged_bucket, num_workers, myrank, postfix):
    # vector is supposed to be a 1-d numpy array
    num_all_values = vector.size
    num_values_per_worker = num_all_values // num_workers
    residue = num_all_values % num_workers
    curr_epoch = postfix.split("_")[0]
    curr_batch = postfix.split("_")[1]

    my_offset = (num_values_per_worker * myrank) + min(residue, myrank)
    my_length = num_values_per_worker + (1 if myrank < residue else 0)
    my_chunk = vector[my_offset:my_offset + my_length]

    # write the partitioned vector to shared storage, except the chunk this worker is responsible for
    for i in range(num_workers):
        if i != myrank:
            offset = (num_values_per_worker * i) + min(residue, i)
            length = num_values_per_worker + (1 if i < residue else 0)
            # indicate the chunk number and which worker it comes from
            key = "{}_{}".format(i, myrank)
            # format of key in tmp-bucket: chunkID_workerID_epoch_batch
            put_object(tmp_bucket, key + '_' + postfix, vector[offset:offset + length].tobytes())

    # read and aggregate the chunk this worker is responsible for
    num_files = 0
    while num_files < num_workers - 1:
        objects = list_bucket_objects(tmp_bucket)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                key_splits = file_key.split("_")
                # take it only if it is my chunk and it comes from the current step
                # format of key in tmp-bucket: chunkID_workerID_epoch_batch
                if key_splits[0] == str(myrank) and key_splits[2] == curr_epoch \
                        and key_splits[3] == curr_batch:
                    data = get_object(tmp_bucket, file_key).read()
                    bytes_data = np.frombuffer(data, dtype=vector.dtype)
                    my_chunk = my_chunk + bytes_data
                    num_files += 1
                    delete_object(tmp_bucket, file_key)

    # write the aggregated chunk back
    # key format in merged_bucket: chunkID_epoch_batch
    put_object(merged_bucket, str(myrank) + '_' + postfix, my_chunk.tobytes())

    # read the chunks aggregated by the other workers
    merged_value = {myrank: my_chunk}

    num_merged_files = 0
    already_read = []
    while num_merged_files < num_workers - 1:
        objects = list_bucket_objects(merged_bucket)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                key_splits = file_key.split("_")
                # key format in merged_bucket: chunkID_epoch_batch
                if key_splits[0] != str(myrank) and key_splits[1] == curr_epoch \
                        and key_splits[2] == curr_batch and file_key not in already_read:
                    data = get_object(merged_bucket, file_key).read()
                    bytes_data = np.frombuffer(data, dtype=vector.dtype)
                    merged_value[int(key_splits[0])] = bytes_data
                    already_read.append(file_key)
                    num_merged_files += 1

    # reconstruct the whole vector
    result = merged_value[0]
    for k in range(1, num_workers):
        result = np.concatenate((result, merged_value[k]))

    return result

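# Usage sketch (illustrative, not part of the original module): each worker flattens
# its local gradient, runs scatter_reduce against the shared buckets, and reshapes the
# result. Bucket names are hypothetical; note that scatter_reduce returns the summed
# vector, so the caller averages by dividing by num_workers.
#
#   flat_grad = local_grad.astype(np.float32).flatten()
#   postfix = "{}_{}".format(epoch, batch)   # key suffix: epoch_batch
#   summed = scatter_reduce(flat_grad, "tmp-bucket", "merged-bucket",
#                           num_workers, myrank, postfix)
#   avg_grad = (summed / num_workers).reshape(local_grad.shape)
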
def reduce_scatter_batch_multi_bucket(vector, tmp_bucket_prefix, merged_bucket_prefix,
                                      num_buckets, num_workers, myrank, postfix):
    # vector is supposed to be a 1-d numpy array
    num_all_values = vector.size
    num_values_per_worker = num_all_values // num_workers
    residue = num_all_values % num_workers
    curr_epoch = postfix.split("_")[0]
    curr_batch = postfix.split("_")[1]

    my_offset = (num_values_per_worker * myrank) + min(residue, myrank)
    my_length = num_values_per_worker + (1 if myrank < residue else 0)
    my_chunk = vector[my_offset:my_offset + my_length]

    # write the partitioned vector to shared storage, except the chunk this worker is responsible for
    for i in range(num_workers):
        if i != myrank:
            offset = (num_values_per_worker * i) + min(residue, i)
            length = num_values_per_worker + (1 if i < residue else 0)
            # indicate the chunk number and which worker it comes from
            key = "{}_{}".format(i, myrank)
            tmp_bucket_ind = i % num_buckets
            tmp_bucket = "{}-{}".format(tmp_bucket_prefix, tmp_bucket_ind)
            # format of key in tmp-bucket: chunkID_workerID_epoch_batch
            put_object(tmp_bucket, key + '_' + postfix, vector[offset:offset + length].tobytes())

    # read and aggregate the chunk this worker is responsible for
    num_files = 0
    tmp_bucket_ind = myrank % num_buckets
    tmp_bucket = "{}-{}".format(tmp_bucket_prefix, tmp_bucket_ind)
    print("worker [{}] read and aggregate the corresponding chunks in bucket {}".format(myrank, tmp_bucket))
    while num_files < num_workers - 1:
        objects = list_bucket_objects(tmp_bucket)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                key_splits = file_key.split("_")
                # take it only if it is my chunk and it comes from the current step
                # format of key in tmp-bucket: chunkID_workerID_epoch_batch
                if key_splits[0] == str(myrank) and key_splits[2] == curr_epoch \
                        and key_splits[3] == curr_batch:
                    print("get obj = {}".format(file_key))
                    data = get_object(tmp_bucket, file_key).read()
                    bytes_data = np.frombuffer(data, dtype=vector.dtype)
                    my_chunk = my_chunk + bytes_data
                    num_files += 1
                    delete_object(tmp_bucket, file_key)

    merged_bucket_ind = myrank % num_buckets
    my_merged_bucket = "{}-{}".format(merged_bucket_prefix, merged_bucket_ind)
    # write the aggregated chunk back
    # key format in merged_bucket: chunkID_epoch_batch
    put_object(my_merged_bucket, str(myrank) + '_' + postfix, my_chunk.tobytes())

    # read the chunks aggregated by the other workers
    merged_value = {myrank: my_chunk}

    # number of merged chunks expected in each merged bucket
    bucket_num_objs = []
    if num_workers % num_buckets == 0:
        bucket_num_objs = [num_workers // num_buckets for _ in range(num_buckets)]
    else:
        # the first (num_workers % num_buckets) buckets hold one extra chunk
        for i in range(num_workers % num_buckets):
            bucket_num_objs.append(num_workers // num_buckets + 1)
        for i in range(num_workers % num_buckets, num_buckets):
            bucket_num_objs.append(num_workers // num_buckets)

    # do not count the chunk this worker is responsible for
    bucket_num_objs[myrank % num_buckets] -= 1
    print("bucket num objs = {}".format(bucket_num_objs))

    num_merged_files = 0
    already_read = []
    bucket_num_merged = [0 for _ in range(num_buckets)]

    while num_merged_files < num_workers - 1:
        for i in range(num_buckets):
            if bucket_num_merged[i] < bucket_num_objs[i]:
                merged_bucket = "{}-{}".format(merged_bucket_prefix, i)
                objects = list_bucket_objects(merged_bucket)
                if objects is not None:
                    for obj in objects:
                        file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
                        key_splits = file_key.split("_")
                        # key format in merged_bucket: chunkID_epoch_batch
                        if key_splits[0] != str(myrank) and key_splits[1] == curr_epoch \
                                and key_splits[2] == curr_batch and file_key not in already_read:
                            print("merge obj = {}".format(file_key))
                            data = get_object(merged_bucket, file_key).read()
                            bytes_data = np.frombuffer(data, dtype=vector.dtype)
                            merged_value[int(key_splits[0])] = bytes_data
                            already_read.append(file_key)
                            bucket_num_merged[i] += 1
                            num_merged_files += 1

    # reconstruct the whole vector
    result = merged_value[0]
    for k in range(1, num_workers):
        result = np.concatenate((result, merged_value[k]))

    return result

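# Usage sketch (illustrative, not part of the original module): the multi-bucket variant
# spreads tmp/merged objects over num_buckets buckets named "<prefix>-<index>" to reduce
# request pressure on any single bucket. The prefixes and counts below are hypothetical,
# and the caller again divides the summed result by num_workers to average.
#
#   summed = reduce_scatter_batch_multi_bucket(flat_grad, "tmp", "merged",
#                                              num_buckets=4, num_workers=8,
#                                              myrank=myrank, postfix=postfix)
#   avg_grad = (summed / 8).reshape(local_grad.shape)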