import pickle

import numpy as np

# NOTE: the storage helpers used below (hset_object, hget_object_or_wait,
# hlist_keys, hdelete_keys) are assumed to be provided by the project's
# key-value storage module and imported alongside this file.


def delete_expired_merged(endpoint, bucket_name, cur_epoch, cur_batch, prefix="merged"):
    # delete the merged files of all finished batches up to (cur_epoch, cur_batch);
    # the key format follows reduce_batch below: bucketName_prefix_epoch_batch
    candidate = []
    for epoch in range(cur_epoch + 1):
        for batch in range(cur_batch):
            candidate.append("{}_{}_{}_{}".format(bucket_name, prefix, epoch, batch))
    hdelete_keys(endpoint, candidate)

def merge_w_b_grads(endpoint, bucket_name, num_workers, dtype, w_shape, b_shape,
                    w_grad_prefix="w_grad_", b_grad_prefix="b_grad_"):
    # poll the storage until the weight/bias gradients of all workers have
    # arrived, sum them up, and return the averaged gradients
    num_w_files = 0
    num_b_files = 0
    w_grad_sum = np.zeros(w_shape, dtype=dtype)
    b_grad_sum = np.zeros(b_shape, dtype=dtype)

    # full key names of the gradient files uploaded by the workers
    w_candidate = [bucket_name + "_" + w_grad_prefix + str(i) for i in range(num_workers)]
    b_candidate = [bucket_name + "_" + b_grad_prefix + str(i) for i in range(num_workers)]
    candidate = w_candidate + b_candidate

    while num_w_files < num_workers or num_b_files < num_workers:
        file_keys = []
        objects = hlist_keys(endpoint, candidate)
        if objects is not None:
            for file_key, value in objects.items():
                file_keys.append(file_key)
                bytes_data = np.frombuffer(value, dtype=dtype)
                if file_key.startswith(bucket_name + "_" + w_grad_prefix):
                    w_grad_sum = w_grad_sum + bytes_data.reshape(w_shape)
                    num_w_files += 1
                elif file_key.startswith(bucket_name + "_" + b_grad_prefix):
                    b_grad_sum = b_grad_sum + bytes_data.reshape(b_shape)
                    num_b_files += 1
            # the processed keys are deleted at the merge stage
            hdelete_keys(endpoint, file_keys)

    return w_grad_sum / num_workers, b_grad_sum / num_workers

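# Illustrative sketch (assumption, not part of the original module): the
# upload step that merge_w_b_grads expects each worker to have performed.
# `w_grad` and `b_grad` stand in for the gradients computed locally; the key
# format matches the candidates polled above (bucketName_prefix_workerID).
def _example_put_w_b_grads(endpoint, bucket_name, worker_index, w_grad, b_grad,
                           w_grad_prefix="w_grad_", b_grad_prefix="b_grad_"):
    hset_object(endpoint, bucket_name, w_grad_prefix + str(worker_index), w_grad.tobytes())
    hset_object(endpoint, bucket_name, b_grad_prefix + str(worker_index), b_grad.tobytes())
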
def delete_expired_w_b_layers(endpoint, bucket_name, cur_epoch, cur_batch, prefix, end):
    # delete the file produced by the previous batch;
    # `end` is the index of the last batch of an epoch
    if cur_batch == 0:
        if cur_epoch == 0:
            return  # nothing has expired yet
        # the previous file belongs to the last batch of the previous epoch
        expired = bucket_name + "_" + prefix + str(cur_epoch - 1) + "_" + str(end)
    else:
        expired = bucket_name + "_" + prefix + str(cur_epoch) + "_" + str(cur_batch - 1)
    hdelete_keys(endpoint, [expired])

def reduce_batch(endpoint, vector, merged_bucket, num_workers, worker_index, postfix):
    # `vector` is expected to be a 1-d numpy array; `postfix` is "epoch_batch"
    vec_shape = vector.shape
    vec_dtype = vector.dtype
    merged_vec = np.zeros(vec_shape, dtype=vec_dtype)

    postfix_splits = postfix.split("_")
    curr_epoch = int(postfix_splits[0])
    curr_batch = int(postfix_splits[1])

    # publish this worker's vector; format of key: workerID_epoch_batch
    key = "{}_{}".format(worker_index, postfix)
    hset_object(endpoint, merged_bucket, key, vector.tobytes())

    if worker_index == 0:
        # the first worker reads and aggregates the vectors of all workers
        candidate = ["{}_{}_{}".format(merged_bucket, w, postfix) for w in range(num_workers)]
        num_files = 0
        while num_files < num_workers:
            file_keys = []
            objects = hlist_keys(endpoint, candidate)
            if objects is not None:
                for file_key, value in objects.items():
                    file_keys.append(file_key)
                    bytes_data = np.frombuffer(value, dtype=vec_dtype)
                    merged_vec += bytes_data.reshape(vec_shape)
                    num_files += 1
                # the processed keys are deleted at the merge stage
                hdelete_keys(endpoint, file_keys)
        # write the merged vector back; note that it holds the element-wise
        # sum over all workers, not the average
        hset_object(endpoint, merged_bucket, 'merged_' + postfix, merged_vec.tobytes())
        delete_expired_merged(endpoint, merged_bucket, curr_epoch, curr_batch, "merged")
    else:
        # the other workers wait for the merged vector to appear
        merged_file_name = 'merged_' + postfix
        merged_data = hget_object_or_wait(endpoint, merged_bucket, merged_file_name, 0.01)
        merged_vec = np.frombuffer(merged_data, dtype=vec_dtype).reshape(vec_shape)

    return merged_vec

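# Illustrative usage sketch (assumption, not part of the original module):
# one synchronous training step per worker. `compute_gradient` is a
# hypothetical placeholder for the local gradient computation.
def _example_training_step(endpoint, merged_bucket, num_workers, worker_index,
                           epoch, batch, compute_gradient):
    grad = compute_gradient()               # 1-d numpy array, same dtype/shape on every worker
    postfix = "{}_{}".format(epoch, batch)  # key suffix: epoch_batch
    summed = reduce_batch(endpoint, grad, merged_bucket, num_workers,
                          worker_index, postfix)
    return summed / num_workers             # reduce_batch returns the sum, so average here
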
def delete_expired_w_b_grads(endpoint, bucket_name, cur_epoch, cur_batch, end,
                             w_prefix="w_grad_", b_prefix="b_grad_"):
    # delete the gradient files of the previous batch and of the last batch of
    # the previous epoch; the keys carry the bucket prefix, consistent with
    # delete_expired_w_b_layers above
    w_expired = [
        bucket_name + "_" + w_prefix + str(cur_epoch) + "_" + str(cur_batch - 1),
        bucket_name + "_" + w_prefix + str(cur_epoch - 1) + "_" + str(end - 1)
    ]
    b_expired = [
        bucket_name + "_" + b_prefix + str(cur_epoch) + "_" + str(cur_batch - 1),
        bucket_name + "_" + b_prefix + str(cur_epoch - 1) + "_" + str(end - 1)
    ]
    hdelete_keys(endpoint, w_expired + b_expired)

def merge_w_b_layers(endpoint, bucket_name, num_workers, prefix):
    # merge per-layer values (weight/bias gradients or model parameters)
    # uploaded by all workers; each file is a pickled list of numpy arrays
    num_files = 0
    merged_value = []

    # full key names of the files uploaded by the workers
    candidate = [bucket_name + "_" + prefix + str(i) for i in range(num_workers)]

    while num_files < num_workers:
        objects = hlist_keys(endpoint, candidate)
        while objects is not None:
            for file_key, value in objects.items():
                # stop polling for this key once it has been processed
                candidate.remove(file_key)
                data = pickle.loads(value)
                for i in range(len(data)):
                    # lazily allocate the accumulators on the first file
                    if num_files == 0:
                        merged_value.append(np.zeros(data[i].shape, dtype=data[i].dtype))
                    merged_value[i] = merged_value[i] + data[i]
                num_files += 1
                hdelete_keys(endpoint, [file_key])
            objects = hlist_keys(endpoint, candidate)

    # average the weights
    if prefix == 'w_':
        merged_value = [value / float(num_workers) for value in merged_value]
    return merged_value

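# Illustrative sketch (assumption, not part of the original module): the
# upload step that merge_w_b_layers expects. Each worker pickles its list of
# per-layer numpy arrays and stores it under bucketName_prefix_workerID.
def _example_put_layers(endpoint, bucket_name, worker_index, layers, prefix="w_"):
    # `layers` is a list of numpy arrays, one entry per layer
    hset_object(endpoint, bucket_name, prefix + str(worker_index), pickle.dumps(layers))
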
def reduce_scatter_epoch(endpoint, vector, merged_bucket, num_workers, myrank, postfix):
    # all-reduce in two phases (scatter-reduce, then all-gather);
    # `vector` is expected to be a 1-d numpy array
    num_all_values = vector.size
    num_values_per_worker = num_all_values // num_workers
    residue = num_all_values % num_workers

    my_offset = (num_values_per_worker * myrank) + min(residue, myrank)
    my_length = num_values_per_worker + (1 if myrank < residue else 0)
    my_chunk = vector[my_offset:my_offset + my_length]

    # write the partitioned vector to the shared storage, except the chunk
    # this worker is in charge of
    for i in range(num_workers):
        if i != myrank:
            offset = (num_values_per_worker * i) + min(residue, i)
            length = num_values_per_worker + (1 if i < residue else 0)
            # format of key in the tmp bucket: chunkID_workerID_epoch
            key = "{}_{}".format(i, myrank)
            hset_object(endpoint, merged_bucket, key + '_' + postfix,
                        vector[offset:offset + length].tobytes())

    # read and aggregate the chunk this worker is in charge of
    candidate = []
    for worker_index in range(num_workers):
        if worker_index != myrank:
            candidate.append("{}_{}_{}_{}".format(merged_bucket, myrank,
                                                  worker_index, postfix))
    num_files = 0
    while num_files < num_workers - 1:
        objects = hlist_keys(endpoint, candidate)
        if objects is not None:
            file_keys = []
            for key, value in objects.items():
                file_keys.append(key)
                bytes_data = np.frombuffer(value, dtype=vector.dtype)
                my_chunk = my_chunk + bytes_data
                num_files += 1
            # the processed keys are deleted at the merge stage
            hdelete_keys(endpoint, file_keys)

    # write the aggregated chunk back; format of key in merged_bucket: chunkID_epoch
    hset_object(endpoint, merged_bucket, str(myrank) + '_' + postfix, my_chunk.tobytes())

    # read the chunks aggregated by the other workers
    merged_value = {myrank: my_chunk}
    candidate = []
    for worker_index in range(num_workers):
        if worker_index != myrank:
            candidate.append("{}_{}_{}".format(merged_bucket, worker_index, postfix))
    num_merged_files = 0
    already_read = []
    while num_merged_files < num_workers - 1:
        objects = hlist_keys(endpoint, candidate)
        if objects is not None:
            for key, value in objects.items():
                # skip keys that a later poll lists again
                if key in already_read:
                    continue
                bytes_data = np.frombuffer(value, dtype=vector.dtype)
                # key format: bucketName_chunkID_epoch (this parsing assumes
                # the bucket name itself contains no underscore)
                key_splits = key.split("_")
                merged_value[int(key_splits[1])] = bytes_data
                already_read.append(key)
                num_merged_files += 1

    # reconstruct the whole vector from the per-worker chunks
    result = merged_value[0]
    for k in range(1, num_workers):
        result = np.concatenate((result, merged_value[k]))
    return result

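# Illustrative usage sketch (assumption, not part of the original module):
# every worker calls reduce_scatter_epoch with a same-length 1-d vector and
# its own rank, and each one returns the full element-wise sum.
def _example_allreduce_epoch(endpoint, merged_bucket, num_workers, myrank, epoch, vector):
    summed = reduce_scatter_epoch(endpoint, vector, merged_bucket,
                                  num_workers, myrank, str(epoch))
    return summed / num_workers            # divide for a mean, if one is wanted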