def bp_recv_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid,
                 comm_rank, world_sz, bs, subbs, pd, input_shp, output_shp,
                 bp_tail_list, shared_cnters, global_step, sta_lidx, end_lidx):
    """Receive backward-pass tensors into ``bp_tail_list`` until the counter is full.

    ``shared_cnters`` slot layout (from the original comment):
    fp_send:0; fp_recv:1; bp_send:2; bp_recv:3.

    The last worker (``wid == wn - 1``) receives nothing: it marks its
    bp_recv counter as complete and returns. Every other worker loops
    forever, receiving one tensor per free slot and sleeping 1 ms whenever
    ``shared_cnters[3]`` has already reached ``int(bs / subbs)``.

    Only ``wid``, ``wn``, ``succ_wid``, ``comm_rank``, ``world_sz``, ``bs``,
    ``subbs``, ``bp_tail_list`` and ``shared_cnters`` are read here; the
    remaining parameters are unused in this function.
    """
    iter_thresh = int(bs / subbs)
    _allreduce_group, _fp_gather_group, bp_scatter_group = init_processes(
        comm_rank, world_sz)
    print("bp_recv_proc comm_rank=", comm_rank)

    if wid == wn - 1:
        shared_cnters[3] = iter_thresh
        return

    # Rank layout is hard-coded: 4 communication ranks per worker, offset 2
    # addresses the successor's sending process.
    src_rank = succ_wid * 4 + 2
    while True:
        if not (shared_cnters[3] < iter_thresh):
            # All slots of this iteration are filled; wait for a reset.
            time.sleep(0.001)
            continue

        if wid == 2:
            dist.recv(tensor=bp_tail_list[shared_cnters[3]], src=src_rank)
        elif wid in (0, 1):
            dist.scatter(tensor=bp_tail_list[shared_cnters[3]],
                         scatter_list=[],
                         src=src_rank,
                         group=bp_scatter_group,
                         async_op=False)
        shared_cnters[3] += 1
def train_model(model, train_loader, optimizer, criterion, epoch, rank):
    """Train ``model`` over ``train_loader``, averaging gradients with gather/scatter.

    model (torch.nn.module): The model created to train
    train_loader (pytorch data loader): Training data loader
    optimizer (optimizer.*): A instance of some sort of optimizer, usually SGD
    criterion (nn.CrossEntropyLoss) : Loss function used to train the network
    epoch (int): Current epoch number (not read in this function)
    rank (int): Rank of the calling process; rank 0 is the aggregation root

    After every backward pass each parameter gradient is gathered on rank 0,
    averaged there, and scattered back so every rank steps with the same
    gradient. ``torch.distributed`` requires the destination rank of
    ``gather`` to pass ``gather_list`` and the source rank of ``scatter`` to
    pass ``scatter_list``, so rank 0 takes a separate branch.

    ``device`` is read from the enclosing module scope.
    """
    # Hard-coded 4-process group, as in the original implementation.
    group = dist.new_group([0, 1, 2, 3])
    group_size = dist.get_world_size(group=group)
    is_root = rank == 0

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        train_loss = criterion(output, target)
        train_loss.backward()

        for p in model.parameters():
            if is_root:
                gathered = [torch.empty_like(p.grad) for _ in range(group_size)]
                dist.gather(p.grad, gather_list=gathered, dst=0,
                            group=group, async_op=False)
                mean_grad = torch.stack(gathered).mean(dim=0)
                # The scatter list is only read, so sharing one tensor is fine.
                dist.scatter(p.grad, scatter_list=[mean_grad] * group_size,
                             src=0, group=group, async_op=False)
            else:
                dist.gather(p.grad, dst=0, group=group, async_op=False)
                dist.scatter(p.grad, src=0, group=group, async_op=False)
        optimizer.step()

        if batch_idx % 20 == 0:
            print(batch_idx, "loss: ", train_loss.item())
        if batch_idx == 0:
            # Start the clock once the first (warm-up) batch has finished.
            now = datetime.now()
        if batch_idx == 10:
            later = datetime.now()
            # NOTE(review): the window covers batches 1..10 but the divisor
            # is 9; kept as in the original -- confirm the intended count.
            print("average time: ", (later - now).total_seconds() / 9)
def _test_scatter_helper(self, group, group_id, rank):
    """Check ``dist.scatter`` with every member of ``group`` acting as source.

    For each source rank the receive tensor is built with fill argument
    ``-1`` and, after the scatter, must equal the tensor built with this
    process's ``rank``. Only the source passes a non-empty scatter list.
    A barrier is issued once all sources have been exercised.
    """
    for src_rank in group:
        size_arg = src_rank + 1
        received = _build_tensor(size_arg, -1)
        expected = _build_tensor(size_arg, rank)
        if rank == src_rank:
            outgoing = [_build_tensor(size_arg, member) for member in group]
        else:
            outgoing = []
        dist.scatter(received, src=src_rank, scatter_list=outgoing,
                     group=group_id)
        self.assertEqual(received, expected)

    self._barrier()
def _scattered(self, scatter_list, target_shape, target_type=torch.float32):
    """Run a scatter rooted at rank 0 and return the tensor received locally.

    Args:
        scatter_list: Tensors to send; forwarded unchanged to ``dist.scatter``.
        target_shape: Shape of the local receive buffer.
        target_type: dtype of the local receive buffer.

    Returns:
        The freshly allocated buffer after ``dist.scatter`` has filled it.
    """
    received = torch.empty(target_shape, dtype=target_type)
    dist.scatter(
        received,
        scatter_list=scatter_list,
        src=0,
        group=self.process_group,
    )
    return received
def forward(ctx, src, group, *tensors):
    """Scatter ``tensors`` from rank ``src`` and return this rank's piece.

    ``src`` and ``group`` are stored on ``ctx``. All tensors must share one
    size (asserted). Only the source rank passes the tensors as the scatter
    list; every other rank passes ``None``.
    """
    ctx.src = src
    ctx.group = group
    assert all(t.size() == tensors[0].size() for t in tensors)

    output = torch.zeros_like(tensors[0])
    is_source = dist.get_rank(group=group) == src
    payload = list(tensors) if is_source else None
    dist.scatter(output, payload, src, group=group)
    return output
def _load_teaching_signal(dataset, indices):
    """Read the selected training rows and concatenate data and labels on dim 1.

    The file is opened once and closed before returning.
    """
    with tables.open_file(dataset) as h5file:
        local_input = h5file.root.train.data[:][indices]
        local_output = h5file.root.train.label[:][indices]
    return torch.cat((torch.FloatTensor(local_input),
                      torch.FloatTensor(local_output)), dim=1)


def distribute_samples(nodes, rank, dataset, eta, epochs):
    """
    The master node (rank 0) randomly chooses and transmits samples indices to each device for training.
    Upon reception of their assigned samples, the nodes create their training dataset

    Args:
        nodes: Process group used for the scatter.
        rank: Rank of the calling process; 0 is the master.
        dataset: Path passed to ``tables.open_file``.
        eta: Fraction of a worker's samples drawn from its main class.
        epochs: Number of sample indices sent to each worker.

    Returns:
        Tensor of the selected samples with labels concatenated along dim 1.
        On the master this covers the samples of both workers.

    The scatter list has exactly three entries (master plus two workers), so
    the group is expected to contain three ranks.
    """
    if rank != 0:
        indices_local = torch.zeros([epochs], dtype=torch.int)
        dist.scatter(tensor=indices_local, src=0, scatter_list=[], group=nodes)
        assert torch.sum(indices_local) != 0
        return _load_teaching_signal(dataset, indices_local)

    # Read everything needed from the file in one open/close cycle.
    with tables.open_file(dataset) as h5file:
        n_samples = h5file.root.train.data.shape[0]  # Total number of samples
        labels = torch.FloatTensor(h5file.root.train.label[:])
    print(n_samples)

    # There are 2 classes and 10% of the dataset is kept for testing
    n_samples_train_per_class = int(n_samples / 2 * 0.9)

    # Class of each sample, computed once and reused for both classes.
    label_class = torch.max(torch.sum(labels, dim=-1), dim=-1).indices
    indices_0 = np.asarray(label_class == 0).nonzero()[0][:n_samples_train_per_class]
    indices_1 = np.asarray(label_class == 1).nonzero()[0][:n_samples_train_per_class]
    assert len(indices_0) == len(indices_1)

    n_main_class = math.floor(epochs * eta)
    n_secondary_class = epochs - n_main_class
    assert (n_main_class + n_secondary_class) == epochs

    # Randomly select samples for each worker
    indices_worker_0 = np.hstack((
        np.random.choice(indices_0, [n_main_class], replace=False),
        np.random.choice(indices_1, [n_secondary_class], replace=False)))
    np.random.shuffle(indices_worker_0)

    # A set gives O(1) membership checks; element order is unchanged.
    taken = set(indices_worker_0.tolist())
    remaining_indices_0 = [i for i in indices_0 if i not in taken]
    remaining_indices_1 = [i for i in indices_1 if i not in taken]

    indices_worker_1 = np.hstack((
        np.random.choice(remaining_indices_0, [n_secondary_class], replace=False),
        np.random.choice(remaining_indices_1, [n_main_class], replace=False)))
    np.random.shuffle(indices_worker_1)
    assert len(indices_worker_0) == len(indices_worker_1)

    # Send samples to the workers; slot 0 is a zero placeholder for the master.
    indices = [torch.zeros([epochs], dtype=torch.int),
               torch.IntTensor(indices_worker_0),
               torch.IntTensor(indices_worker_1)]
    indices_local = torch.zeros([epochs], dtype=torch.int)
    dist.scatter(tensor=indices_local, src=0, scatter_list=indices, group=nodes)

    # Save samples sent to the workers at master to evaluate train loss and accuracy later
    indices_local = torch.IntTensor(np.hstack((indices_worker_0, indices_worker_1)))
    return _load_teaching_signal(dataset, indices_local)
def scatter(self, scatter_list, src, size=None):
    """Scatters a list of tensors to all parties.

    On the source rank the local element of ``scatter_list`` is used as the
    receive tensor and returned. On every other rank a ``torch.long`` buffer
    of shape ``size`` is allocated; when ``size`` is None it is taken from
    ``scatter_list[rank]``.
    """
    assert dist.is_initialized(), "initialize the communicator first"

    my_rank = self.get_rank()
    if src == my_rank:
        tensor = scatter_list[my_rank]
        dist.scatter(tensor, scatter_list, src, group=self.main_group)
        return tensor

    if size is None:
        size = scatter_list[my_rank].size()
    tensor = torch.empty(size=size, dtype=torch.long)
    dist.scatter(tensor, [], src, group=self.main_group)
    return tensor
def backward(ctx, *grads):
    """Scatter incoming gradients from rank ``ctx.dst`` to form the input gradient.

    The rank equal to ``ctx.dst`` supplies ``grads`` as the scatter list and
    returns ``(grad_input, None, None, None) + grads``; every other rank
    joins the scatter with an empty list and returns ``grad_input`` followed
    by four ``None`` values.
    """
    (saved_input,) = ctx.saved_tensors
    grad_input = torch.zeros_like(saved_input)

    if dist.get_rank(ctx.group) != ctx.dst:
        dist.scatter(grad_input, [], src=ctx.dst, group=ctx.group)
        return grad_input, None, None, None, None

    dist.scatter(grad_input, list(grads), src=ctx.dst, group=ctx.group)
    return (grad_input, None, None, None) + grads
def run(rank, numProcesses, group):
    """Worker side of a gather-then-scatter exchange rooted at rank 0.

    Sends a one-element tensor holding ``rank`` to rank 0, then receives a
    one-element tensor from rank 0 and prints it. ``numProcesses`` is not
    read here.

    NOTE(review): the original was collapsed onto one line, so the extent of
    the ``for`` loop is ambiguous. The whole exchange is assumed to sit
    inside it (repeated ``rank`` times) -- confirm against the author's copy.
    """
    tensor = torch.ones(1) * rank
    for _ in range(rank):
        # Non-destination ranks pass no gather list.
        dist.gather(tensor=tensor, gather_list=None, dst=0, group=group)

        outputTens = torch.ones(1)
        dist.scatter(tensor=outputTens, scatter_list=None, src=0, group=group)
        print('Rank ', rank, ' has data ', outputTens)
def average_gradients(model, rank):
    """Replace every parameter gradient with its mean over all ranks.

    Rank 0 gathers each gradient from every rank, averages them, and
    scatters the mean back; other ranks only take part in the two
    collectives.

    Args:
        model: Module whose ``parameters()`` all carry a ``.grad`` tensor.
        rank: Rank of the calling process; 0 is the aggregation root.
    """
    world_size = dist.get_world_size()
    for p in model.parameters():
        if rank == 0:
            # empty_like keeps the gradient's dtype and device, so the
            # buffers always match what the other ranks send.
            inputs = [torch.empty_like(p.grad) for _ in range(world_size)]
            dist.gather(p.grad, gather_list=inputs, dst=0)
            avg_grad = torch.mean(torch.stack(inputs), dim=0)
            # The scatter list is only read; one shared tensor is enough.
            dist.scatter(p.grad, scatter_list=[avg_grad] * world_size, src=0)
        else:
            dist.gather(p.grad, dst=0)
            dist.scatter(p.grad, src=0)
def _scatter(rank, rows, columns):
    """Scatter from rank 0 and assert that the local tensor ends up all zeros.

    The source rank sends the list returned by ``_get_zeros_tensors_list``;
    every rank logs its tensor before and after the collective.
    """
    source = 0
    tensor = _get_tensor(rank, rows, columns)

    if rank != source:
        logger.debug('Rank: {},\nTensor BEFORE scatter: {}\n'.format(rank, tensor))
        dist.scatter(tensor=tensor, src=source)
    else:
        tensors_list = _get_zeros_tensors_list(rows, columns)
        logger.debug('Rank: {},\nTensor BEFORE scatter: {}. tensors_list: {}'.format(
            rank, tensor, tensors_list))
        dist.scatter(tensor=tensor, scatter_list=tensors_list)

    logger.debug('Rank: {},\nTensor AFTER scatter: {}\n'.format(rank, tensor))

    expected = _get_zeros_tensor(rows, columns)
    assert torch.equal(tensor, expected), \
        'Rank {}: Tensor should be all zeroes after scatter.'.format(rank)
def run(number, scatter_list, sr):
    """Pad the inputs, scatter them from rank 0, then gather one result per input.

    Every tensor in ``scatter_list`` is zero-padded to the longest length and
    has ``sr`` appended (the list is modified in place). The padded tensors
    are scattered over a new group of ranks ``0..number-1`` and a
    ``(frames, 2048)`` tensor is gathered back for each entry.

    Fixes over the previous revision:
      * a synchronous ``dist.scatter`` returns ``None``; the old code iterated
        that value to poll for completion. The blocking call already waits.
      * the receive buffer was ``torch.zeros(lengths)`` (a tensor whose
        *shape* is the list of lengths); it now matches one scattered tensor.
      * ``dist.gather`` was called without its required ``tensor`` argument.

    Args:
        number: Number of ranks in the group.
        scatter_list: Tensors to distribute, one per rank
            (assumed 1-D -- TODO confirm).
        sr: Tensor concatenated to the end of every padded input.

    Returns:
        List of gathered tensors, one per entry of ``scatter_list``.
    """
    ranks = list(range(number))

    lengths = []
    for clip in scatter_list:
        lengths = lengths + list(clip.size())
    maxlen = max(lengths)
    for idx, length in enumerate(lengths):
        padding = torch.zeros(maxlen - length)
        scatter_list[idx] = torch.cat((scatter_list[idx], padding, sr), 0)

    win_length = 2048
    group = dist.new_group(ranks)
    src = dst = 0

    # Rank 0 also receives its own slice; the buffer must match its shape.
    recv_buffer = torch.zeros_like(scatter_list[0])
    dist.scatter(recv_buffer, scatter_list=scatter_list, src=src, group=group)

    frames_size = librosa.util.frame(scatter_list[0]).shape[0]
    gather_list_ele = torch.zeros(frames_size, win_length)
    gather_list = [gather_list_ele.clone() for _ in scatter_list]
    # NOTE(review): rank 0 contributes an all-zero placeholder as its own
    # entry -- confirm that this is what the caller expects.
    dist.gather(gather_list_ele, gather_list=gather_list, dst=dst, group=group)
    return gather_list
def forward(ctx, a2a_info, *inputs):
    """Start an all-to-all exchange built from one async scatter per rank.

    ``inputs`` are concatenated along dim 1 and split along dim 0 by the
    batch split lengths. For each rank ``i`` an output buffer is allocated
    and an asynchronous scatter with source ``i`` is issued; only the local
    rank supplies a scatter list. The pending requests, the output tensors
    and ``a2a_info`` are stored on the module-level ``myreq``. The requests
    are not waited on here.

    Returns:
        Tuple of the output buffers, one per rank.
    """
    global myreq

    if a2a_info.global_batch_partition_slices:
        batch_split_lengths = a2a_info.global_batch_partition_slices
    else:
        batch_split_lengths = a2a_info.local_batch_num

    if a2a_info.global_table_wise_parition_slices:
        table_split_lengths = a2a_info.global_table_wise_parition_slices
    else:
        table_split_lengths = [a2a_info.local_table_num] * my_size

    merged = torch.cat(inputs, dim=1)
    scatter_list = list(merged.split(batch_split_lengths, dim=0))

    out_tensors = []
    pending = []
    for peer in range(my_size):
        out_shape = [
            a2a_info.local_batch_num,
            table_split_lengths[peer] * a2a_info.emb_dim,
        ]
        out_tensor = merged.new_empty(out_shape)
        payload = scatter_list if peer == my_rank else []
        request = dist.scatter(out_tensor, payload, src=peer, async_op=True)
        out_tensors.append(out_tensor)
        pending.append(request)

    myreq.req = pending
    myreq.tensor = tuple(out_tensors)
    myreq.a2a_info = a2a_info
    ctx.a2a_info = a2a_info
    return myreq.tensor
def forward(ctx, tensor, src, group=dist.group.WORLD, inplace=True, *scatter_list):
    """Scatter ``scatter_list`` from rank ``src`` into ``tensor`` and return it.

    With ``inplace=False`` the result is written into a fresh zero tensor
    shaped like ``tensor`` instead. The source rank saves the scattered
    tensors on ``ctx`` for the backward pass; other ranks pass an empty list.
    """
    ctx.src = src
    ctx.group = group

    out = tensor if inplace else torch.zeros_like(tensor)

    if dist.get_rank(group) != src:
        dist.scatter(out, [], src=src, group=group)
        return out

    ctx.save_for_backward(*scatter_list)
    dist.scatter(out, list(scatter_list), src=src, group=group)
    return out
def forward(ctx, group, *tensors):
    """All-to-all exchange of ``tensors`` within ``group``.

    Allocates one output tensor per rank (shaped like the matching input).
    On the Gloo backend the exchange is emulated with one scatter per source
    rank; on any other backend ``dist.all_to_all`` is used directly.

    Returns:
        Tuple of received tensors, one per rank.
    """
    ctx.group = group
    world_size = dist.get_world_size(group=group)
    out_tensor_list = [torch.empty_like(tensors[i]) for i in range(world_size)]
    my_rank = dist.get_rank(group=group)

    # Implement it on means of scatter/gather, send/recv async operations have issues.
    # Compare by value: ``is`` on a backend string depends on object identity.
    if dist.get_backend(group=group) == dist.Backend.GLOO:
        for i in range(world_size):
            to_send = list(tensors) if i == my_rank else None
            dist.scatter(out_tensor_list[i], to_send, i, group=group)
    else:
        dist.all_to_all(out_tensor_list, list(tensors), group=group)
    return tuple(out_tensor_list)
def communicate(self):
    """Gather learner parameters on the last rank, mix them, and scatter them back.

    Steps:
      1. For every parameter of ``self.gather_list[-1].net``, gather into the
         matching parameter of each entry in ``self.gather_list``
         (destination rank ``world_size - 1``).
      2. Call ``self.mix()`` and, when ``(round_idx - 1)`` is a multiple of
         ``log_freq``, ``self.write_logs()``.
      3. Scatter the matching parameters of ``self.scatter_list`` from rank
         ``world_size - 1``.

    Each learner's parameters are listed once per phase instead of once per
    parameter, which removes the quadratic re-enumeration of the original.
    """
    root = self.world_size - 1

    gather_params = [list(self.gather_list[idx].net.parameters())
                     for idx in range(self.world_size)]
    for ii, param in enumerate(self.gather_list[-1].net.parameters()):
        param_list = [params[ii].data for params in gather_params]
        dist.gather(tensor=param.data, dst=root, gather_list=param_list)

    self.mix()

    if (self.round_idx - 1) % self.log_freq == 0:
        self.write_logs()

    # Built after mix() so any change made there is what gets scattered.
    scatter_params = [list(self.scatter_list[idx].net.parameters())
                      for idx in range(self.world_size)]
    for ii, param in enumerate(self.scatter_list[-1].net.parameters()):
        param_list = [params[ii].data for params in scatter_params]
        dist.scatter(tensor=param.data, src=root, scatter_list=param_list)
def scatter(self, scatter_list, src, size=None, device=None):
    """Scatters a list of tensors to all parties.

    On the source rank every entry is replaced by its ``.data`` and the
    local entry is both the receive tensor and the return value. On other
    ranks a ``torch.long`` buffer is allocated; ``size`` and ``device``
    default to those of ``scatter_list[rank]`` when not given, and a failed
    device lookup leaves ``device`` as None.
    """
    assert dist.is_initialized(), "initialize the communicator first"

    if src == self.get_rank():
        scatter_list = [s.data for s in scatter_list]
        tensor = scatter_list[self.get_rank()]
        dist.scatter(tensor.data, scatter_list, src, group=self.main_group)
        return tensor

    if size is None:
        size = scatter_list[self.get_rank()].size()
    if device is None:
        # Best effort only: any failure falls back to the default device.
        try:
            device = scatter_list[self.get_rank()].device
        except Exception:
            pass
    tensor = torch.empty(size=size, dtype=torch.long, device=device)
    dist.scatter(tensor.data, [], src, group=self.main_group)
    return tensor
def forward(ctx, group, out_tensor_list, *tensors):
    """All-to-all exchange of ``tensors`` into the caller-provided ``out_tensor_list``.

    The input sizes are recorded on ``ctx.input_tensor_size_list`` and the
    inputs are made contiguous. On the Gloo backend the exchange is emulated
    with one scatter per source rank; otherwise ``dist.all_to_all`` is used.

    Returns:
        ``out_tensor_list`` as a tuple.
    """
    ctx.group = group
    world_size = dist.get_world_size(group=group)
    ctx.input_tensor_size_list = [tensors[i].size() for i in range(world_size)]
    my_rank = dist.get_rank(group=group)
    tensors = tuple(t.contiguous() for t in tensors)

    # Implement it on means of scatter/gather, send/recv async operations have issues.
    # Compare by value: ``is`` on a backend string depends on object identity.
    if dist.get_backend(group=group) == dist.Backend.GLOO:
        for i in range(world_size):
            to_send = list(tensors) if i == my_rank else None
            dist.scatter(out_tensor_list[i], to_send, i, group=group)
    else:
        dist.all_to_all(
            out_tensor_list,
            list(tensors),
            group=group,
        )
    return tuple(out_tensor_list)
def main_func(numProcesses, group, src_tensor):
    """Rank-0 side of one gather/scatter round trip; exits the process afterwards.

    Gathers a 15-element tensor from every rank, prints the entries received
    from ranks 1 and 2, scatters the three columns of a random ``(5, 3)``
    tensor, sleeps five seconds and calls ``exit(1)``. ``src_tensor`` is not
    read here.
    """
    while True:
        # Original note: the final element is the length when not padded.
        local_contribution = torch.zeros(15)
        # Every process in the group sends its tensor into this list.
        gather_t = [torch.ones_like(local_contribution)
                    for _ in range(numProcesses)]
        dist.gather(tensor=local_contribution, gather_list=gather_t,
                    dst=0, group=group)

        print('GATHERED DATA')
        for sender in (1, 2):
            print(gather_t[sender][:15])

        payload = torch.rand((5, 3))
        outputTens = torch.rand((5))
        print(payload)

        columns = np.split(payload, 3, axis=1)
        # Cloning is vital: the pieces must not share memory.
        to_scatter = []
        for column in columns:
            to_scatter.append(torch.clone(column).squeeze())

        print('just before scattering: ')
        dist.scatter(tensor=outputTens, scatter_list=to_scatter,
                     src=0, group=group)
        time.sleep(5)
        exit(1)
def distribute_samples(nodes, rank, dataset, eta, num_samples):
    """
    The master node (rank 0) randomly chooses and transmits samples indices to each device for training.
    Upon reception of their assigned samples, the nodes create their training dataset

    Args:
        nodes: Process group used for the scatter.
        rank: Rank of the calling process; 0 is the master.
        dataset: Open dataset object exposing ``root.label``.
        eta: Fraction of a worker's samples drawn from its main class.
        num_samples: Number of sample indices sent to each worker.

    Returns:
        On a worker, the ``num_samples`` indices it received; on the master,
        the indices of both workers concatenated.

    The scatter list has exactly three entries (master plus two workers), so
    the group is expected to contain three ranks.
    """
    if rank != 0:
        indices_local = torch.zeros([num_samples], dtype=torch.int)
        dist.scatter(tensor=indices_local, src=0, scatter_list=[], group=nodes)
        assert torch.sum(indices_local) != 0
        return indices_local

    # Class of each sample, computed once and reused for both classes.
    label_class = torch.max(
        torch.sum(torch.FloatTensor(dataset.root.label[:]), dim=-1), dim=-1).indices
    indices_0 = np.asarray(label_class == 0).nonzero()[0]
    indices_1 = np.asarray(label_class == 1).nonzero()[0]
    assert len(indices_0) == len(indices_1)

    n_main_class = math.floor(num_samples * eta)
    n_secondary_class = num_samples - n_main_class
    assert (n_main_class + n_secondary_class) == num_samples

    # Randomly select samples for each worker
    indices_worker_0 = np.hstack((
        np.random.choice(indices_0, [n_main_class], replace=False),
        np.random.choice(indices_1, [n_secondary_class], replace=False)))
    np.random.shuffle(indices_worker_0)

    # A set gives O(1) membership checks; element order is unchanged.
    taken = set(indices_worker_0.tolist())
    remaining_indices_0 = [i for i in indices_0 if i not in taken]
    remaining_indices_1 = [i for i in indices_1 if i not in taken]

    indices_worker_1 = np.hstack((
        np.random.choice(remaining_indices_0, [n_secondary_class], replace=False),
        np.random.choice(remaining_indices_1, [n_main_class], replace=False)))
    np.random.shuffle(indices_worker_1)
    assert len(indices_worker_0) == len(indices_worker_1)

    # Send samples to the workers; slot 0 is a zero placeholder for the master.
    indices = [torch.zeros([num_samples], dtype=torch.int),
               torch.IntTensor(indices_worker_0),
               torch.IntTensor(indices_worker_1)]
    indices_local = torch.zeros([num_samples], dtype=torch.int)
    dist.scatter(tensor=indices_local, src=0, scatter_list=indices, group=nodes)

    # Save samples sent to the workers at master to evaluate train loss and accuracy later
    indices_local = torch.IntTensor(np.hstack((indices_worker_0, indices_worker_1)))
    return indices_local
def run(rank, numProcesses, group, trg_tensor):
    """Worker loop: repeatedly gather a random tensor to rank 0 and receive a reply.

    Each round sends a random 15-element tensor to rank 0 and receives a
    5-element tensor scattered from rank 0. No exit condition is visible in
    this function, so the loop runs until the process is stopped.
    ``numProcesses`` and ``trg_tensor`` are not read here.
    """
    print('gathering rank: ', rank)
    while True:
        padded_output = torch.rand((15))
        print('Gathering rank: ', rank)
        print(f'rank: {rank}, sending to gather: {padded_output}')
        # Non-destination ranks pass no gather list.
        dist.gather(tensor=padded_output, gather_list=None, dst=0, group=group)
        print('Finished gather: ', rank)

        model_response = torch.rand(5)
        dist.scatter(tensor=model_response, scatter_list=None, src=0,
                     group=group)
        print(f'scatter rank: {rank}, given: {model_response}')
def alltoall_cpu(rank, world_size, output_tensor_list, input_tensor_list):
    """Each process scatters list of input tensors to all processes in a cluster
    and return gathered list of tensors in output list. The tensors should have the same shape.

    The exchange is carried out as ``world_size`` scatters, one per source
    rank; the inputs are moved to the CPU first.

    Parameters
    ----------
    rank : int
        The rank of current worker
    world_size : int
        The size of the entire
    output_tensor_list : List of tensor
        The received tensors
    input_tensor_list : List of tensor
        The tensors to exchange
    """
    cpu_inputs = []
    for tensor in input_tensor_list:
        cpu_inputs.append(tensor.to(th.device('cpu')))

    for src_rank in range(world_size):
        # Only the source of this round provides data to send.
        payload = cpu_inputs if src_rank == rank else []
        dist.scatter(output_tensor_list[src_rank], payload, src=src_rank)
def bp_send_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid,
                 comm_rank, world_sz, bs, subbs, pd, input_shp, output_shp,
                 bp_head_list, shared_cnters, global_step, sta_lidx, end_lidx):
    """Send backward-pass tensors from ``bp_head_list`` to the predecessor worker(s).

    ``shared_cnters`` slot layout (from the original comment):
    fp_send:0; fp_recv:1; bp_send:2; bp_recv:3.

    Workers 0 and 1 have nothing to send: they zero the bp_send counter and
    return. Other workers loop forever: whenever fewer tensors have been
    sent than ``shared_cnters[2]`` announces, worker 3 sends the next tensor
    point-to-point and worker 2 splits it in two and scatters the halves.
    After ``int(bs / subbs)`` sends both counters are reset.

    NOTE(review): the original was collapsed onto one line; the reset check
    is assumed to run on every loop pass rather than only after the sleep.
    """
    iter_thresh = int(bs / subbs)
    _allreduce_group, _fp_gather_group, bp_scatter_group = init_processes(
        comm_rank, world_sz)
    print("bp_send_proc comm_rank=", comm_rank)

    if wid in (0, 1):
        shared_cnters[2] = 0
        return

    sent = 0
    # Rank layout is hard-coded: 4 communication ranks per worker.
    dst_rank = pred_wid * 4 + 3
    scatter_src = 2 * 4 + 2
    place_tensor = torch.zeros(1)

    while True:
        if sent < shared_cnters[2]:
            # hard code
            if wid == 3:
                dist.send(tensor=bp_head_list[sent], dst=dst_rank)
            elif wid == 2:
                halves = list(bp_head_list[sent].chunk(chunks=2, dim=0))
                # The first half doubles as the source's own receive buffer
                # and as the third scatter entry.
                place_tensor = halves[0]
                halves.append(place_tensor)
                dist.scatter(tensor=place_tensor,
                             scatter_list=halves,
                             src=scatter_src,
                             group=bp_scatter_group,
                             async_op=False)
            sent += 1
        else:
            time.sleep(0.001)

        if sent == iter_thresh:
            sent = 0
            shared_cnters[2].zero_()
def scatter(data, src=0, group=None):
    """
    Run scatter on arbitrary picklable data (not necessarily tensors).

    Args:
        data: any picklable object
        src (int): source rank from which to scatter
        group: a torch process group. By default, will use a group which
            contains all ranks on gloo backend.

    Returns:
        data_scattered: the object scattered from src. The source rank (and
        any single-process run) gets ``data`` back unchanged.

    Note:
        The receive buffer is sized from the caller's own serialized
        ``data``, so every rank must pass an object whose serialized length
        equals the source's.
    """
    if get_world_size() == 1:
        return data
    if group is None:
        group = _get_global_gloo_group()
    assert dist.get_world_size(group) == dist.get_world_size()
    if dist.get_world_size(group=group) == 1:
        return data

    rank = dist.get_rank(group=group)
    input_tensor = _serialize_to_tensor(data, group)

    # receiving Tensor from the source ranks
    output_tensor = torch.empty((input_tensor.numel(),),
                                dtype=torch.uint8,
                                device=input_tensor.device)

    if rank == src:
        dist.scatter(tensor=output_tensor,
                     scatter_list=[input_tensor] * get_world_size(),
                     src=src,
                     group=group)
        return data

    dist.scatter(output_tensor, [], src=src, group=group)
    # tobytes() replaces the deprecated ndarray.tostring() alias.
    buffer = output_tensor.cpu().numpy().tobytes()
    # SECURITY: pickle.loads runs arbitrary code; only use between trusted ranks.
    data_scattered = pickle.loads(buffer)
    return data_scattered
def average_gradients(model):
    """Average every parameter gradient over all ranks using gather + scatter.

    Rank 0 (``args.rank == 0``) gathers each gradient from every rank, takes
    the mean and scatters it back, so all ranks end up with the same
    averaged gradient.

    Fixes over the previous revision:
      * ``[tensor] * n`` made every gather slot the same buffer; each slot
        now has its own storage.
      * rank 0 divided only its own gradient and then scattered a zero-filled
        list, overwriting every rank's gradients with zeros; the gathered
        gradients are now summed, divided by the world size, and that mean is
        what gets scattered.

    ``args`` is read from the enclosing module scope.
    """
    size = float(dist.get_world_size())
    world = int(size)
    is_root = args.rank == 0

    for param in model.parameters():
        grad = param.grad.data
        # Only the root supplies lists; other ranks must pass None.
        gather_list, scatter_list = None, None
        if is_root:
            gather_list = [torch.zeros_like(grad) for _ in range(world)]
        dist.gather(tensor=grad, dst=0, gather_list=gather_list)

        if is_root:
            mean_grad = torch.stack(gather_list).sum(dim=0) / size
            # The scatter list is only read; one shared tensor is enough.
            scatter_list = [mean_grad] * world
        dist.scatter(tensor=grad, src=0, scatter_list=scatter_list)
def transfer_gradients(self, grad_update_conv):
    """Transfer the averaged sparse gradients to all nodes for the optimization
    of the model at each node.

    For every entry of ``self.shapes`` rank 0 scatters the matching gradient
    (moved to the CPU) to all ``self.size`` ranks; each rank receives it into
    a zero tensor of that shape and moves it to ``self.device_id``.

    :parameter grad_update_conv : tensor, final averaged sparse gradient tensor
    :return upd_grads : final gradient accessible at each node
    """
    upd_grads = []
    for idx, shape in enumerate(self.shapes):
        updated = torch.zeros(shape)
        if self.rank == 0:
            # Move to the CPU once; the scatter list is only read, so every
            # slot can share the same tensor.
            cpu_grad = grad_update_conv[idx].to('cpu')
            receiver_list = [cpu_grad] * self.size
            dist.scatter(tensor=updated, src=0, scatter_list=receiver_list)
        else:
            dist.scatter(tensor=updated, src=0, scatter_list=[])
        upd_grads.append(updated.cuda(self.device_id))
    return upd_grads
def run(rank, numProcesses, group, maxlen, main_params, trg_tensor):
    """Translate one sentence with MCTS, dump the result, then idle in gather/scatter.

    The results of ``MCTS.translate_sentence`` are written as JSON to
    ``<CODEPATH>MCTSFiles/rank<rank>.json``. Afterwards the worker keeps
    answering the main process's gather/scatter rounds with placeholder
    tensors; no exit condition is visible in this function.
    ``numProcesses`` is not read here.
    """
    mcts = MCTS(tgt_tensor=trg_tensor, group=group, rankInGroup=rank,
                max_len=maxlen, main_params=main_params)
    # ``actions`` lists the actions matching the probabilities in ``mcts_probs``.
    bleu, output_states, mcts_probs, actions = mcts.translate_sentence()

    # write to file
    fileName = f"{globalsFile.CODEPATH}MCTSFiles/rank{rank}.json"
    with open(fileName, 'w') as f:
        json.dump([bleu, output_states, mcts_probs, actions], f)
    print('rank: ', rank, ' is done NOW WAITING FOR REST')

    response_len = 2 * main_params.num_children + 1
    while True:
        # now just gathering and scattering until main exits
        # NOTE(review): zeros times any finite id stays all-zero -- confirm
        # that an all-zero placeholder is what the main process expects.
        padded_output = torch.zeros(maxlen + 1) * globalsFile.BLANK_WORD_ID
        dist.gather(tensor=padded_output, gather_list=None, dst=0,
                    group=group)
        model_response = torch.ones(response_len).double()
        dist.scatter(tensor=model_response, scatter_list=None, src=0,
                     group=group)
def run():
    """Receive a signal from rank 0, bin its features per frame, and gather the result.

    A 1000-element tensor is received via scatter and passed to
    ``mysimpl``, which returns ``(features, num_frames, freqs)``. Each
    feature is read as ``(amplitude, frame number, frequency)``. For every
    frame, amplitudes falling into the same frequency bin are averaged into
    a 2048-wide vector; the stacked per-frame vectors are gathered on rank 0.
    """
    src = dst = 0
    mytensor = torch.zeros(1000)
    dist.scatter(mytensor, src=src)

    # processing
    features, num_frames, freqs = mysimpl(mytensor)

    # frame number -> list of (amplitude, frequency)
    frames_features = {frame: [] for frame in range(num_frames + 1)}
    for feat in features:
        frames_features[int(feat[1])].append((feat[0], feat[2]))

    frame_freq_bins = []
    for frame in range(num_frames + 1):
        freq_bins = np.zeros(2048)
        # bin index -> amplitudes that landed in that bin
        to_be_added = {}
        for amp, freq in frames_features[frame]:
            index_i = np.abs(freqs - freq).argmin()
            # Use the bin whose lower edge is at or below the frequency.
            if freq < freqs[index_i]:
                index_i -= 1
            to_be_added.setdefault(index_i, []).append(amp)

        for bin_idx, amps in to_be_added.items():
            freq_bins[bin_idx] += np.mean(np.array(amps))
        frame_freq_bins.append(freq_bins)

    gathered_input = torch.from_numpy(np.array(frame_freq_bins))
    dist.gather(gathered_input, dst=dst)
    return
def distribute_samples(nodes, rank, args):
    """
    The master node (rank 0) randomly chooses and transmits samples indices to each device for training.
    Upon reception of their assigned samples, the nodes create their training dataset

    The first half of ``args.labels`` feeds worker 0 and the second half
    worker 1; indices for every label are drawn with replacement. Workers
    also record their own label slice in ``args.local_labels``.

    Returns:
        On a worker, the indices it received; on the master, the indices of
        both workers concatenated.
    """
    if rank != 0:
        args.local_labels = args.labels[int(len(args.labels)/2) * (rank - 1):
                                        int(len(args.labels)/2) * rank]
        indices_local = torch.zeros([args.num_samples_train], dtype=torch.int)
        dist.scatter(tensor=indices_local, src=0, scatter_list=[], group=nodes)
        return indices_local

    half = int(len(args.labels) / 2)
    num_samples_per_class = int(args.num_samples_train / (len(args.labels) / 2))

    # Draw per-label indices: first label half for worker 0, second for worker 1.
    per_worker = []
    for worker_labels in (args.labels[:half], args.labels[half:]):
        chosen = np.zeros([args.num_samples_train])
        for i, label in enumerate(worker_labels):
            start = i * num_samples_per_class
            stop = (i + 1) * num_samples_per_class
            candidates = misc.find_indices_for_labels(args.dataset.root.train, [label])
            chosen[start:stop] = np.random.choice(
                candidates, [num_samples_per_class], replace=True)
        per_worker.append(chosen)
    indices_worker_0, indices_worker_1 = per_worker

    random.shuffle(indices_worker_0)
    random.shuffle(indices_worker_1)

    # Send samples to the workers; slot 0 is the master's zero placeholder.
    indices_local = torch.zeros([args.num_samples_train], dtype=torch.int)
    indices = [indices_local,
               torch.IntTensor(indices_worker_0),
               torch.IntTensor(indices_worker_1)]
    dist.scatter(tensor=indices_local, src=0, scatter_list=indices, group=nodes)

    # Save samples sent to the workers at master to evaluate train loss and accuracy later
    indices_local = torch.IntTensor(np.hstack((indices_worker_0, indices_worker_1)))
    return indices_local
def transfer_gradients(self, grad_update_conv):
    """
    transferring averaged sparse gradients to all the nodes for the optimization of the model at each node

    ``grad_update_conv`` is first decoded with
    ``self.converter.str_to_gradient``. For every entry of ``self.shapes``
    rank 0 scatters the matching gradient (moved to the CPU) to all
    ``self.size`` ranks; each rank receives it into a zero tensor of that
    shape.

    :parameter grad_update_conv : tensor, final averaged sparse gradient tensor
    :return upd_grads : final gradient accessible at each node
    """
    grad_update_conv = self.converter.str_to_gradient(grad_update_conv)
    upd_grads = []
    for idx, shape in enumerate(self.shapes):
        updated = torch.zeros(shape)
        if self.rank == 0:
            # Move to the CPU once; the scatter list is only read, so every
            # slot can share the same tensor.
            cpu_grad = grad_update_conv[idx].to('cpu')
            receiver_list = [cpu_grad] * self.size
            dist.scatter(tensor=updated, src=0, scatter_list=receiver_list)
        else:
            dist.scatter(tensor=updated, src=0, scatter_list=[])
        upd_grads.append(updated.to('cpu'))
    return upd_grads
for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42) for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]: for i in range(0, num_tensors): dist.all_reduce(tensor) dist.barrier() if rank == 0: print_header("scatter") for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42) tensors = [tensor for n in range(0, dist.get_world_size())] for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]: start = timer() for i in range(0, num_tensors): dist.scatter(tensor, scatter_list=tensors) end = timer() print_stats(bytes, num_tensors, end - start) print() else: for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42) for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]: for i in range(0, num_tensors): dist.scatter(tensor, src=0) dist.barrier() if rank == 0: print_header("gather") for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42)