def broad_func(node_count, am_partitions, inputs, rank, size, group):
    global device
    global comm_time
    global comp_time
    global scomp_time
    global bcast_comm_time
    global run

    # n_per_proc = math.ceil(float(adj_matrix.size(1)) / size)
    n_per_proc = math.ceil(float(node_count) / size)

    # z_loc = torch.cuda.FloatTensor(adj_matrix.size(0), inputs.size(1), device=device).fill_(0)
    z_loc = torch.cuda.FloatTensor(am_partitions[0].size(0), inputs.size(1), device=device).fill_(0)
    # z_loc = torch.zeros(adj_matrix.size(0), inputs.size(1))

    inputs_recv = torch.cuda.FloatTensor(n_per_proc, inputs.size(1), device=device).fill_(0)
    # inputs_recv = torch.zeros(n_per_proc, inputs.size(1))

    for i in range(size):
        # If this rank is the bcast source, send its local feature block;
        # otherwise size the receive buffer for process i's partition
        # (the last partition may be smaller than n_per_proc)
        if i == rank:
            inputs_recv = inputs.clone()
        elif i == size - 1:
            inputs_recv = torch.cuda.FloatTensor(am_partitions[i].size(1), inputs.size(1), device=device).fill_(0)
            # inputs_recv = torch.zeros(list(am_partitions[i].t().size())[1], inputs.size(1))

        # Broadcast process i's feature block to all ranks
        tstart_comm = start_time(group, rank)
        dist.broadcast(inputs_recv, src=i, group=group)
        dur = stop_time(group, rank, tstart_comm)
        comm_time[run][rank] += dur
        bcast_comm_time[run][rank] += dur

        # Local spmm: accumulate partition i's contribution into z_loc
        tstart_comp = start_time(group, rank)
        spmm_gpu(am_partitions[i].indices()[0].int(), am_partitions[i].indices()[1].int(),
                 am_partitions[i].values(), am_partitions[i].size(0),
                 am_partitions[i].size(1), inputs_recv, z_loc)
        dur = stop_time(group, rank, tstart_comp)
        comp_time[run][rank] += dur
        scomp_time[run][rank] += dur

    return z_loc

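# The following standalone sketch (hypothetical, single-process, dense tensors only;
# relies on the torch and math imports already used by this module) illustrates the
# computation broad_func distributes: stage i broadcasts process i's feature block and
# accumulates A[:, block_i] @ H[block_i] into the local output, so the final result
# equals A @ H. The names _broad_func_reference_sketch, A, and H are illustrative and
# not part of the original code.
def _broad_func_reference_sketch():
    node_count, feat_dim, size = 8, 4, 2
    A = torch.rand(node_count, node_count)  # dense stand-in for the sparse adjacency
    H = torch.rand(node_count, feat_dim)    # stand-in for the input features
    n_per_proc = math.ceil(float(node_count) / size)

    z = torch.zeros(node_count, feat_dim)
    for i in range(size):
        cols = slice(i * n_per_proc, min((i + 1) * n_per_proc, node_count))
        z += A[:, cols] @ H[cols]           # one "stage" of the broadcast loop
    assert torch.allclose(z, A @ H)
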
def split3dspmm_sparse(adj_matrix, inputs, rank, row, col, rank_c, size, acc_per_rank,
                       row_groups, col_groups, c_groups, height, middim, width):
    proc_row = proc_row_size(size)
    proc_col = proc_col_size(size)
    proc_c = proc_c_size(size)

    # Compute the height, middim, and width for the local spmm
    height_per_proc = height // proc_row
    width_per_proc = width // proc_col
    middim_per_proc = middim // (proc_col * proc_c)

    device = torch.device('cuda:{}'.format(rank_to_devid(rank, acc_per_rank)))

    # Handle boundary conditions if this rank is in the last process row or column
    if row == proc_row - 1:
        height_per_proc = height - height_per_proc * (proc_row - 1)

    if col == proc_col - 1:
        width_per_proc = width - width_per_proc * (proc_col - 1)

    # Initialize output matrix for local spmm
    z_loc = torch.cuda.FloatTensor(height_per_proc, width_per_proc, device=device).fill_(0)

    # Determine the column chunk sizes used to split the output matrix after the local spmms
    chunk_sizes_col = []
    chunk_len = inputs.size(1) // proc_c
    for i in range(proc_c):
        if i == proc_c - 1:
            chunk_sizes_col.append(inputs.size(1) - chunk_len * (proc_c - 1))
        else:
            chunk_sizes_col.append(chunk_len)

    for k in range(proc_col):
        row_src_rank = row * (proc_col * proc_c) + rank_c + k * proc_c          # src rank for row bcast
        col_src_rank = col * proc_row + rank_c + k * proc_c * proc_row          # src rank for col bcast

        # Determine middle dimension of matrices for local spmm
        middim_per_col = middim // proc_col
        if k == proc_col - 1:
            middim_per_col = middim - middim_per_col * (proc_col - 1)

        middim_per_proc = middim_per_col // proc_c
        if rank_c == proc_c - 1:
            middim_per_proc = middim_per_col - middim_per_proc * (proc_c - 1)

        if row_src_rank == rank:
            acol_indices_len = torch.cuda.LongTensor([adj_matrix.indices().contiguous()[0].size(0)], device=device)
            acol_values_len = torch.cuda.LongTensor([adj_matrix.values().contiguous().size(0)], device=device)
        else:
            acol_indices_len = torch.cuda.LongTensor([0], device=device)
            acol_values_len = torch.cuda.LongTensor([0], device=device)

        # Broadcast nnz across rows (necessary for row bcast)
        dist.broadcast(acol_indices_len, row_src_rank, row_groups[row][rank_c])

        acol_indices_len = acol_indices_len.item()  # nnz
        acol_values_len = acol_indices_len

        # Initialize new empty matrix for row bcast if this rank is not the src rank
        if row_src_rank == rank:
            acol_indices = adj_matrix.indices().contiguous().long()
            acol_values = adj_matrix.values().contiguous().float()
        else:
            acol_indices = torch.cuda.LongTensor(2, acol_indices_len, device=device).fill_(0)
            acol_values = torch.cuda.FloatTensor(acol_values_len, device=device).fill_(0)

        acol = torch.cat((acol_indices.float(), acol_values.unsqueeze(0)), dim=0)

        # Row bcast
        dist.broadcast(acol.contiguous(), row_src_rank, row_groups[row][rank_c])

        acol_indices = acol[:2].long()
        acol_values = acol[2].squeeze(0)

        if row_src_rank == rank:
            acol = adj_matrix
        else:
            acol = sparse_coo_tensor_gpu(acol_indices, acol_values,
                                         torch.Size([height_per_proc, middim_per_proc]))

        # Initialize new empty matrix for col bcast if this rank is not the src rank
        if col_src_rank == rank:
            brow = inputs
        else:
            brow = torch.cuda.FloatTensor(middim_per_proc, width_per_proc, device=device)

        # Col bcast
        brow = brow.contiguous()
        dist.broadcast(brow, col_src_rank, col_groups[col][rank_c])

        # Local spmm
        spmm_gpu(acol_indices[0].int(), acol_indices[1].int(), acol_values,
                 height_per_proc, middim_per_proc, brow, z_loc)

    z_loc = z_loc.contiguous()

    # All-Reduce across third process grid dimension
    dist.all_reduce(z_loc, group=c_groups[int(rank // proc_c)])

    # Split the output of the all-reduce across third process grid dimension
    # Each rank only keeps its submatrix
    z_loc = torch.split(z_loc, chunk_sizes_col, dim=1)
    z_loc = z_loc[rank_c].contiguous()

    return z_loc

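# Standalone sketch (hypothetical name, no distributed setup; relies on the module's
# torch import) of the final step above: after the all-reduce over the third grid
# dimension, the output is split column-wise into proc_c chunks, the last chunk
# absorbing the remainder, and each rank keeps only the chunk indexed by its rank_c
# coordinate.
def _split3d_chunk_sketch(width=10, proc_c=3):
    chunk_len = width // proc_c
    chunk_sizes_col = [chunk_len] * (proc_c - 1) + [width - chunk_len * (proc_c - 1)]
    z_loc = torch.rand(5, width)
    for rank_c in range(proc_c):
        mine = torch.split(z_loc, chunk_sizes_col, dim=1)[rank_c]
        print(rank_c, tuple(mine.shape))  # -> (5, 3), (5, 3), (5, 4)
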
def broad_func(node_count, am_partitions, inputs, rank, size, row_groups, col_groups, group):
    global device
    global comm_time
    global comp_time
    global scomp_time
    global bcast_comm_time
    global bcast_words
    global reduce_comm_time
    global run
    global replication

    # n_per_proc = math.ceil(float(adj_matrix.size(1)) / size)
    n_per_proc = math.ceil(float(node_count) / (size / replication))

    # z_loc = torch.cuda.FloatTensor(adj_matrix.size(0), inputs.size(1), device=device).fill_(0)
    z_loc = torch.cuda.FloatTensor(am_partitions[0].size(0), inputs.size(1), device=device).fill_(0)
    # z_loc = torch.zeros(adj_matrix.size(0), inputs.size(1))

    inputs_recv = torch.cuda.FloatTensor(n_per_proc, inputs.size(1), device=device).fill_(0)
    # inputs_recv = torch.zeros(n_per_proc, inputs.size(1))

    rank_c = rank // replication
    rank_col = rank % replication

    stages = size // (replication ** 2)
    if rank_col == replication - 1:
        stages = (size // replication) - (replication - 1) * stages

    for i in range(stages):
        # Compute src rank in bcast
        # q = rank_c // (size // (replication ** 2)) * (size // (replication ** 2)) + i
        #   = q * replication + rank_c // (size // (replication ** 2))
        q = (rank_col * (size // (replication ** 2)) + i) * replication + rank_col
        q_c = q // replication

        am_partid = rank_col * (size // replication ** 2) + i

        # If this rank is the src rank for bcast, send its local feature block;
        # else, instantiate a receive buffer sized for the src rank's partition
        if q == rank:
            inputs_recv = inputs.clone()
        elif q_c == size // replication - 1:
            inputs_recv = torch.cuda.FloatTensor(am_partitions[am_partid].size(1), inputs.size(1), device=device).fill_(0)
            # inputs_recv = torch.zeros(list(am_partitions[i].t().size())[1], inputs.size(1))

        tstart_comm = start_time(col_groups[rank_col], rank)

        inputs_recv = inputs_recv.contiguous()
        bcast_words[run][rank] += inputs_recv.size(0) * inputs_recv.size(1)
        dist.broadcast(inputs_recv, src=q, group=col_groups[rank_col])

        dur = stop_time(col_groups[rank_col], rank, tstart_comm)
        comm_time[run][rank] += dur
        bcast_comm_time[run][rank] += dur

        # Local spmm on the received block
        tstart_comp = start_time(col_groups[rank_col], rank)
        spmm_gpu(am_partitions[am_partid].indices()[0].int(), am_partitions[am_partid].indices()[1].int(),
                 am_partitions[am_partid].values(), am_partitions[am_partid].size(0),
                 am_partitions[am_partid].size(1), inputs_recv, z_loc)
        dur = stop_time(col_groups[rank_col], rank, tstart_comp)
        comp_time[run][rank] += dur
        scomp_time[run][rank] += dur

    z_loc = z_loc.contiguous()

    # All-reduce partial results across the replication (row) group
    tstart_comm = start_time(row_groups[rank_c], rank)
    dist.all_reduce(z_loc, op=dist.ReduceOp.SUM, group=row_groups[rank_c])
    dur = stop_time(row_groups[rank_c], rank, tstart_comm)
    comm_time[run][rank] += dur
    reduce_comm_time[run][rank] += dur

    return z_loc

def dspmm(node_count, am_partitions, inputs, rank, size, replication, row_groups, col_groups,
          group, device):
    global comm_time
    global comp_time
    global bcast_comm_time
    global reduce_comm_time

    n_per_proc = math.ceil(float(node_count) / (size / replication))

    z_loc = torch.cuda.FloatTensor(am_partitions[0].size(0), inputs.size(1), device=device).fill_(0)
    inputs_recv = torch.cuda.FloatTensor(n_per_proc, inputs.size(1), device=device).fill_(0)

    rank_c = rank // replication  # effectively row-rank
    rank_col = rank % replication

    stages = size // (replication ** 2)
    if rank_col == replication - 1:
        stages = (size // replication) - (replication - 1) * stages

    for i in range(stages):
        # Compute src rank in bcast
        q = (rank_col * (size // (replication ** 2)) + i) * replication + rank_col
        q_c = q // replication

        am_partid = rank_col * (size // replication ** 2) + i

        # If this rank is the src rank for bcast, set inputs_recv to the local matrix
        # Else, instantiate a new empty matrix
        if q == rank:
            inputs_recv = inputs.clone()
        elif q_c == size // replication - 1:
            inputs_recv = torch.cuda.FloatTensor(am_partitions[am_partid].size(1), inputs.size(1), device=device).fill_(0)

        inputs_recv = inputs_recv.contiguous()

        tstart_comm = start_time(col_groups[rank_col], rank)
        dist.broadcast(inputs_recv, src=q, group=col_groups[rank_col])
        dur = stop_time(col_groups[rank_col], rank, tstart_comm)
        comm_time[rank] += dur
        bcast_comm_time[rank] += dur

        tstart_comp = start_time(col_groups[rank_col], rank)
        spmm_gpu(am_partitions[am_partid].indices()[0].int(), am_partitions[am_partid].indices()[1].int(),
                 am_partitions[am_partid].values(), am_partitions[am_partid].size(0),
                 am_partitions[am_partid].size(1), inputs_recv, z_loc)
        dur = stop_time(col_groups[rank_col], rank, tstart_comp)
        comp_time[rank] += dur

    z_loc = z_loc.contiguous()

    tstart_comm = start_time(row_groups[rank_c], rank)
    dist.all_reduce(z_loc, op=dist.ReduceOp.SUM, group=row_groups[rank_c])
    dur = stop_time(row_groups[rank_c], rank, tstart_comm)
    comm_time[rank] += dur
    reduce_comm_time[rank] += dur

    return z_loc

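# Standalone sketch (hypothetical name, no process groups, builtins only) of the 1.5D
# bookkeeping in dspmm: for each rank, derive its row coordinate rank_c, its replication
# coordinate rank_col, the number of broadcast stages it participates in, and, per stage,
# the broadcast source rank q and the adjacency partition id it multiplies with.
def _dspmm_stage_sketch(size=8, replication=2):
    for rank in range(size):
        rank_c = rank // replication      # row coordinate in the process grid
        rank_col = rank % replication     # replication (column) coordinate
        stages = size // (replication ** 2)
        if rank_col == replication - 1:   # last replica column takes the remainder
            stages = (size // replication) - (replication - 1) * stages
        for i in range(stages):
            q = (rank_col * (size // (replication ** 2)) + i) * replication + rank_col
            am_partid = rank_col * (size // replication ** 2) + i
            print(f"rank {rank} stage {i}: bcast src {q}, partition {am_partid}")
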
def summa_sparse(adj_matrix, inputs, rank, row, col, size, acc_per_rank, row_groups, col_groups,
                 height, middim, width):
    proc_row = proc_row_size(size)
    proc_col = proc_col_size(size)

    # Compute the height, middim, and width for the local spmm
    height_per_proc = height // proc_row
    width_per_proc = width // proc_col
    middim_per_proc = middim // proc_col

    device = torch.device('cuda:{}'.format(rank_to_devid(rank, acc_per_rank)))

    # Handle boundary conditions if this rank is in the last process row or column
    if row == proc_row - 1:
        height_per_proc = height - height_per_proc * (proc_row - 1)

    if col == proc_col - 1:
        width_per_proc = width - width_per_proc * (proc_col - 1)

    # Initialize output matrix for local spmm
    z_loc = torch.cuda.FloatTensor(height_per_proc, width_per_proc, device=device).fill_(0)

    for k in range(proc_col):
        row_src_rank = k + proc_col * row  # src rank for row bcast
        col_src_rank = k * proc_col + col  # src rank for col bcast

        # Determine middle dimension of matrices for local spmm
        if k == proc_col - 1:
            middim_per_proc = middim - middim_per_proc * (proc_col - 1)

        if row_src_rank == rank:
            acol_indices_len = torch.cuda.LongTensor([adj_matrix.indices().contiguous()[0].size(0)], device=device)
            acol_values_len = torch.cuda.LongTensor([adj_matrix.values().contiguous().size(0)], device=device)
        else:
            acol_indices_len = torch.cuda.LongTensor([0], device=device)
            acol_values_len = torch.cuda.LongTensor([0], device=device)

        # Broadcast nnz across rows (necessary for row bcast)
        dist.broadcast(acol_indices_len, row_src_rank, row_groups[row])

        acol_indices_len = acol_indices_len.item()  # nnz
        acol_values_len = acol_indices_len

        # Initialize new empty matrix for row bcast if this rank is not the src rank
        if row_src_rank == rank:
            acol_indices = adj_matrix.indices().contiguous().long()
            acol_values = adj_matrix.values().contiguous().float()
        else:
            acol_indices = torch.cuda.LongTensor(2, acol_indices_len, device=device).fill_(0)
            acol_values = torch.cuda.FloatTensor(acol_values_len, device=device).fill_(0)

        acol = torch.cat((acol_indices.float(), acol_values.unsqueeze(0)), dim=0).contiguous()

        # Row bcast
        dist.broadcast(acol, row_src_rank, row_groups[row])

        acol_indices = acol[:2].long()
        acol_values = acol[2].squeeze(0)

        if row_src_rank == rank:
            acol = adj_matrix
        else:
            acol = sparse_coo_tensor_gpu(acol_indices, acol_values,
                                         torch.Size([height_per_proc, middim_per_proc]))

        # Initialize new empty matrix for col bcast if this rank is not the src rank
        if col_src_rank == rank:
            brow = inputs
        else:
            brow = torch.cuda.FloatTensor(middim_per_proc, width_per_proc, device=device)

        # Col bcast
        brow = brow.contiguous()
        dist.broadcast(brow, col_src_rank, col_groups[col])

        # Local spmm
        spmm_gpu(acol_indices[0].int(), acol_indices[1].int(), acol_values,
                 height_per_proc, middim_per_proc, brow, z_loc)

    return z_loc

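# Standalone sketch (hypothetical name, builtins only) of the SUMMA source-rank mapping
# used above, assuming a row-major process grid: in stage k, process row `row` receives
# the sparse block from the rank in column k of that row, and process column `col`
# receives the dense block from the rank in row k of that column.
def _summa_src_rank_sketch(proc_row=2, proc_col=2):
    for rank in range(proc_row * proc_col):
        row, col = rank // proc_col, rank % proc_col
        for k in range(proc_col):
            row_src_rank = k + proc_col * row   # src rank for row bcast
            col_src_rank = k * proc_col + col   # src rank for col bcast
            print(f"rank {rank} stage {k}: row src {row_src_rank}, col src {col_src_rank}")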