def sample(self, model_output: torch.Tensor) -> torch.Tensor:
    """
    Sample uniformly between [0, 1] (for each batch example) and return the
    linear interpolation between the fitted quantiles closest to the sampled value.

    model_output is of shape (batch_size, n_timesteps, n_components, n_quantiles)
    """
    device = model_output.device
    num_samples, n_timesteps, n_components, n_quantiles = model_output.shape

    # obtain samples
    probs = torch.rand(size=(num_samples, n_timesteps, n_components, 1)).to(device)
    # add dummy dim
    probas = probs.unsqueeze(-2)
    # tile and transpose
    p = torch.tile(probas, (1, 1, 1, n_quantiles, 1)).transpose(4, 3)

    # prepare quantiles
    tquantiles = torch.tensor(self.quantiles).reshape((1, 1, 1, -1)).to(device)

    # calculate index of biggest quantile smaller than the sampled value
    left_idx = torch.sum(p > tquantiles, dim=-1)
    # obtain index of smallest quantile bigger than sampled value
    right_idx = left_idx + 1

    # repeat the model output on the edges
    repeat_count = [1] * n_quantiles
    repeat_count[0] = 2
    repeat_count[-1] = 2
    repeat_count = torch.tensor(repeat_count).to(device)
    shifted_output = torch.repeat_interleave(model_output, repeat_count, dim=-1)

    # obtain model output values corresponding to the quantiles left and right of the sampled value
    left_value = torch.gather(shifted_output, index=left_idx, dim=-1)
    right_value = torch.gather(shifted_output, index=right_idx, dim=-1)

    # add 0 and 1 to quantiles
    ext_quantiles = [0.0] + self.quantiles + [1.0]
    expanded_q = torch.tile(torch.tensor(ext_quantiles), left_idx.shape).to(device)

    # calculate closest quantiles to the sampled value
    left_q = torch.gather(expanded_q, index=left_idx, dim=-1)
    right_q = torch.gather(expanded_q, index=right_idx, dim=-1)

    # linear interpolation
    weights = (probs - left_q) / (right_q - left_q)
    inter = left_value + weights * (right_value - left_value)

    return inter.squeeze(-1)
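# Hedged usage sketch (added for illustration, not part of the original sample()):
# it shows, for a single component, how counting quantile levels below a sampled
# probability gives the left interpolation index, and how the edge-padded values
# are interpolated. The quantile levels and values below are made up.
import torch

quantiles = [0.1, 0.5, 0.9]                           # assumed fitted quantile levels
values = torch.tensor([1.0, 2.0, 5.0])                # assumed model output for one component
p = torch.tensor(0.7)                                 # a uniformly sampled probability

tq = torch.tensor(quantiles)
left = int(torch.sum(p > tq))                         # quantile levels below p -> 2
ext_q = torch.tensor([0.0] + quantiles + [1.0])       # pad levels with 0 and 1
ext_v = torch.cat([values[:1], values, values[-1:]])  # repeat the edge values

w = (p - ext_q[left]) / (ext_q[left + 1] - ext_q[left])
sample_value = ext_v[left] + w * (ext_v[left + 1] - ext_v[left])  # ~3.5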
def forward(self, X_wordid, X_mask, pos, Y=None):
    # X_wordid is the batch-size x max-seq-len matrix of WordPiece token indices
    # X_mask is the batch-size x max-seq-len matrix of 0s and 1s.
    # X_mask = 0 for padded tokens, otherwise 1
    #
    # pos is the batch-size x 4 matrix
    # (pos[i, 0], pos[i, 1]) is the opinion expression span
    # (pos[i, 2], pos[i, 3]) is the target span
    #
    # Y is the batch-size x max-seq-len matrix of the token labels
    # O = 0, B = 1, I = 2
    # Padded tokens also have label O (= 0)
    #
    # If Y is not None, return loss
    # otherwise return Y_pred
    X_word = self.encoder(X_wordid, X_mask).last_hidden_state
    batch_size, max_seq_len = X_wordid.shape

    X_eposid = torch.tile(torch.LongTensor(np.arange(max_seq_len)), (batch_size, 1))
    X_tposid = torch.tile(torch.LongTensor(np.arange(max_seq_len)), (batch_size, 1))

    for i in range(batch_size):
        estart, eend, tstart, tend = pos[i]
        length = X_mask[i].sum()
        # relative positions w.r.t. the opinion expression span (row i only)
        X_eposid[i, :estart] = estart - X_eposid[i, :estart]
        X_eposid[i, estart:eend] = 0
        X_eposid[i, eend:] = X_eposid[i, eend:] - eend + max_seq_len
        X_eposid[i, length:] = 2 * max_seq_len - 1
        # relative positions w.r.t. the target span (row i only)
        X_tposid[i, :tstart] = tstart - X_tposid[i, :tstart]
        X_tposid[i, tstart:tend] = 0
        X_tposid[i, tend:] = X_tposid[i, tend:] - tend + max_seq_len
        X_tposid[i, length:] = 2 * max_seq_len - 1

    X_epos = self.expression_position_embedding(X_eposid)
    X_tpos = self.target_position_embedding(X_tposid)
    X = torch.cat((X_word, X_epos, X_tpos), dim=2)
    # X is of shape batch-size x max-seq-len x (Hw + 2 * Hpos)
    # Hw is the encoder hidden size, Hpos is the position embedding size

    A, _ = self.bilstm(X)
    A = A.contiguous()
    # A is of shape batch-size x max-seq-len x 2H
    # H is the hidden size

    B = self.output(A)
    # B is of shape batch-size x max-seq-len x 3

    crf_mask = X_mask.byte()
    Y_pred = self.crf.decode(B, crf_mask)
    # Y_pred is a list of lists of integers

    if Y is not None:
        loss = -self.crf(B, Y, crf_mask)
        return loss, Y_pred
    else:
        return Y_pred
def get_argmin_mat(uniq_scale_dict: dict):
    """
    Calculate the mapping between the base scale and other scales. A segment from a longer
    scale is repeatedly mapped to a segment from a shorter scale or the base scale.

    Args:
        uniq_scale_dict (dict):
            Dictionary of embeddings and timestamps for each scale.

    Returns:
        session_scale_mapping_dict (dict):
            Dictionary containing argmin arrays indexed by scale index.
    """
    scale_list = sorted(list(uniq_scale_dict.keys()))
    segment_anchor_dict = {}
    for scale_idx in scale_list:
        time_stamp_list = uniq_scale_dict[scale_idx]['time_stamps']
        time_stamps_float = torch.tensor(
            [[float(x.split()[0]), float(x.split()[1])] for x in time_stamp_list])
        segment_anchor_dict[scale_idx] = torch.mean(time_stamps_float, dim=1)

    base_scale_idx = max(scale_list)
    base_scale_anchor = segment_anchor_dict[base_scale_idx]
    session_scale_mapping_dict = {}
    for scale_idx in scale_list:
        curr_scale_anchor = segment_anchor_dict[scale_idx]
        curr_mat = torch.tile(curr_scale_anchor, (base_scale_anchor.shape[0], 1))
        base_mat = torch.tile(base_scale_anchor, (curr_scale_anchor.shape[0], 1)).t()
        argmin_mat = torch.argmin(torch.abs(curr_mat - base_mat), dim=1)
        session_scale_mapping_dict[scale_idx] = argmin_mat
    return session_scale_mapping_dict
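# Hedged usage sketch (illustrative only, with made-up segment centers): the same
# tile/argmin pattern as in get_argmin_mat maps each base-scale segment center to the
# nearest segment of a coarser scale.
import torch

base_anchor = torch.tensor([0.5, 1.5, 2.5, 3.5])   # assumed base-scale segment centers
curr_anchor = torch.tensor([1.0, 3.0])             # assumed coarser-scale centers

curr_mat = torch.tile(curr_anchor, (base_anchor.shape[0], 1))      # shape (4, 2)
base_mat = torch.tile(base_anchor, (curr_anchor.shape[0], 1)).t()  # shape (4, 2)
mapping = torch.argmin(torch.abs(curr_mat - base_mat), dim=1)      # tensor([0, 0, 1, 1])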
def layout_bbox(self, final_pred, batch_size, num_bboxes, num_classes,
                output_height, output_width):  # e.g. 5, 188, 20
    final_pred = torch.reshape(final_pred, [batch_size, num_bboxes, 4 + num_classes])
    # print('Final pred:', final_pred.shape)
    return self.rectangle_render(final_pred)

    # NOTE: everything below is leftover debug code and is unreachable because of the
    # early return above (the original also contained 0 / 0 crash markers here).
    final_pred = torch.reshape(final_pred, [batch_size, 4 + num_classes, num_bboxes])
    print('Final pred requires grad:', final_pred.requires_grad)

    bbox_reg = final_pred[:, :4, :]
    cls_prob = final_pred[:, 4:, :]
    print('bbox requires grad:', bbox_reg.requires_grad)

    bbox_reg = torch.reshape(bbox_reg, [batch_size, num_bboxes, 4])
    x_c = bbox_reg[:, :, 0] * output_width
    y_c = bbox_reg[:, :, 1] * output_height
    w = bbox_reg[:, :, 2] * output_width
    h = bbox_reg[:, :, 3] * output_height

    x1 = x_c - 0.5 * w
    x2 = x_c + 0.5 * w
    y1 = y_c - 0.5 * h
    y2 = y_c + 0.5 * h

    # torch.range is deprecated and includes the end point; torch.arange gives
    # exactly output_width / output_height samples
    xt = torch.reshape(torch.arange(0, output_width, dtype=torch.float32), [1, 1, 1, -1])
    xt = torch.reshape(
        torch.tile(xt, [batch_size, num_bboxes, output_height, 1]),
        [batch_size, num_bboxes, -1])

    yt = torch.reshape(torch.arange(0, output_height, dtype=torch.float32), [1, 1, 1, -1])
    yt = torch.reshape(
        torch.tile(yt, [batch_size, num_bboxes, 1, output_width]),
        [batch_size, num_bboxes, -1])

    x1_diff = torch.reshape(xt - x1, [batch_size, num_bboxes, output_height, output_width, 1])
    y1_diff = torch.reshape(yt - y1, [batch_size, num_bboxes, output_height, output_width, 1])
    x2_diff = torch.reshape(x2 - xt, [batch_size, num_bboxes, output_height, output_width, 1])
    y2_diff = torch.reshape(y2 - yt, [batch_size, num_bboxes, output_height, output_width, 1])

    # torch.minimum requires tensor arguments, so the scalar 1.0 is wrapped
    x1_line = self.relu(1.0 - torch.abs(x1_diff)) \
        * torch.minimum(self.relu(y1_diff), torch.tensor(1.0)) \
        * torch.minimum(self.relu(y2_diff), torch.tensor(1.0))
    print(x1_line.shape)
    print(x1_line)
def __prepare(a, b):
    # extend as cols
    repetitions = b.shape[0]
    at = torch.tile(a, (repetitions, 1))
    at = at.transpose(-1, 0)

    # extend as rows
    # bt = np.tile(b, (repetitions, 1))
    repetitions = a.shape[0]
    bt = torch.tile(b, (repetitions, 1))

    return at, bt
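# Hedged usage sketch (illustrative only): the tile/transpose pattern used by __prepare
# builds the two matrices needed for pairwise comparisons of two 1-D tensors, i.e. a
# meshgrid-like expansion. The tensors below are made up.
import torch

a = torch.tensor([1., 2., 3.])
b = torch.tensor([10., 20.])

at = torch.tile(a, (b.shape[0], 1)).transpose(-1, 0)  # shape (3, 2), rows vary with a
bt = torch.tile(b, (a.shape[0], 1))                   # shape (3, 2), columns vary with b
pairwise_diff = at - bt                               # (3, 2) matrix of a[i] - b[j]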
def get_alpha_beta(self, config, training=False):
    a_val = np.log(np.exp(config.model.alpha0) - 1)
    b_val = np.log(np.exp(1.) - 1)

    initial = torch.zeros(self.hidden[self.num_layers - 1],
                          dtype=torch.float32).to(config.device)
    self.a = Variable(initial) + a_val
    self.b = Variable(initial) + b_val

    beta_a = F.softplus(self.a)
    beta_b = F.softplus(self.b)
    beta_a = torch.unsqueeze(beta_a, 0)
    beta_b = torch.unsqueeze(beta_b, 0)

    self.beta_a = torch.tile(beta_a, [config.model.num_nodes, 1])
    self.beta_a = Variable(self.beta_a, requires_grad=True)
    self.beta_b = torch.tile(beta_b, [config.model.num_nodes, 1])
    self.beta_b = Variable(self.beta_b, requires_grad=True)
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')

    # Use fake train data
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    input_indices = torch.randint(low=0, high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
        profiler.start(overall_name)

        # Forward pass
        profiler.start(f'emb_forward_np-{nproc}_vs-{vocab_size}')
        embedding_output = embedding.forward(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
        torch.cuda.synchronize()
        profiler.stop(f'emb_forward_np-{nproc}_vs-{vocab_size}')

        # Backward pass
        profiler.start(f'emb_backward_np-{nproc}_vs-{vocab_size}')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(f'emb_backward_np-{nproc}_vs-{vocab_size}')

        profiler.stop(overall_name)
def forward(self, x, mask=None, attn_weights=False, norm_attn_weights=True):
    # x.shape = [batch, length, token_dim]
    batch_size = x.shape[0]

    keys = self._transpose_multihead(self.keys(x))
    values = self._transpose_multihead(self.values(x))
    if self.fixed_queries:
        queries = torch.tile(self.queries, (batch_size, 1, 1))
    else:
        queries = self._transpose_multihead(self.queries(x))

    # dot_products.shape = [batch, length (query), length (key)]
    dot_products = torch.matmul(queries, keys.transpose(1, 2))
    if mask is not None:
        dot_products = dot_products * mask - 1e9 * (1 - mask)

    weights = self.attention_function(
        dot_products / np.sqrt(self.d_model // self.n_heads), dim=-1)

    result = self._untranspose_multihead(torch.matmul(weights, values))
    if self.n_heads > 1:
        result = self.output_linear(result)

    if attn_weights:
        weights_cpu = weights.detach().cpu()
        if norm_attn_weights:
            values_norm = torch.norm(values.detach().cpu(), dim=-1).unsqueeze(1)
            weights_cpu = weights_cpu * values_norm
        return result, weights_cpu

    return result
def concat_dependencies(self, hidden, other_features_hidden):
    if len(self.dependencies) > 0:
        dependencies_hidden = []
        for dependency in self.dependencies:
            # the dependent feature is ensured to be present in final_hidden
            # because we did the topological sort of the features before
            dependency_final_hidden = other_features_hidden[dependency]

            if len(hidden.shape) > 2:
                if len(dependency_final_hidden.shape) > 2:
                    # matrix matrix -> concat
                    assert hidden.shape[1] == dependency_final_hidden.shape[1]
                    dependencies_hidden.append(dependency_final_hidden)
                else:
                    # matrix vector -> tile concat
                    sequence_max_length = hidden.shape[1]
                    multipliers = (1, sequence_max_length, 1)
                    tiled_representation = torch.tile(
                        torch.unsqueeze(dependency_final_hidden, 1), multipliers)

                    # todo future: maybe modify this with TF2 mask mechanics
                    sequence_length = sequence_length_3D(hidden)
                    mask = sequence_mask(sequence_length, sequence_max_length)
                    tiled_representation = torch.mul(
                        tiled_representation,
                        mask[:, :, np.newaxis].type(torch.float32))

                    dependencies_hidden.append(tiled_representation)
            else:
                if len(dependency_final_hidden.shape) > 2:
                    # vector matrix -> reduce concat
                    reducer = self.dependency_reducers[dependency]
                    dependencies_hidden.append(reducer(dependency_final_hidden))
                else:
                    # vector vector -> concat
                    dependencies_hidden.append(dependency_final_hidden)

        try:
            hidden = torch.cat([hidden] + dependencies_hidden, dim=-1)
        except Exception as e:
            raise ValueError(
                "Shape mismatch while concatenating dependent features of "
                "{}: {}, with exception {}. Concatenating the feature activations tensor {} "
                "with activation tensors of dependencies: {}. The error is "
                "likely due to a mismatch of the second dimension (sequence "
                "length) or a difference in ranks. Likely solutions are "
                "setting the maximum_sequence_length of all sequential "
                "features to be the same, reducing the output of some "
                "features, or disabling bucketing by setting "
                "bucketing_field to None / null, as activating it will "
                "reduce the length of the field the bucketing is performed "
                "on.".format(self.column, self.dependencies, e, hidden,
                             dependencies_hidden))

    return hidden
def transformer(self, group_embeddings, positional_data):
    batch_size, input_feature_dim, num_neighbors = group_embeddings.shape

    if self.attention_mode == "scalar":
        group_embeddings += self.positional_embedder(positional_data)

    query = group_embeddings[:, :, :1]
    query_E = torch.tile(self.WQ(query), dims=(1, 1, num_neighbors))
    key_E = self.WK(group_embeddings)
    # note: values are computed with the key projection (WK) in this implementation
    value_E = self.WK(group_embeddings)

    # Vector attention
    if self.attention_mode == "vector":
        positional_encoding = self.positional_embedder(positional_data)
        scaled = torch.softmax(self.mapping(query_E - key_E + positional_encoding), dim=-1)
        aggregated = torch.sum(scaled * (value_E + positional_encoding), dim=-1)
        # Norm + Residual connection
        aggregated = query.squeeze(-1) + self.layer_norm(aggregated)

    # Scalar attention
    if self.attention_mode == "scalar":
        attention = torch.softmax(torch.matmul(query_E.transpose(-2, -1), key_E), dim=1)
        aggregated = torch.matmul(attention, value_E.transpose(-2, -1)).sum(dim=-2)

    return aggregated
def forward(self, x):
    # tile the single kernel across all input channels and apply it depthwise
    weight = torch.tile(self.weight, (x.shape[1], 1, 1, 1)).to(x.device)
    return F.conv2d(x, weight, stride=self.stride, padding=self.padding, groups=x.shape[1])
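# Hedged sketch of how such a layer might be wired up (the module name, kernel, and
# hyper-parameters below are assumptions, not taken from the original code): a single
# fixed kernel is tiled across all input channels and applied depthwise via groups.
import torch
import torch.nn.functional as F

class TiledDepthwiseBlur(torch.nn.Module):
    def __init__(self):
        super().__init__()
        k = torch.ones(1, 1, 3, 3) / 9.0   # one 3x3 box-blur kernel
        self.register_buffer("weight", k)
        self.stride, self.padding = 1, 1

    def forward(self, x):
        weight = torch.tile(self.weight, (x.shape[1], 1, 1, 1)).to(x.device)
        return F.conv2d(x, weight, stride=self.stride, padding=self.padding,
                        groups=x.shape[1])

y = TiledDepthwiseBlur()(torch.randn(2, 3, 8, 8))   # -> shape (2, 3, 8, 8)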
def forward(self, x):
    N, C, H, W = x.shape
    x *= self.gain
    x = x.view(N, C, H, 1, W, 1)
    x = torch.tile(x, (1, 1, 1, self.factor, 1, self.factor))
    x = x.view(N, C, H * self.factor, W * self.factor)
    return x
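# Hedged check (illustrative only, ignoring the gain multiplication above): the
# view/tile/view pattern is nearest-neighbour upsampling, compared here against
# F.interpolate with a standalone helper.
import torch
import torch.nn.functional as F

def tile_upsample(x, factor):
    N, C, H, W = x.shape
    x = x.view(N, C, H, 1, W, 1)
    x = torch.tile(x, (1, 1, 1, factor, 1, factor))
    return x.view(N, C, H * factor, W * factor)

x = torch.randn(1, 2, 4, 4)
assert torch.equal(tile_upsample(x, 2), F.interpolate(x, scale_factor=2, mode="nearest"))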
def forward(self, w):
    N = w.shape[0]
    x = torch.tile(self.const, (N, 1, 1, 1))
    x = self.layer_epilogue1(x, w)
    x = self.conv(x)
    x = self.layer_epilogue2(x, w)
    return x
def get_environment(n_atoms, grid=None):
    if n_atoms == 1:
        neighborhood_idx = -torch.ones((1, 1), dtype=torch.float32)
        offsets = torch.zeros((n_atoms, 1, 3), dtype=torch.float32)
    else:
        neighborhood_idx = torch.arange(
            n_atoms, dtype=torch.float32).unsqueeze(0).repeat(n_atoms, 1)
        neighborhood_idx = neighborhood_idx[
            ~torch.eye(n_atoms, dtype=torch.long).byte()].view(
                n_atoms, n_atoms - 1).long()

        if grid is not None:
            n_grid = grid.shape[0]
            neighborhood_idx = torch.concat(
                [neighborhood_idx, -torch.ones((n_atoms, 1))], 1)
            grid_nbh = torch.tile(
                torch.arange(n_atoms, dtype=torch.float32).unsqueeze(-1),
                (n_grid, 1))
            neighborhood_idx = torch.concat([neighborhood_idx, grid_nbh], 0)

        offsets = torch.zeros(
            (neighborhood_idx.shape[0], neighborhood_idx.shape[1], 3),
            dtype=torch.float32)
    return neighborhood_idx, offsets
def forward(self, input, adj):
    input_new = []
    for i in range(len(self.add_all)):
        # gather the features of the central node i
        index = torch.tensor([[i] * input.shape[1]])
        aa = torch.gather(input, 0, index)
        aa_tile = torch.tile(aa, [len(self.add_all[i]), 1])  # expand central node

        # gather the features of node i's neighbors
        bb_nei_index2 = self.add_all[i]
        bb_nei_index2 = np.array([[j] * input.shape[1] for j in bb_nei_index2],
                                 dtype="int64")
        bb_nei_index2 = torch.tensor(bb_nei_index2)
        bb_nei = torch.gather(input, 0, bb_nei_index2)

        cen_nei = torch.cat([aa_tile, bb_nei], 1)
        mask0 = torch.mm(cen_nei, self.weights_mask0)
        mask0 = self.Sig(mask0)
        mask0 = F.dropout(mask0, self.drop_rate)
        self.mask.append(mask0)

        # hadamard product of neighbors' features and mask, then sum aggregation
        new_cen_nei = aa + torch.sum(mask0 * bb_nei, 0, keepdim=True)
        input_new.append(new_cen_nei)

    input_new = torch.stack(input_new)
    input_new = torch.squeeze(input_new)

    support = torch.mm(input_new, self.weight_0)
    output = torch.spmm(adj, support)

    if self.bias is not None:
        return output + self.bias
    else:
        return output
def __call__(self, params, prediction, target):
    """Parameters (such as thresholds) are used to calculate the score.

    Args:
        params: list of float

    Returns:
        score: float
    """
    # torch.tile expects a tensor, so convert the list of thresholds first
    thresholds = torch.as_tensor(params, device=prediction.device)
    output = torch.zeros_like(prediction)

    if self.N > prediction.size(0):
        batch_size = prediction.size(0)
    else:
        batch_size = self.N

    # Threshold to output
    output = torch.where(
        prediction > torch.tile(thresholds, (batch_size, 1)), 1, 0)

    # Calculate score
    tp = (output * target).sum(1)
    fp = (output * (1 - target)).sum(1)
    fn = ((1 - output) * target).sum(1)
    f1 = tp / (tp + (fp + fn) / 2)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # return f1.mean(), precision.mean(), recall.mean()
    return f1.mean()
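# Hedged usage sketch (illustrative only, with made-up predictions): per-class
# thresholds are tiled over the batch dimension and compared element-wise, which is
# what the torch.where call above does.
import torch

prediction = torch.tensor([[0.2, 0.8, 0.6],
                           [0.9, 0.1, 0.4]])
thresholds = torch.tensor([0.5, 0.5, 0.5])

tiled = torch.tile(thresholds, (prediction.size(0), 1))  # shape (2, 3)
binarised = (prediction > tiled).int()
# tensor([[0, 1, 1],
#         [1, 0, 0]], dtype=torch.int32)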
def __init__(self, image: Union[Tensor, np.ndarray], *,
             metalabels: Optional[str] = None) -> None:
    if isinstance(image, torch.Tensor):
        if image.dtype != torch.uint8:
            raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
        if image.dim() != 3:
            raise ValueError("Pass individual images, not batches")
        if image.size(0) not in {1, 3}:
            raise ValueError("Only grayscale and RGB images are supported")
        # Handle Grayscale images
        if image.size(0) == 1:
            image = torch.tile(image, (3, 1, 1))
        self.img = image.permute(1, 2, 0).cpu().numpy()
        self.is_bgr = False
    elif isinstance(image, np.ndarray):
        if image.dtype != np.uint8:
            raise ValueError(f"Numpy uint8 expected, got {image.dtype}")
        if image.ndim != 3:
            raise ValueError("Currently only BGR images are supported")
        self.img = image
        self.is_bgr = True
    else:
        raise TypeError(f"Tensor or numpy.ndarray expected, got {type(image)}")

    # Set dataset metadata (e.g. class names)
    self.metadata = None
    if metalabels is not None:
        self.metadata = np.loadtxt(metalabels, dtype="str", delimiter="\n")

    self.line_width = max(round(sum(self.img.shape) / 2 * 0.003), 2)
    self.assigned_colors = Colors()
    self.output = self.img
def __init__(self, args, config, subframe_gen):
    self.frame_height, self.frame_width = config.camera.frame_height, config.camera.frame_width
    self.nbhd_height, self.nbhd_width = config.camera.nbhd_size
    n_pixels = self.nbhd_height * self.nbhd_width
    self.S = config.camera.S
    self.device = args.device
    self.max_intensity = torch.tensor(2 ** config.camera.pixel_bit_depth)

    assert self.frame_height % self.nbhd_height == 0
    assert self.frame_width % self.nbhd_width == 0
    assert self.S % n_pixels == 0

    self.subframe_gen = subframe_gen

    # self.scheme = torch.FloatTensor(config.camera.scheme)
    self.scheme = get_simple_scheme(self.nbhd_height, self.nbhd_width).to(self.device)
    # scheme = torch.tile(self.scheme, (height // self.nbhd_height, width // self.nbhd_width)).to(raw_subframes.device)
    # scheme_ = scheme_.unsqueeze(1).repeat(1, S // n_pixels, 1, 1).view(S, height, width)
    scheme_ = torch.tile(
        self.scheme,
        (self.frame_height // self.nbhd_height, self.frame_width // self.nbhd_width)
    ).to(self.device)  # shape: (n_pixels, height, width)
    self.scheme_ = scheme_.unsqueeze(1).repeat(1, self.S // n_pixels, 1, 1).view(
        self.S, self.frame_height, self.frame_width)  # shape: (S, height, width)

    self.save_output = config.camera.save_output
    self.save_dir = config.camera.save_dir
    self.batch_size = config.data.batch_size

    self.noise_std = None
    if config.camera.add_noise:
        self.noise_std = config.camera.noise_std

    if self.save_output:
        os.makedirs(self.save_dir, exist_ok=True)
def forward(self, x, target=None):
    assert x.size()[1] >= 2

    cce_prediction = self.cce_backend(x)
    # x = self.magnitude(x) * torch.nn.functional.normalize(x)

    if target is None:
        return x, cce_prediction

    x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)

    out_anchor = torch.mean(x[:, 1:, :], 1)
    out_positive = x[:, 0, :]

    ap_sim_matrix = torch.nn.functional.cosine_similarity(
        out_positive.unsqueeze(-1), out_anchor.unsqueeze(-1).transpose(0, 2))
    torch.clamp(self.w, 1e-6)  # note: the clamped result is not assigned here
    ap_sim_matrix = ap_sim_matrix * self.w + self.b1

    labels = torch.arange(0, int(out_positive.shape[0]),
                          device=torch.device("cuda:0")).unsqueeze(1)
    cos_sim_matrix = torch.mm(out_positive, out_anchor.T)
    cos_sim_matrix = cos_sim_matrix + self.b2
    cos_sim_matrix = cos_sim_matrix + numpy.log(
        1 / out_positive.shape[0] / (1 - 1 / out_positive.shape[0]))
    mask = (torch.tile(labels, (1, labels.shape[0])) == labels.T).float()

    batch_loss = self.criterion(
        ap_sim_matrix,
        torch.arange(0, int(out_positive.shape[0]), device=torch.device("cuda:0"))) \
        + self.magnet_criterion(cos_sim_matrix.flatten().unsqueeze(1),
                                mask.flatten().unsqueeze(1))
    return batch_loss, cce_prediction
def _single_interaction(self, current_input, current_baseline, current_alphas,
                        num_samples, batch_size, use_expectation, output_index,
                        interaction_index):
    """
    A helper function to compute path interactions for a single sample.

    Args:
        current_input: A single sample. Assumes that it is of shape (...)
            where ... represents the input dimensionality
        current_baseline: A tensor representing the baseline input.
        current_alphas: Which alphas to use when interpolating
        num_samples: The number of samples to draw
        batch_size: Batch size to input to the model
        use_expectation: Whether or not to sample the baseline
        output_index: Whether or not to index into a given class
        interaction_index: The index to take the interactions with respect to.
    """
    current_input = current_input.unsqueeze(0)

    current_alpha, current_beta = current_alphas
    current_alpha = torch.tensor(current_alpha).float().to(current_input.device)
    current_beta = torch.tensor(current_beta).float().to(current_input.device)
    current_alpha = torch.reshape(
        current_alpha, (num_samples,) + (1,) * (len(current_input.shape) - 1))
    current_beta = torch.reshape(
        current_beta, (num_samples,) + (1,) * (len(current_input.shape) - 1))

    attribution_array = []
    for j in range(0, num_samples, batch_size):
        number_to_draw = min(batch_size, num_samples - j)
        batch_baseline = self._sample_baseline(current_baseline, number_to_draw,
                                               use_expectation)
        batch_alpha = current_alpha[j:min(j + batch_size, num_samples)]
        batch_beta = current_beta[j:min(j + batch_size, num_samples)]

        reps = np.ones(len(current_input.shape)).astype(int)
        reps[0] = number_to_draw
        batch_input = torch.tile(current_input, tuple(reps))
        batch_baseline.requires_grad = True

        batch_attributions = self.accumulation_function(
            batch_input,
            batch_baseline,
            batch_alphas=(batch_alpha, batch_beta),
            output_index=output_index,
            second_order=True,
            interaction_index=interaction_index)
        attribution_array.append(batch_attributions.detach().cpu())

    attribution_array = np.concatenate(attribution_array, axis=0)
    attributions = np.mean(attribution_array, axis=0)
    return attributions
def tensor_indexing_ops(self):
    x = torch.randn(2, 4)
    y = torch.randn(4, 4)
    t = torch.tensor([[0, 0], [1, 0]])
    mask = x.ge(0.5)
    i = [0, 1]
    return len(
        torch.cat((x, x, x), 0),
        torch.concat((x, x, x), 0),
        torch.conj(x),
        torch.chunk(x, 2),
        torch.dsplit(torch.randn(2, 2, 4), i),
        torch.column_stack((x, x)),
        torch.dstack((x, x)),
        torch.gather(x, 0, t),
        torch.hsplit(x, i),
        torch.hstack((x, x)),
        torch.index_select(x, 0, torch.tensor([0, 1])),
        x.index(t),
        torch.masked_select(x, mask),
        torch.movedim(x, 1, 0),
        torch.moveaxis(x, 1, 0),
        torch.narrow(x, 0, 0, 2),
        torch.nonzero(x),
        torch.permute(x, (0, 1)),
        torch.reshape(x, (-1,)),
        torch.row_stack((x, x)),
        torch.select(x, 0, 0),
        torch.scatter(x, 0, t, x),
        x.scatter(0, t, x.clone()),
        torch.diagonal_scatter(y, torch.ones(4)),
        torch.select_scatter(y, torch.ones(4), 0, 0),
        torch.slice_scatter(x, x),
        torch.scatter_add(x, 0, t, x),
        x.scatter_(0, t, y),
        x.scatter_add_(0, t, y),
        # torch.scatter_reduce(x, 0, t, reduce="sum"),
        torch.split(x, 1),
        torch.squeeze(x, 0),
        torch.stack([x, x]),
        torch.swapaxes(x, 0, 1),
        torch.swapdims(x, 0, 1),
        torch.t(x),
        torch.take(x, t),
        torch.take_along_dim(x, torch.argmax(x)),
        torch.tensor_split(x, 1),
        torch.tensor_split(x, [0, 1]),
        torch.tile(x, (2, 2)),
        torch.transpose(x, 0, 1),
        torch.unbind(x),
        torch.unsqueeze(x, -1),
        torch.vsplit(x, i),
        torch.vstack((x, x)),
        torch.where(x),
        torch.where(t > 0, t, 0),
        torch.where(t > 0, t, t),
    )
def test_StackDenseFixedSizeArray(self):
    # happy path: value is type Tensor; check cast to float
    value = torch.eye(4).to(dtype=torch.int)  # start as int
    data = {"a": value}
    out = transforms.StackDenseFixedSizeArray(data.keys(), size=4)(data)
    expected = {"a": value.to(dtype=torch.float)}
    self.assertDictOfTensorEqual(out, expected)
    self.assertTrue(out["a"].dtype == torch.float, msg="dtype != float")

    # happy path: value is list w/ elements type Tuple[Tensor, Tensor]
    presence = torch.tensor([[1, 1, 1], [1, 1, 1]])
    data = {
        "a": [
            (torch.tensor([[0, 0, 0], [1, 1, 1]]), presence),
            (torch.tensor([[2, 2, 2], [3, 3, 3]]), presence),
        ],
        "b": [
            (torch.tensor([[3, 3, 3], [2, 2, 2]]), presence),
            (torch.tensor([[1, 1, 1], [0, 0, 0]]), presence),
        ],
    }
    out = transforms.StackDenseFixedSizeArray(data.keys(), size=3)(data)
    expected = {
        "a": torch.tile(torch.arange(4).view(-1, 1).to(dtype=torch.float), (1, 3)),
        "b": torch.tile(
            torch.arange(4).flip(dims=(0,)).view(-1, 1).to(dtype=torch.float),
            (1, 3),
        ),
    }
    self.assertDictOfTensorEqual(out, expected)

    # raise for tensor wrong shape
    with self.assertRaisesRegex(ValueError, "Wrong shape"):
        sdf = transforms.StackDenseFixedSizeArray(["a"], size=3)
        sdf({"a": torch.ones(2)})

    # raise for tensor wrong ndim
    with self.assertRaisesRegex(ValueError, "Wrong shape"):
        sdf = transforms.StackDenseFixedSizeArray(["a"], size=2)
        sdf({"a": torch.zeros(2, 2, 2)})
def make_mask(self, target: torch.Tensor) -> torch.Tensor:
    size = target.size(1)
    look_ahead_mask = ~(torch.triu(
        torch.ones(size, size, device=target.device)) == 1).transpose(0, 1)[:, None]
    target_padding_mask = torch.eq(target, self.vocab_size + 2)  # Pad symbol
    combined_mask = target_padding_mask | look_ahead_mask
    return torch.tile(combined_mask.permute(1, 0, 2), (self.num_heads, 1, 1))
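# Hedged illustration (not from the original class): the triu/transpose/negate trick
# used above produces a causal (look-ahead) mask; True marks positions that must be
# hidden from attention. Shown here for a sequence length of 4.
import torch

size = 4
look_ahead = ~(torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
# tensor([[False,  True,  True,  True],
#         [False, False,  True,  True],
#         [False, False, False,  True],
#         [False, False, False, False]])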
def build(self, input_shape):
    cfg = self.cfg
    tgt = input_shape[0]
    assert tgt[0] == cfg.batch_size
    # the first beam starts with log-prob 0, the remaining beams with -inf
    y = torch.tensor([[0.0] + [-float("inf")] * (cfg.beam_size - 1)])
    self._logp = torch.tile(y, [cfg.batch_size, 1])
    sh = (cfg.batch_size, cfg.beam_size)
    self._score = torch.ones(sh) * utils.big_neg
    self._flag = torch.zeros(sh, dtype=torch.bool)
    return super().build(input_shape)
def forward(self, query_feature: Dict[str, Tensor],
            passage_features: Iterable[Dict[str, Tensor]], labels: Tensor):
    n = labels.shape[1]
    query_embedding = self.query_model(query_feature)['sentence_embedding']
    # the query embedding acts as a scaling vector, so each element should be in [0, 1]
    psg_embeddings = torch.stack(
        [self.psg_model(passages)['sentence_embedding'] for passages in passage_features],
        dim=1)
    scaled_psg_embeddings = torch.tile(query_embedding.unsqueeze(1), (1, n, 1)) * psg_embeddings
    return scaled_psg_embeddings
def test():
    import numpy as np

    word_embeddings = nn.Embedding(10000, 300)
    lstm = nn.LSTM(300, 100)

    h0 = Variable(torch.zeros(1, 128, 100))
    c0 = Variable(torch.zeros(1, 128, 100))
    hidden = (h0, c0)

    sentence = Variable(torch.LongTensor(np.zeros((128, 30), dtype=np.int64)))
    embeds = word_embeddings(sentence)

    # torch.tile requires a dims argument; (1, 1) leaves the shape unchanged
    tiled = torch.tile(sentence, (1, 1))
    batch_size = sentence.size()[0]
    # x = Variable(torch.zeros(30, 128, 300))
    x = embeds.view(sentence.size()[1], batch_size, -1)
    embeds = embeds.permute(1, 0, 2)
    lstm_out, hidden = lstm(embeds, hidden)
def concatenate(tensor1, tensor2):
    """
    Concatenates two 2D or 4D tensors.

    Parameters
    ----------
    tensor1 : torch.Tensor
        2D or 4D tensor.
    tensor2 : torch.Tensor
        2D or 4D tensor.

    Returns
    -------
    torch.Tensor
        Concatenation of tensor1 and tensor2.

    Raises
    ------
    AssertionError
        If tensors do not have 2 or 4 dimensions.
    """
    assert tensor1.shape[0] == tensor2.shape[0], (
        "Tensors to concatenate must have same dim 0. Tensor1: {}. Tensor2: {}."
        .format(tensor1.shape[0], tensor2.shape[0]))

    batch_size = tensor1.shape[0]

    if tensor1.shape == tensor2.shape:
        return torch.cat((tensor1, tensor2), axis=1).float()
    elif (len(tensor1.shape) == 2) and (len(tensor2.shape) == 2):
        return torch.cat((tensor1, tensor2), axis=1).float()
    elif (len(tensor1.shape) == 4) and (len(tensor2.shape) == 2):
        y_dim = tensor2.shape[1]
        tensor2 = torch.reshape(tensor2, shape=(batch_size, y_dim, 1, 1))
        tensor2 = torch.tile(tensor2, dims=(1, 1, *tensor1.shape[2:]))
    elif (len(tensor1.shape) == 2) and (len(tensor2.shape) == 4):
        y_dim = tensor1.shape[1]
        tensor1 = torch.reshape(tensor1, shape=(batch_size, y_dim, 1, 1))
        tensor1 = torch.tile(tensor1, dims=(1, 1, *tensor2.shape[2:]))
    elif (len(tensor1.shape) == 4) and (len(tensor2.shape) == 4):
        return torch.cat((tensor1, tensor2), axis=1).float()
    else:
        raise AssertionError(
            "tensor1 and tensor2 must have 2 or 4 dimensions. Given: {} and {}."
            .format(tensor1.shape, tensor2.shape))

    return torch.cat((tensor1, tensor2), axis=1).float()
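# Hedged usage sketch (illustrative only, shapes are made up): concatenating a feature
# map with a label vector using the concatenate() helper above; the vector is reshaped
# to (N, C, 1, 1) and tiled over the spatial dimensions before channel concatenation.
import torch

feature_map = torch.randn(8, 16, 32, 32)   # (batch, channels, H, W)
labels = torch.randn(8, 10)                # (batch, y_dim)

out = concatenate(feature_map, labels)     # -> shape (8, 26, 32, 32)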
def eval_step(inputs, model):
    images = inputs['features']
    labels = inputs['labels']
    images = torch.from_numpy(images._numpy()).view(  # pylint: disable=protected-access
        eval_batch_size, 3, image_h, image_w).to(device)
    labels = torch.from_numpy(
        labels._numpy()).to(device).float().unsqueeze(-1)  # pylint: disable=protected-access

    with torch.no_grad():
        logits = torch.stack(
            [model(images) for _ in range(FLAGS.num_dropout_samples_eval)], dim=-1)

    # Logits dimension is (batch_size, 1, num_dropout_samples).
    logits = logits.squeeze()
    # It is now (batch_size, num_dropout_samples).
    probs = sigmoid(logits)

    # labels_tiled shape is (batch_size, num_dropout_samples).
    labels_tiled = torch.tile(labels, (1, FLAGS.num_dropout_samples_eval))

    log_likelihoods = -loss_fn(probs, labels_tiled)
    negative_log_likelihood = torch.mean(
        -torch.logsumexp(log_likelihoods, dim=-1) +
        torch.log(torch.tensor(float(FLAGS.num_dropout_samples_eval))))

    probs = torch.mean(probs, dim=-1)

    # Convert to NumPy for metrics updates
    negative_log_likelihood = negative_log_likelihood.detach()
    labels = labels.detach()
    probs = probs.detach()

    if device != 'cpu':
        negative_log_likelihood = negative_log_likelihood.cpu()
        labels = labels.cpu()
        probs = probs.cpu()

    negative_log_likelihood = negative_log_likelihood.numpy()
    labels = labels.numpy()
    probs = probs.numpy()

    metrics[dataset_split + '/negative_log_likelihood'].update_state(negative_log_likelihood)
    metrics[dataset_split + '/accuracy'].update_state(labels, probs)
    metrics[dataset_split + '/auprc'].update_state(labels, probs)
    metrics[dataset_split + '/auroc'].update_state(labels, probs)
    metrics[dataset_split + '/ece'].add_batch(probs, label=labels)
def meshgrid(*args, **kwargs):
    """
    meshgrid code that builds on (copies) tensorflow's meshgrid but dramatically
    improves runtime by changing the last step to tiling instead of multiplication.
    https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/python/ops/array_ops.py#L1921

    Broadcasts parameters for evaluation on an N-D grid.
    Given N one-dimensional coordinate arrays `*args`, returns a list `outputs` of N-D
    coordinate arrays for evaluating expressions on an N-D grid.

    Notes:
        `meshgrid` supports cartesian ('xy') and matrix ('ij') indexing conventions.
        When the `indexing` argument is set to 'xy' (the default), the broadcasting
        instructions for the first two dimensions are swapped.

    Examples:
        Calling `X, Y = meshgrid(x, y)` with the tensors

        ```python
        x = [1, 2, 3]
        y = [4, 5, 6]
        X, Y = meshgrid(x, y)
        # X = [[1, 2, 3],
        #      [1, 2, 3],
        #      [1, 2, 3]]
        # Y = [[4, 4, 4],
        #      [5, 5, 5],
        #      [6, 6, 6]]
        ```

    Args:
        *args: `Tensor`s with rank 1.
        **kwargs:
            - indexing: Either 'xy' or 'ij' (optional, default: 'xy').
            - name: A name for the operation (optional).

    Returns:
        outputs: A list of N `Tensor`s with rank N.

    Raises:
        TypeError: When no keyword arguments (kwargs) are passed.
        ValueError: When indexing keyword argument is not one of `xy` or `ij`.
    """
    # with ops.name_scope(name, "meshgrid", args) as name:
    ndim = len(args)
    s0 = (1,) * ndim

    # Prepare reshape by inserting dimensions with size 1 where needed
    output = []
    for i, x in enumerate(args):
        output.append(x.view(s0[:i] + (-1,) + s0[i + 1:]))

    shapes = [x.size() for x in args]
    sz = [x.size()[0] for x in args]

    # tile each reshaped coordinate tensor along all other dimensions
    for i in range(len(output)):
        output[i] = torch.tile(output[i], (*sz[:i], 1, *sz[i + 1:]))
    return output
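# Hedged usage sketch (illustrative only): expanding two 1-D coordinate tensors into a
# pair of 2-D grids with the meshgrid() defined above. Note that this implementation
# accepts but does not actually read the `indexing` keyword argument.
import torch

rows = torch.arange(3)
cols = torch.arange(4)
R, C = meshgrid(rows, cols)
print(R.shape, C.shape)   # torch.Size([3, 4]) torch.Size([3, 4])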
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')

    # Use fake train data
    args = get_args()
    sequence_length = 1024
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size,
                                  size=(args.batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (args.batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=args.hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    transformer = ParallelTransformer(
        attention_mask_func=gpt2_attention_mask_func,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    transformer_output = transformer.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')