def test_regression(sru_prev_version):
    """
    IMPORTANT: you need to run
        test/regression/build_artifact.sh [SRU VERSION]
    and commit the resulting artifact in test/regression/artifacts to GitHub,
    for each sru_prev_version you want to test.
    """
    torch.manual_seed(2)  # so the model is initialized differently than the first stage
    artifact_path = f'{ARTIFACT_DIR}/{sru_prev_version}.pt'
    artifact_dict = torch.load(artifact_path)
    assert artifact_dict['sru.__version__'] == sru_prev_version
    model = sru.SRU(**artifact_dict['sru_kwargs']).eval()
    output_artifact = artifact_dict['outputs']
    model.load_state_dict(artifact_dict['model_state'])
    with torch.no_grad():
        output_current = model(artifact_dict['inputs'])
    assert len(output_artifact) == len(output_current) == 2
    max_diff0 = (output_artifact[0] - output_current[0]).abs().max().item()
    max_diff1 = (output_artifact[1] - output_current[1]).abs().max().item()
    assert max_diff0 <= EPSILON
    assert max_diff1 <= EPSILON
def __init__(self, words, args):
    super(Model, self).__init__()
    self.args = args
    if args.n_e:
        self.n_e = args.n_e
    else:
        self.n_e = len(words) if len(words) < args.n_d else args.n_d
    self.n_d = args.n_d
    self.depth = args.depth
    self.drop = nn.Dropout(args.dropout)
    self.embedding_layer = nn.Embedding(len(words), self.n_e)
    self.n_V = len(words)
    custom_m_list = [CustomLinear(self.n_e, self.n_d * 4, bias=False)]
    for i in range(self.depth - 1):
        custom_m_list.append(
            flop.ProjectedLinear(self.n_d, self.n_d * 3,
                                 proj_features=args.n_proj, bias=False))
    self.rnn = sru.SRU(
        self.n_e,
        self.n_d,
        self.depth,
        dropout=args.dropout,
        highway_bias=args.bias,
        layer_norm=args.layer_norm,
        rescale=args.rescale,
        custom_m=custom_m_list,
    )
    self.output_layer = nn.Linear(self.n_d, self.n_V)
    self.init_weights()
def __init__(self, words, args):
    super(Model, self).__init__()
    self.args = args
    if args.n_e:
        self.n_e = args.n_e
    else:
        self.n_e = len(words) if len(words) < args.n_d else args.n_d
    self.n_d = args.n_d
    self.depth = args.depth
    self.drop = nn.Dropout(args.dropout)
    self.embedding_layer = nn.Embedding(len(words), self.n_e)
    self.n_V = len(words)
    if args.lstm:
        self.rnn = nn.LSTM(self.n_e, self.n_d, self.depth,
                           dropout=args.dropout)
    else:
        self.rnn = sru.SRU(self.n_e, self.n_d, self.depth,
                           dropout=args.dropout,
                           n_proj=args.n_proj,
                           # use_tanh=0,
                           highway_bias=args.bias,
                           layer_norm=args.layer_norm)
    self.output_layer = nn.Linear(self.n_d, self.n_V)
    self.init_weights()
def __init__(self, words, args):
    super(Model, self).__init__()
    self.args = args
    self.n_d = args.d
    self.depth = args.depth
    self.drop = nn.Dropout(args.dropout)
    self.embedding_layer = EmbeddingLayer(self.n_d, words)
    self.n_V = self.embedding_layer.n_V
    if args.lstm:
        self.rnn = nn.LSTM(self.n_d, self.n_d, self.depth,
                           dropout=args.rnn_dropout)
    else:
        self.rnn = sru.SRU(
            self.n_d,
            self.n_d,
            self.depth,
            dropout=args.rnn_dropout,
            rnn_dropout=args.rnn_dropout,
            use_tanh=0,
            rescale=False,  # make sure the behavior is the same as before
            v1=True,
            highway_bias=args.bias)
    self.output_layer = nn.Linear(self.n_d, self.n_V)
    # tie weights
    self.output_layer.weight = self.embedding_layer.embedding.weight
    self.init_weights()
def __init__(self, labels):
    super().__init__()
    self.labels = labels
    self.output_numbers = max(labels.values()) + 1
    self.rnn_size = self.output_numbers

    print_normal("Creating resSru with " + str(self.output_numbers) + " labels")

    self.convolutions = torch.nn.Sequential(OrderedDict([
        ('conv1', torch.nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)),
        ('bn1', torch.nn.BatchNorm2d(64)),
        ('activation', torch.nn.ReLU(inplace=True)),
        ('maxpool', torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=(1, 1))),
        ('resnet', ResNet(BasicBlock, [2, 2, 2, 2], strides=[1, (2, 1), (2, 1), (2, 1)], bn=True)),
    ]))
    self.convolutions_output_size = self.get_cnn_output_size()

    self.rnn = sru.SRU(self.convolutions_output_size[1] * self.convolutions_output_size[2],
                       self.output_numbers,
                       num_layers=4,
                       bidirectional=False,
                       rnn_dropout=0.3,
                       use_tanh=1,
                       use_relu=0,
                       layer_norm=False,
                       weight_norm=True)
    # self.rnn = torch.nn.GRU(self.convolutions_output_size[1] * self.convolutions_output_size[2], self.rnn_size, num_layers=1, bidirectional=True)
    # self.rnn = IndRNN(self.convolutions_output_size[1] * self.convolutions_output_size[2], self.rnn_size, n_layer=3, bidirectional=True, batch_norm=True, batch_first=True, dropout=0.1, nonlinearity='relu')
    # self.fc = torch.nn.Linear(2 * self.rnn_size, self.output_numbers)
    self.softmax = torch.nn.Softmax(dim=2)
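# A hedged forward() sketch for the class above; it is NOT part of the original
# snippet. The permute/reshape layout (reading the CNN feature map column by
# column as the time axis) is an assumption inferred from the SRU input size
# `convolutions_output_size[1] * convolutions_output_size[2]` (channels * height).
def forward(self, x):
    # x: (batch, 3, height, width) input images -- assumed layout
    x = self.convolutions(x)                      # (batch, channels, h, w)
    batch, channels, h, w = x.size()
    # treat width as the sequence axis: (w, batch, channels * h)
    # matches the SRU input_size declared in __init__ above
    x = x.permute(3, 0, 1, 2).reshape(w, batch, channels * h)
    outputs, _ = self.rnn(x)                      # (w, batch, output_numbers)
    return self.softmax(outputs)                  # per-timestep class probabilities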
def test_all(bidirectional, rescale, proj, layer_norm):
    eps = 1e-4
    torch.manual_seed(1234)
    L = 16
    B = 8
    D = 32
    x = torch.randn(L, B, D)
    model = sru.SRU(D, D,
                    bidirectional=bidirectional,
                    projection_size=proj,
                    layer_norm=layer_norm,
                    rescale=rescale)
    model.eval()

    h, c = model(x)
    h, c = h.detach(), c.detach()

    with torch.no_grad():
        h_, c_ = model(x)
        assert (h - h_).abs().max() <= eps
        assert (c - c_).abs().max() <= eps

    ts_model = torch.jit.script(model)
    h_, c_ = ts_model(x)
    assert (h - h_).abs().max() <= eps
    assert (c - c_).abs().max() <= eps
def test_all(cuda, bidirectional, rescale, proj, layer_norm):
    eps = 1e-4
    torch.manual_seed(1234)
    if cuda:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    L = 16
    B = 8
    D = 32
    x = torch.randn(L, B, D)
    model = sru.SRU(D, D,
                    bidirectional=bidirectional,
                    projection_size=proj,
                    layer_norm=layer_norm,
                    rescale=rescale)
    if cuda:
        model = model.cuda()
        x = x.cuda()
    model.eval()

    h, c = model(x)
    h, c = h.detach(), c.detach()

    with torch.no_grad():
        h_, c_ = model(x)
        assert (h - h_).abs().max() <= eps
        assert (c - c_).abs().max() <= eps

    ts_model = torch.jit.script(model)
    h_, c_ = ts_model(x)
    assert (h - h_).abs().max() <= eps
    assert (c - c_).abs().max() <= eps
def run(args):
    torch.manual_seed(1)
    batch_size = 3
    input_size = 5
    hidden_size = 7
    seq_len = 4
    num_layers = 2
    sru_kwargs = {
        'input_size': input_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'bidirectional': True,
        'dropout': 0.1,
        'rescale': False,
    }
    inputs = torch.rand(seq_len, batch_size, input_size)
    model = sru.SRU(**sru_kwargs).eval()
    with torch.no_grad():
        outputs = model(inputs)
    artifact_dict = {
        'outputs': outputs,
        'inputs': inputs,
        'model_state': model.state_dict(),
        'sru_kwargs': sru_kwargs,
        'sru.__version__': sru.__version__,
    }
    torch.save(artifact_dict, args.out_artifact)
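# A hedged sketch of a command-line entry point for run() above. The
# `--out-artifact` flag name is an assumption chosen only to populate
# `args.out_artifact`; the actual build_artifact script may differ.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Build an SRU regression artifact')
    parser.add_argument('--out-artifact', required=True,
                        help='path to write the .pt artifact, '
                             'e.g. test/regression/artifacts/<version>.pt')
    run(parser.parse_args())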
def test_sru_backward_simple(cuda, bidirectional, layer_norm,
                             normalize_after, rescale, has_skip_term):
    torch.manual_seed(123)
    if cuda:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    input_length = 3
    batch_size = 5
    input_size = 4
    hidden_size = 2
    encoder = sru.SRU(input_size, hidden_size,
                      bidirectional=bidirectional,
                      layer_norm=layer_norm,
                      normalize_after=normalize_after,
                      rescale=rescale,
                      has_skip_term=has_skip_term)
    if cuda:
        encoder = encoder.cuda()

    def run(x):
        if cuda:
            x = x.cuda()
        output, state = encoder(x)
        output.mean().backward()

    # test batch size > 1
    input_data = torch.rand(input_length, batch_size, input_size)
    run(input_data)
def test_sru_backward(bidirectional, layer_norm, normalize_after):
    eps = 1e-4
    torch.manual_seed(123)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    input_length = 3
    batch_size = 5
    input_size = 4
    hidden_size = 2
    encoder = sru.SRU(input_size, hidden_size,
                      bidirectional=bidirectional,
                      layer_norm=layer_norm,
                      normalize_after=normalize_after)
    x = torch.randn(input_length, batch_size, input_size)

    # backward in CPU mode
    h, c = encoder(x)
    h.sum().backward()
    grads = [p.grad.clone() for p in encoder.parameters() if p.requires_grad]

    # backward in GPU mode
    encoder.zero_grad()
    encoder, x = encoder.cuda(), x.cuda()
    h_, c_ = encoder(x)
    h_.sum().backward()
    grads_ = [p.grad.cpu().clone() for p in encoder.parameters()
              if p.requires_grad]

    assert len(grads) == len(grads_)
    for g1, g2 in zip(grads, grads_):
        assert (g1 - g2).abs().max() <= eps
def run(args):
    D = 4
    model = sru.SRU(D, D, num_layers=2, normalize_after=args.normalize_after)
    model.eval()

    ts_model = torch.jit.script(model)
    ts_model.save('sru_ts.pt')

    with torch.no_grad():
        x = torch.ones(3, 2, D)
        h, c = model(x)
        h, c = h.view(-1), c.view(-1)
        print(''.join(["{:.4f} ".format(x.item()) for x in h]))
        print(''.join(["{:.4f} ".format(x.item()) for x in c]))
def __init__(self, args):
    super(Model, self).__init__()
    self.args = args
    # self.cutoffs = [20000, 60000]
    self.cutoffs = [10000, 20000, 40000, 60000, 100000]
    self.n_V = args.n_token
    self.n_e = args.n_e or args.n_proj
    self.n_d = args.n_d
    self.depth = args.depth
    self.drop = nn.Dropout(args.dropout)
    self.embedding_layer = AdaptiveEmbedding(
        self.n_V,
        self.n_e,
        self.n_d,
        self.cutoffs,
        div_val=args.div_val,
        div_freq=2,
        dropout=args.dropout_e,
    )
    self.rnn = sru.SRU(
        self.n_d,
        self.n_d,
        self.depth,
        projection_size=args.n_proj,
        dropout=args.dropout,
        highway_bias=args.bias,
        layer_norm=args.layer_norm,
        rescale=args.rescale,
        custom_m=flop.ProjectedLinear(self.n_d, self.n_d * 3,
                                      proj_features=args.n_proj, bias=False),
    )
    self.output_layer = AdaptiveLogSoftmax(
        self.n_V,
        self.n_e,
        self.n_d,
        self.cutoffs,
        div_val=args.div_val,
        div_freq=2,
        dropout=args.dropout_e,
        keep_order=False,
    )
    self.init_weights()
    if not args.not_tie:
        self.tie_weights()
def run(cuda, compat):
    eps = 1e-4
    num_sentences = 3
    embedding_size = 7
    rnn_hidden = 4
    max_len = 4
    layers = 5
    bidirectional = True
    encoder = sru.SRU(
        embedding_size,
        rnn_hidden,
        layers,
        bidirectional=bidirectional,
        nn_rnn_compatible_return=compat,
    )
    words_embeddings = torch.rand((max_len, num_sentences, embedding_size),
                                  dtype=torch.float32)
    if cuda:
        words_embeddings = words_embeddings.to("cuda")
        encoder.cuda()
    encoder.eval()
    hidden, cell = encoder(words_embeddings)

    def cell_to_emb(cell, batch_size):
        if compat:
            # should arrive as:
            # (num_layers * num_directions, batch, hidden_size)
            cell = cell.view(layers, 2 if bidirectional else 1,
                             batch_size, rnn_hidden)
            cell = cell[-1].transpose(0, 1)  # (batch, num_directions, hidden_size)
            cell = cell.contiguous().view(batch_size, -1)
        else:
            # should arrive as:
            # (num_layers, batch_size, num_directions * hidden_size)
            cell = cell[-1].view(batch_size, -1)
        return cell

    scores = cell_to_emb(cell, num_sentences)
    for i in range(num_sentences):
        hidden, cell = encoder(words_embeddings[:, i:i + 1])
        score = cell_to_emb(cell, 1)
        assert (score.detach() - scores[i].detach()).abs().max() <= eps
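# Hypothetical invocations of run() above. In the original, `cuda` and `compat`
# were free variables, presumably supplied by a test harness or pytest
# parametrization; here they are passed explicitly as arguments.
run(cuda=False, compat=True)   # nn.LSTM-compatible cell-state layout
run(cuda=False, compat=False)  # native SRU cell-state layout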
def __init__(self, vocabulary_size, embedding_dim=1024, dropout=0.5,
             rnn_dropout=0.1, depth=2, bias=-3, trainable=False):
    """
    vocabulary_size -- size of the Embedding Layer input (number of words)
    embedding_dim -- size of the Embedding Layer output (n_d)
    dropout -- fraction of neurons to disable in the dropout layer
    rnn_dropout -- fraction of neurons to disable inside each RNN layer
    depth -- number of RNN layers
    bias -- initial highway bias of the SRU gates
    trainable -- if True, add a trainable output projection layer
    """
    super(Model, self).__init__()
    self.embedding_dim = embedding_dim
    self.depth = depth
    self.bias = bias
    self.trainable = trainable
    self.relu = nn.ReLU()
    self.rnn = sru.SRU(
        input_size=embedding_dim,
        hidden_size=embedding_dim,
        num_layers=depth,
        dropout=dropout,
        rnn_dropout=rnn_dropout,
        bidirectional=True,
        rescale=False,  # disable output rescaling, matching earlier SRU behavior
        v1=True,        # use the v1 cell formulation, also for backward compatibility
        highway_bias=bias)
    if trainable:
        self.output_layer = nn.Linear(embedding_dim, vocabulary_size)
    self.drop = nn.Dropout(dropout)
    self.init_weights()
import torch
import sru

D = 4
model = sru.SRU(D, D, num_layers=2)
model.eval()

ts_model = torch.jit.script(model)
ts_model.save('sru_ts.pt')

with torch.no_grad():
    x = torch.ones(3, 2, D)
    h, c = model(x)
    h, c = h.view(-1), c.view(-1)
    print(''.join(["{:.4f} ".format(x.item()) for x in h]))
    print(''.join(["{:.4f} ".format(x.item()) for x in c]))
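# A minimal sketch (not part of the original script) showing how the exported
# module can be loaded back and checked against the eager model. Only the
# standard torch.jit.load API is used; the tolerance is an assumption.
loaded = torch.jit.load('sru_ts.pt')
loaded.eval()
with torch.no_grad():
    h2, c2 = loaded(torch.ones(3, 2, D))
# scripted and eager outputs should agree to within numerical noise
assert (h - h2.view(-1)).abs().max().item() < 1e-6
assert (c - c2.view(-1)).abs().max().item() < 1e-6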