Example #1
0
def train_loop_fn(data_loader, model, optimizer, scheduler=None):
    """Run one training epoch and return the mean per-batch loss.

    Each batch is moved to ``config.DEVICE`` (when set), run through the
    model, and used for a backward/optimizer step; when a ``scheduler`` is
    supplied it is stepped once per batch.
    """
    model.train()
    device = config.DEVICE
    batch_losses = []
    for batch in data_loader:
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']

        if device:
            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        preds = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
        if scheduler:
            scheduler.step()
    return np.mean(batch_losses)
Example #2
0
    def prepare_model(self):
        """Build the session-based GRU recommendation graph (TF1 style).

        Defines the input placeholders, an item-embedding lookup, a single
        GRU layer, and an output projection, then wires up either a TOP1
        loss or a softmax cross-entropy loss depending on ``self.loss_type``,
        finishing with an Adam training op on ``self.cost``.
        """
        # --- placeholders -------------------------------------------------
        self.rnn_x = tf.placeholder(tf.int32, [None, None], name='input')
        self.rnn_y = tf.placeholder(tf.int64, [None, self.num_items], name='output')
        self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
        self.keep_prob_input = tf.placeholder(tf.float32, name='keep_prob_input')
        self.keep_prob_ho = tf.placeholder(tf.float32, name='keep_prob_ho')
        self.batch_var_length = tf.placeholder(tf.int32, name="variable_length")

        # Wemb: item embedding table; W_output/b_output: projection from the
        # final hidden state back to per-item scores.
        Wemb = tf.get_variable('Wemb', [self.num_items, self.embedding_size], initializer=self.embed_init)
        W_output = tf.get_variable('W_output', [1 * self.rnn_hidden_size, self.num_items], initializer=self.weight_init)
        b_output = tf.get_variable('b_output', [1, self.num_items], initializer=self.bias_init)

        # Embed the input item ids and apply input dropout.
        emb = tf.nn.embedding_lookup(Wemb, self.rnn_x)
        emb = tf.nn.dropout(emb, self.keep_prob_input)

        # Single GRU layer; sequences are padded, so the true per-example
        # lengths are supplied through `batch_var_length`.
        custom_cell = tf.contrib.rnn.GRUCell(num_units=self.rnn_hidden_size)
        outputs, states = tf.nn.dynamic_rnn(custom_cell, emb, sequence_length=self.batch_var_length,dtype=tf.float32)

        self.outputs = outputs
        self.last_hidden = states  # final GRU state (presumably [batch, rnn_hidden_size] — confirm)

        # --- output head + loss ------------------------------------------
        if self.loss_type == "TOP1":
            proj = tf.nn.dropout(self.last_hidden, self.keep_prob_ho)
            pred = tf.matmul(proj, W_output) + b_output
            self.pred = tf.nn.tanh(pred)
            self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
        elif self.loss_type == "CE":
            proj = tf.nn.dropout(self.last_hidden, self.keep_prob_ho)
            pred = tf.matmul(proj, W_output) + b_output
            self.pred = tf.nn.softmax(pred)
            # Loss is fed the raw logits `pred`, not the softmaxed `self.pred`
            # — softmax_cross_entropy_with_logits applies softmax internally.
            self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=self.rnn_y))

        self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
def main(argv=None):
    """Evaluate saved subject-verb-agreement models on deeper test sequences.

    For every model type / maximum trained depth / replica, the checkpointed
    weights are loaded and accuracy is measured on sequences of increasing
    test depth; the collected data points are pickled to disk.

    Fixes vs. the original: removed the dangling unterminated triple-quote
    at the end (a SyntaxError), dropped the unused ``data_points_acc`` local,
    hoisted ``model.eval()`` out of the trial loop, and wrapped inference in
    ``torch.no_grad()`` so no autograd graphs are built.
    """
    print("Running on {}".format(device))
    parser = argparse.ArgumentParser(
        description="Train a transformer for a copy task"
    )
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    add_auxiliary_arguments(parser)
    args = parser.parse_args(argv)
    print("args:\n-----\n", args)

    data_points = []
    n_of_each_model = 10  # replicas trained per (type, depth)
    n_trials = 8          # evaluation sequences per test depth
    for model_type in ['transformer', 'lstm', 'rnn']:
        for max_trained_depth in range(1, 12):
            for ii in range(n_of_each_model):
                print(f'dep{max_trained_depth}_ii_{ii}')
                if model_type == "transformer":
                    d_model = 16
                    model = SequencePredictorRecurrentTransformer(
                                d_model=d_model, n_classes=5,
                                sequence_length=args.sequence_length,
                                attention_type=args.attention_type,
                                n_layers=args.n_layers,
                                n_heads=args.n_heads,
                                d_query=d_model,  # used to be d_query
                                dropout=args.dropout,
                                softmax_temp=None,
                                attention_dropout=args.attention_dropout,
                            )
                else:
                    d_model = 8
                    model = SequencePredictorRNN(
                                d_model=d_model, n_classes=5,
                                n_layers=args.n_layers,
                                dropout=args.dropout,
                                rnn_type=model_type
                            )
                print(f"Created model:\n{model}")
                model.to(device)
                model.load_state_dict(torch.load(
                    f"models_from_colab/agreement_models/model_{model_type}_depth_{max_trained_depth}_num_{ii}.zip",
                    map_location=device)['model_state'])
                model.eval()  # once per model, not once per trial
                for test_depth in range(1, 21):  # was 1, 32
                    stack_size = test_depth  # test longer / shorter sequences
                    n_correct = 0
                    with torch.no_grad():  # inference only
                        for i_trial in range(n_trials):
                            x, y, m = SubjectVerbAgreement.get_seq(stack_size)
                            yhat = model(x.unsqueeze(1))
                            loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1))
                            n_correct += acc
                    data_points.append({'model_type': model_type, 'max_trained_depth': max_trained_depth,
                                        'test_depth': test_depth, 'accuracy': n_correct / n_trials})

    print("data points")
    print(data_points)
    with open("data_points_pr_acc_r.txt", "wb") as fp:
        pickle.dump(data_points, fp)
Example #4
0
def eval_loop_fn(data_loader, model):
    """Evaluate ``model`` over ``data_loader``.

    Returns a tuple ``(outputs, targets, mean_loss)`` where outputs/targets
    are the vertically stacked per-batch numpy arrays.

    Fix vs. the original: the whole loop now runs under ``torch.no_grad()``
    — an evaluation pass has no use for autograd bookkeeping, which wasted
    memory and time.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    losses = []
    device = config.DEVICE
    with torch.no_grad():
        for batch_idx, data in enumerate(data_loader):
            ids = data['ids']
            mask = data['mask']
            token_type_ids = data['token_type_ids']
            targets = data['targets']

            if device:
                ids = ids.to(device, dtype=torch.long)
                mask = mask.to(device, dtype=torch.long)
                token_type_ids = token_type_ids.to(device, dtype=torch.long)
                targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())
    return np.vstack(fin_outputs), np.vstack(fin_targets), np.mean(losses)
Example #5
0
def train(model, iterator, optimizer):
    """Train for one epoch over ``iterator``.

    Returns ``(mean_loss, mean_accuracy, mean_f1)`` averaged over batches.
    Relies on the module-level ``device``, ``hidden``, ``loss_fn`` and
    ``accuracy`` helpers.
    """
    model.train()
    total_loss = 0.0
    total_acc = 0.0
    total_f1 = 0.0

    for words, labels, lens in iterator:
        words, labels = words.to(device), labels.to(device)

        optimizer.zero_grad()
        predictions = model(words.long(), hidden)
        batch_loss = loss_fn(predictions, labels)
        # binary accuracy and F1 for this batch
        acc, f1 = accuracy(predictions, labels)

        # backpropagate and apply the gradient step
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += acc
        total_f1 += f1

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches, total_f1 / n_batches
Example #6
0
 def forward(self, ids, attention_mask, type_ids=None, label=None):
     """Encode with BERT and project the pooled output.

     Returns the loss against ``label`` when one is provided, otherwise the
     raw logits. ``type_ids`` is accepted for interface compatibility but
     unused.
     """
     encoded = self.bert(ids, attention_mask)
     logits = self.l0(self.dropout(encoded[1]))
     if label is None:
         return logits
     return loss_fn(logits, label)
def train(dataset, data_loader, model, optimizer):
    """Train ``model`` for one epoch on the three-headed grapheme task.

    Returns ``(mean_loss, macro_recall_score)`` over the epoch.

    Fixes vs. the original: the loss is accumulated as a Python float
    (``loss.item()``) — ``final_loss += loss`` kept every batch's autograd
    graph alive for the whole epoch; stored outputs are ``detach()``-ed for
    the same reason; the ``DEVICE`` constant is bound once, not per batch.
    """
    model.train()
    final_loss = 0.0
    counter = 0
    final_outputs = []
    final_targets = []
    DEVICE = "cuda"  # hoisted out of the loop

    for bi, d in tqdm(enumerate(data_loader),
                      total=int(len(dataset) / data_loader.batch_size)):
        counter = counter + 1
        image = d["image"].to(DEVICE, dtype=torch.float)
        grapheme_root = d["grapheme_root"].to(DEVICE, dtype=torch.long)
        vowel_diacritic = d["vowel_diacritic"].to(DEVICE, dtype=torch.long)
        consonant_diacritic = d["consonant_diacritic"].to(DEVICE, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(image)
        targets = (grapheme_root, vowel_diacritic, consonant_diacritic)
        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()

        # Accumulate a float, not the graph-attached tensor.
        final_loss += loss.item()

        o1, o2, o3 = outputs
        t1, t2, t3 = targets
        # detach() so stored activations don't pin the autograd graph.
        final_outputs.append(torch.cat((o1, o2, o3), dim=1).detach())
        final_targets.append(torch.stack((t1, t2, t3), dim=1))

    final_outputs = torch.cat(final_outputs)
    final_targets = torch.cat(final_targets)

    print("=================Train=================")
    macro_recall_score = macro_recall(final_outputs, final_targets)

    return final_loss / counter, macro_recall_score
Example #8
0
def test(model, iterator):
    """Evaluate ``model`` on ``iterator``.

    Returns ``(mean_loss, mean_accuracy, mean_f1)`` averaged over batches.
    Mirrors the ``train()`` counterpart (same ``device``, ``hidden``,
    ``loss_fn`` and ``accuracy`` helpers).
    """
    # FIX: the train() counterpart calls model.train(), but this function
    # never switched back — dropout/batch-norm stayed in training mode
    # during evaluation.
    model.eval()
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    with torch.no_grad():
        for words, labels, lens in iterator:
            words, labels = words.to(device), labels.to(device)

            pred = model(words.long(), hidden)
            loss = loss_fn(pred, labels)
            # binary accuracy and F1 for this batch
            (acc, f1) = accuracy(pred, labels)

            running_loss += loss.item()
            running_acc += acc
            running_f1 += f1

    return running_loss / len(iterator), running_acc / len(
        iterator), running_f1 / len(iterator)
def evaluate(dataset, data_loader, model):
    """Validate ``model`` on the three-headed grapheme task.

    Returns ``(mean_loss, macro_recall_score)``.

    Fixes vs. the original: ``DEVICE`` is bound once instead of every batch,
    and the loss is accumulated as a Python float (``loss.item()``) rather
    than as a tensor, so the function returns a plain number.
    """
    with torch.no_grad():
        model.eval()
        final_loss = 0.0
        counter = 0
        final_outputs = []
        final_targets = []
        DEVICE = "cuda"  # hoisted out of the loop
        for bi, d in tqdm(enumerate(data_loader),
                          total=int(len(dataset) / data_loader.batch_size)):
            counter = counter + 1
            image = d["image"].to(DEVICE, dtype=torch.float)
            grapheme_root = d["grapheme_root"].to(DEVICE, dtype=torch.long)
            vowel_diacritic = d["vowel_diacritic"].to(DEVICE, dtype=torch.long)
            consonant_diacritic = d["consonant_diacritic"].to(DEVICE,
                                                              dtype=torch.long)

            outputs = model(image)
            targets = (grapheme_root, vowel_diacritic, consonant_diacritic)
            loss = loss_fn(outputs, targets)
            final_loss += loss.item()

            o1, o2, o3 = outputs
            t1, t2, t3 = targets
            final_outputs.append(torch.cat((o1, o2, o3), dim=1))
            final_targets.append(torch.stack((t1, t2, t3), dim=1))

        final_outputs = torch.cat(final_outputs)
        final_targets = torch.cat(final_targets)

        print("=================VALID============")
        macro_recall_score = macro_recall(final_outputs, final_targets)

    return final_loss / counter, macro_recall_score
Example #10
0
    def build(self):
        """Build the conv1d pitch/rhythm graph, its losses, and train ops.

        Depending on ``self.model_type``:
          * ``"combine"``  — the two conv streams are concatenated, share a
            fully-connected trunk, and a single train op minimizes the
            summed loss.
          * ``"separate"`` — pitch and rhythm each get their own FC trunk
            and their own train op.
        Ends by calling ``self.execute()``.

        Fixes vs. the original: in the "combine" branch the second dropout
        was applied to ``fc1``, leaving ``fc2`` dead code — it now feeds
        from ``fc2``; tensor-vs-None checks use ``is not None``.
        """
        # p = pitch, r = rhythm
        self.n_p_inputs = len(self.X_tr_p[0][0])
        self.n_p_outputs = len(self.y_tr_p[0][0])
        self.n_r_inputs = len(self.X_tr_r[0][0])
        self.n_r_outputs = len(self.y_tr_r[0][0])

        self.X_p = tf.placeholder(tf.float32, [None, None, self.n_p_inputs],
                                  name="X_p")
        self.y_p = tf.placeholder(tf.float32, [None, None, self.n_p_outputs],
                                  name="y_p")

        self.X_r = tf.placeholder(tf.float32, [None, None, self.n_r_inputs],
                                  name="X_r")
        self.y_r = tf.placeholder(tf.float32, [None, None, self.n_r_outputs],
                                  name="Y_r")

        # CNN pitch
        network_p = tf.layers.conv1d(inputs=self.X_p,
                                     filters=12,
                                     kernel_size=8,
                                     padding='valid',
                                     activation=tf.nn.relu,
                                     name='conv_p_1')

        # CNN rhythm
        network_r = tf.layers.conv1d(inputs=self.X_r,
                                     filters=12,
                                     kernel_size=8,
                                     padding='valid',
                                     activation=tf.nn.relu,
                                     name='conv_r_1')

        # batch normalization parameters
        self.is_training = tf.placeholder(tf.bool,
                                          shape=(),
                                          name="is_training")

        bn_params = {
            "is_training": self.is_training,
            "decay": 0.99,
            "updates_collections": None,
            "scale": True
        }
        bn_params_out = {
            "is_training": self.is_training,
            "decay": 0.99,
            "updates_collections": None
        }

        if self.model_type == "combine":

            combined = tf.concat([network_p, network_r], axis=2)

            # fully connected trunk shared by both streams
            n_hidden_comb1 = 128
            n_hidden_comb2 = 128
            keep_prob = 0.7

            # 24 = 12 pitch filters + 12 rhythm filters
            stacked_combined = tf.reshape(combined, [-1, 24],
                                          name='stacked_outs_p')

            fc1 = fully_connected(stacked_combined,
                                  num_outputs=n_hidden_comb1,
                                  activation_fn=tf.nn.elu,
                                  normalizer_fn=batch_norm,
                                  normalizer_params=bn_params,
                                  scope='comb_1')

            drop1 = tf.contrib.layers.dropout(fc1,
                                              keep_prob,
                                              is_training=self.is_training)

            fc2 = fully_connected(drop1,
                                  num_outputs=n_hidden_comb2,
                                  activation_fn=tf.nn.elu,
                                  normalizer_fn=batch_norm,
                                  normalizer_params=bn_params,
                                  scope='comb_2')

            # BUG FIX: the original passed `fc1` here, so the second FC
            # layer (`fc2`) never took effect.
            drop2 = tf.contrib.layers.dropout(fc2,
                                              keep_prob,
                                              is_training=self.is_training)
            outs_p = drop2
            outs_r = drop2

        elif self.model_type == "separate":
            # pitch trunk
            n_neurons_p1 = 256
            n_neurons_p2 = 256
            keep_prob_p = 0.7

            cnn_outs_p = tf.reshape(network_p, [-1, 12], name='cnn_outs_p')

            fc1_p = fully_connected(cnn_outs_p,
                                    num_outputs=n_neurons_p1,
                                    activation_fn=tf.nn.elu,
                                    normalizer_fn=batch_norm,
                                    normalizer_params=bn_params,
                                    scope='fc1_p1')

            drop_p = tf.contrib.layers.dropout(fc1_p,
                                               keep_prob_p,
                                               is_training=self.is_training)

            fc2_p = fully_connected(drop_p,
                                    num_outputs=n_neurons_p2,
                                    activation_fn=tf.nn.elu,
                                    normalizer_fn=batch_norm,
                                    normalizer_params=bn_params,
                                    scope='fc2_p1')

            outs_p = tf.contrib.layers.dropout(fc2_p,
                                               keep_prob_p,
                                               is_training=self.is_training)

            # rhythm trunk
            n_neurons_r1 = 256
            n_neurons_r2 = 256
            keep_prob_r = 0.7

            cnn_outs_r = tf.reshape(network_r, [-1, 12], name='cnn_outs_r')

            fc1_r = fully_connected(cnn_outs_r,
                                    num_outputs=n_neurons_r1,
                                    activation_fn=tf.nn.elu,
                                    normalizer_fn=batch_norm,
                                    normalizer_params=bn_params,
                                    scope='fc1_r1')

            drop_r = tf.contrib.layers.dropout(fc1_r,
                                               keep_prob_r,
                                               is_training=self.is_training)

            fc2_r = fully_connected(drop_r,
                                    num_outputs=n_neurons_r2,
                                    activation_fn=tf.nn.elu,
                                    normalizer_fn=batch_norm,
                                    normalizer_params=bn_params,
                                    scope='fc2_r1')

            outs_r = tf.contrib.layers.dropout(fc2_r,
                                               keep_prob_r,
                                               is_training=self.is_training)

        # fully connected pitch head
        n_hidden_1_p = 48
        n_hidden_2_p = 32
        keep_prob_p = 0.6

        stacked_logits_p1 = fully_connected(outs_p,
                                            num_outputs=n_hidden_1_p,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_p1')

        p_drop1 = tf.contrib.layers.dropout(stacked_logits_p1,
                                            keep_prob_p,
                                            is_training=self.is_training)

        stacked_logits_p2 = fully_connected(p_drop1,
                                            num_outputs=n_hidden_2_p,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_p2')

        p_drop2 = tf.contrib.layers.dropout(stacked_logits_p2,
                                            keep_prob_p,
                                            is_training=self.is_training)

        stacked_logits_p3 = fully_connected(p_drop2,
                                            num_outputs=self.n_p_outputs,
                                            activation_fn=None,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params_out,
                                            scope='dense_p_out')

        self.logits_p = tf.reshape(
            stacked_logits_p3,
            [-1, tf.shape(self.y_p)[1], self.n_p_outputs],
            name='logits_p')

        # fully connected rhythm head
        n_hidden_1_r = 48
        n_hidden_2_r = 32
        keep_prob_r = 0.6

        stacked_logits_r1 = fully_connected(outs_r,
                                            n_hidden_1_r,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_r1')

        r_drop1 = tf.contrib.layers.dropout(stacked_logits_r1,
                                            keep_prob_r,
                                            is_training=self.is_training)

        stacked_logits_r2 = fully_connected(r_drop1,
                                            n_hidden_2_r,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_r2')

        r_drop2 = tf.contrib.layers.dropout(stacked_logits_r2,
                                            keep_prob_r,
                                            is_training=self.is_training)

        stacked_logits_r3 = fully_connected(r_drop2,
                                            self.n_r_outputs,
                                            activation_fn=None,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params_out,
                                            scope='dense_r_out')

        self.logits_r = tf.reshape(
            stacked_logits_r3,
            [-1, tf.shape(self.y_r)[1], self.n_r_outputs],
            name='logits_r')

        # loss params
        learn_rate = 0.02
        clip = 5

        # loss
        self.loss_r = loss_fn(self.logits_r, self.y_r)
        self.loss_p = loss_fn(self.logits_p, self.y_p)
        self.total_loss = tf.add(self.loss_r, self.loss_p)

        # training op(s) with per-variable gradient norm clipping;
        # `is not None` — comparing a Tensor to None with != is unreliable.
        if self.model_type == "combine":

            optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
            gradients = optimizer.compute_gradients(self.total_loss)
            capped_gradients = [(tf.clip_by_norm(grad, clip),
                                 var) if grad is not None else (grad, var)
                                for grad, var in gradients]
            self.train_op = optimizer.apply_gradients(capped_gradients)

        elif self.model_type == "separate":

            optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)

            # rhythm
            gradients_r = optimizer.compute_gradients(self.loss_r)
            capped_gradients_r = [(tf.clip_by_norm(grad, clip),
                                   var) if grad is not None else (grad, var)
                                  for grad, var in gradients_r]
            self.train_op_r = optimizer.apply_gradients(capped_gradients_r)

            # pitch
            gradients_p = optimizer.compute_gradients(self.loss_p)
            capped_gradients_p = [(tf.clip_by_norm(grad, clip),
                                   var) if grad is not None else (grad, var)
                                  for grad, var in gradients_p]
            self.train_op_p = optimizer.apply_gradients(capped_gradients_p)

        # evaluation
        self.accuracy_r = accuracy_fn(self.logits_r, self.y_r)
        self.accuracy_p = accuracy_fn(self.logits_p, self.y_p)

        self.execute()
Example #11
0
    def build(self):
        """Construct the LSTM-based pitch/rhythm graph, losses, and train ops.

        Depending on ``self.model_type``:
          * ``"combine"``  — pitch and rhythm inputs are concatenated and fed
            through one two-layer LSTM; a single train op minimizes the
            summed loss.
          * ``"separate"`` — each stream gets its own two-layer LSTM and its
            own train op.
        Both variants share the per-stream fully-connected output heads.
        Ends by calling ``self.execute()``.

        Fix vs. the original: tensor-vs-None checks in the gradient-clipping
        comprehensions use ``is not None`` instead of ``!= None``.
        """
        # p = pitch, r = rhythm
        self.n_p_inputs = len(self.X_tr_p[0][0])
        self.n_p_outputs = len(self.y_tr_p[0][0])
        self.n_r_inputs = len(self.X_tr_r[0][0])
        self.n_r_outputs = len(self.y_tr_r[0][0])

        self.X_p = tf.placeholder(tf.float32, [None, None, self.n_p_inputs],
                                  name="X_p")
        self.y_p = tf.placeholder(tf.float32, [None, None, self.n_p_outputs],
                                  name="y_p")

        self.X_r = tf.placeholder(tf.float32, [None, None, self.n_r_inputs],
                                  name="X_r")
        self.y_r = tf.placeholder(tf.float32, [None, None, self.n_r_outputs],
                                  name="Y_r")

        if self.model_type == "combine":
            # concat pitch and rhythm
            combined = tf.concat([self.X_p, self.X_r], axis=2)

            # RNN
            nu_rnn = 64

            cells = []
            cells.append(
                tf.contrib.rnn.LSTMCell(nu_rnn,
                                        use_peepholes=True,
                                        activation=tf.tanh))
            cells.append(
                tf.contrib.rnn.LSTMCell(nu_rnn,
                                        use_peepholes=True,
                                        activation=tf.tanh))
            multi = tf.contrib.rnn.MultiRNNCell(cells)
            outs, _ = tf.nn.dynamic_rnn(multi,
                                        combined,
                                        dtype=tf.float32,
                                        swap_memory=True,
                                        scope="rhythm")
            # both heads read the same shared recurrent output
            outs_r = outs
            outs_p = outs

        elif self.model_type == "separate":
            # RNN
            nu_rnn = 64

            # RNN pitch
            cells_p = []
            cells_p.append(
                tf.contrib.rnn.LSTMCell(nu_rnn,
                                        use_peepholes=True,
                                        activation=tf.tanh))
            cells_p.append(
                tf.contrib.rnn.LSTMCell(nu_rnn,
                                        use_peepholes=True,
                                        activation=tf.tanh))
            multi_p = tf.contrib.rnn.MultiRNNCell(cells_p)
            outs_p, _ = tf.nn.dynamic_rnn(multi_p,
                                          self.X_p,
                                          dtype=tf.float32,
                                          swap_memory=True,
                                          scope="pitch")

            # RNN rhythm
            cells_r = []
            cells_r.append(
                tf.contrib.rnn.LSTMCell(nu_rnn,
                                        use_peepholes=True,
                                        activation=tf.tanh))
            cells_r.append(
                tf.contrib.rnn.LSTMCell(nu_rnn,
                                        use_peepholes=True,
                                        activation=tf.tanh))
            multi_r = tf.contrib.rnn.MultiRNNCell(cells_r)
            outs_r, _ = tf.nn.dynamic_rnn(multi_r,
                                          self.X_r,
                                          dtype=tf.float32,
                                          swap_memory=True,
                                          scope="rhythm")

        # batch normalization parameters
        self.is_training = tf.placeholder(tf.bool,
                                          shape=(),
                                          name="is_training")

        bn_params = {
            "is_training": self.is_training,
            "decay": 0.999,
            "updates_collections": None,
            "scale": True
        }
        bn_params_out = {
            "is_training": self.is_training,
            "decay": 0.999,
            "updates_collections": None
        }

        # fully connected pitch head
        n_hidden_1_p = 48
        n_hidden_2_p = 32
        keep_prob_p = 0.6

        stacked_outs_p = tf.reshape(outs_p, [-1, nu_rnn],
                                    name='stacked_outs_p')

        stacked_logits_p1 = fully_connected(stacked_outs_p,
                                            num_outputs=n_hidden_1_p,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_p1')

        p_drop1 = tf.contrib.layers.dropout(stacked_logits_p1,
                                            keep_prob_p,
                                            is_training=self.is_training)

        stacked_logits_p2 = fully_connected(p_drop1,
                                            num_outputs=n_hidden_2_p,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_p2')

        p_drop2 = tf.contrib.layers.dropout(stacked_logits_p2,
                                            keep_prob_p,
                                            is_training=self.is_training)

        stacked_logits_p3 = fully_connected(p_drop2,
                                            num_outputs=self.n_p_outputs,
                                            activation_fn=None,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params_out,
                                            scope='dense_p_out')

        self.logits_p = tf.reshape(
            stacked_logits_p3,
            [-1, tf.shape(self.y_p)[1], self.n_p_outputs],
            name='logits_p')

        # fully connected rhythm head
        n_hidden_1_r = 48
        n_hidden_2_r = 32
        keep_prob_r = 0.6

        stacked_outs_r = tf.reshape(outs_r, [-1, nu_rnn],
                                    name='stacked_outs_r')

        stacked_logits_r1 = fully_connected(stacked_outs_r,
                                            n_hidden_1_r,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_r1')

        r_drop1 = tf.contrib.layers.dropout(stacked_logits_r1,
                                            keep_prob_r,
                                            is_training=self.is_training)

        stacked_logits_r2 = fully_connected(r_drop1,
                                            n_hidden_2_r,
                                            activation_fn=tf.nn.elu,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params,
                                            scope='dense_r2')

        r_drop2 = tf.contrib.layers.dropout(stacked_logits_r2,
                                            keep_prob_r,
                                            is_training=self.is_training)

        stacked_logits_r3 = fully_connected(r_drop2,
                                            self.n_r_outputs,
                                            activation_fn=None,
                                            normalizer_fn=batch_norm,
                                            normalizer_params=bn_params_out,
                                            scope='dense_r_out')

        self.logits_r = tf.reshape(
            stacked_logits_r3,
            [-1, tf.shape(self.y_r)[1], self.n_r_outputs],
            name='logits_r')

        # loss params
        learn_rate = 0.02
        clip = 5

        # loss
        self.loss_r = loss_fn(self.logits_r, self.y_r)
        self.loss_p = loss_fn(self.logits_p, self.y_p)
        self.total_loss = tf.add(self.loss_r, self.loss_p)

        # training op(s) with per-variable gradient norm clipping;
        # `is not None` — comparing a Tensor to None with != is unreliable.
        if self.model_type == "combine":

            optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
            gradients = optimizer.compute_gradients(self.total_loss)
            capped_gradients = [(tf.clip_by_norm(grad, clip),
                                 var) if grad is not None else (grad, var)
                                for grad, var in gradients]
            self.train_op = optimizer.apply_gradients(capped_gradients)

        elif self.model_type == "separate":

            optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)

            # rhythm
            gradients_r = optimizer.compute_gradients(self.loss_r)
            capped_gradients_r = [(tf.clip_by_norm(grad, clip),
                                   var) if grad is not None else (grad, var)
                                  for grad, var in gradients_r]
            self.train_op_r = optimizer.apply_gradients(capped_gradients_r)

            # pitch
            gradients_p = optimizer.compute_gradients(self.loss_p)
            capped_gradients_p = [(tf.clip_by_norm(grad, clip),
                                   var) if grad is not None else (grad, var)
                                  for grad, var in gradients_p]
            self.train_op_p = optimizer.apply_gradients(capped_gradients_p)

        # evaluation
        self.accuracy_r = accuracy_fn(self.logits_r, self.y_r)
        self.accuracy_p = accuracy_fn(self.logits_p, self.y_p)

        self.execute()
def main(argv=None):
    """Evaluate a saved sequence model on the subject-verb-agreement task.

    Parses the shared optimizer/transformer/auxiliary CLI arguments, loads a
    checkpoint from ``--continue_from``, sweeps stack depths 1..63 to find
    the deepest perfectly-solved sequence, then prints a detailed prediction
    table and plots hidden states for one fixed depth.
    """
    print("Running on {}".format(device))
    parser = argparse.ArgumentParser(
        description="Train a transformer for a copy task"
    )
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    add_auxiliary_arguments(parser)
    args = parser.parse_args(argv)
    print("args:\n-----\n", args)
    # Build the requested architecture; n_classes=5 is fixed for this task.
    if args.model_type == "transformer":
        model = SequencePredictorRecurrentTransformer(
                    d_model=args.d_model, n_classes=5,
                    sequence_length=args.sequence_length,
                    attention_type=args.attention_type,
                    n_layers=args.n_layers,
                    n_heads=args.n_heads,
                    d_query=args.d_model, # used to be d_query
                    dropout=args.dropout,
                    softmax_temp=None,
                    attention_dropout=args.attention_dropout,
                )
    else:
        model = SequencePredictorRNN(
                    d_model=args.d_model, n_classes=5,
                    n_layers=args.n_layers,
                    dropout=args.dropout,
                    rnn_type=args.model_type
                )
    print(f"Created model:\n{model}")
    model.to(device)
    # Checkpoint is a dict with at least 'epoch' and 'model_state' keys.
    print("Number of epochs model was trained on: ",torch.load(args.continue_from, map_location=device)['epoch'])
    model.load_state_dict(torch.load(args.continue_from, map_location=device)['model_state'])

    def format_preds(x, y, preds, mask):
        # Render input x, target y and predictions under a column-index
        # ruler; positions where mask == 0 are shown as '?'.
        n = len(x)
        n_dig = math.floor(math.log10(n)) + 1
        nums = []
        # One ruler row per decimal digit, most significant row on top.
        for p_dig in range(n_dig):
            nums.append( "# |" + "".join([str((i//10**p_dig)%10) for i in range(n)]) + "\n")
        nums = "".join(nums[::-1])
        xs = "x |" + "".join([str(int(v)) for v in x]) + "\n"
        ys = "y |" + "".join([elt if mask[i] == 1 else '?' for i, elt in enumerate([str(int(v)) for v in y])]) + "\n"
        yh = "yh|" + "".join([elt if mask[i] == 1 else '?' for i, elt in enumerate([str(int(v)) for v in preds])]) + "\n"
        return nums + xs + ys + yh

    acc_list = []
    max_acc = None
    # Sweep depths in increasing order; since depths increase, the last
    # depth with acc == 1 is the deepest perfectly-solved one.
    for stack_size in range(1, 64):
        x, y, m = SubjectVerbAgreement.get_seq(stack_size)
        # print(x.shape, y.shape, m.shape)
        model.eval()
        yhat = model(x.unsqueeze(1))
        hdn = model.hidden_state # batch x seq x hdn
        # assumes loss_fn returns (loss, accuracy) — matches usage below.
        loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1))
        acc_list.append((stack_size, acc))
        if acc == 1:
            max_acc = stack_size
    print("Highest perfect score at depth:", max_acc)
    plot_hidden_state_2d(np.array(acc_list), pca=False)

    stack_size = 7 # Change this value to test longer / shorter sequences
    x, y, m = SubjectVerbAgreement.get_seq(stack_size)
    model.eval()
    yhat = model(x.unsqueeze(1))
    hdn = model.hidden_state # batch x seq x hdn
    loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1))
    print("Model loss: ", loss)
    print("Model accuracy: ", acc)
    print(format_preds(x, y, torch.argmax(yhat, dim=2)[0], m))
    plot_hidden_state_2d(hdn[0].detach().cpu().numpy(), pca=True)

    """
Beispiel #13
0
# Iterations
# Global iteration counter; used as the x-axis for TensorBoard scalars.
step = 0
best_mefssim_val = 0.
# Let cuDNN autotune conv algorithms (inputs have a fixed size here).
torch.backends.cudnn.benchmark = True
for epoch in range(num_epoch):
    ''' train '''
    for i, img in enumerate(loader['train']):
        img = img.cuda()
        # Augmentation: rotate the batch by a random multiple of 90 degrees
        # around the spatial (H, W) axes.
        img = torch.rot90(img, int(torch.randint(4, [1])), [-1, -2])

        #1. update
        net.train()
        net.zero_grad()
        optimizer.zero_grad()
        imgf = net(img)
        # Total loss = MEF-SSIM term + ramped L1 halo penalty.
        _ssim = loss_fn(imgf, img)
        _l1penalty = halo_fn(imgf)
        loss = _ssim + loss_weight * _l1penalty
        loss.backward()
        optimizer.step()

        # Ramp the penalty weight by 0.25 per step, capped at 10.
        loss_weight = min(loss_weight + 0.25, 10)  # update loss weight

        #2.  print information
        print("[%d,%d] MEFSSIM: %.4f, L1: %.4f, Loss: %.4f" %
              (epoch + 1, i + 1, _ssim.item(), _l1penalty.item(), loss.item()))

        #3. log the scalar values
        writer.add_scalar('loss', loss.item(), step)
        step += 1
    ''' validation '''
Beispiel #14
0
    def prepare_model(self):
        """Build the TF1 graph: item embedding -> GRU -> additive attention
        over time steps -> prediction/loss head selected by ``self.loss_type``.

        Side effects: defines the input placeholders, ``self.pred`` (scores
        over all items), ``self.cost`` (training loss) and ``self.optimizer``
        (Adam minimizing ``self.cost``).
        """
        # --- placeholders ---
        self.rnn_x = tf.placeholder(tf.int32, [None, None], name='input')
        self.rnn_y = tf.placeholder(tf.int64, [None, self.num_items],
                                    name='output')
        self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
        self.keep_prob_input = tf.placeholder(tf.float32,
                                              name='keep_prob_input')
        self.keep_prob_ho = tf.placeholder(tf.float32, name='keep_prob_ho')
        self.batch_var_length = tf.placeholder(tf.int32,
                                               name="variable_length")

        # --- trainable parameters ---
        Wemb = tf.get_variable('Wemb', [self.num_items, self.embedding_size],
                               initializer=self.embed_init)
        # Encoder/decoder projections and bias vector for the additive
        # attention score (compute_alpha below).
        W_encoder = tf.get_variable(
            'W_encoder', [self.rnn_hidden_size, self.rnn_hidden_size],
            initializer=self.weight_init)
        W_decoder = tf.get_variable(
            'W_decoder', [self.rnn_hidden_size, self.rnn_hidden_size],
            initializer=self.weight_init)
        Bi_vector = tf.get_variable('Bi_vector', [1, self.rnn_hidden_size],
                                    initializer=self.weight_init)
        # Loss-head-specific parameters; only the branch matching
        # self.loss_type is created.
        if self.loss_type == 'EMB':
            bili = tf.get_variable(
                'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
                initializer=self.weight_init)
        elif self.loss_type == "Trilinear":
            ws = tf.get_variable('ws',
                                 [self.embedding_size, self.embedding_size],
                                 initializer=self.weight_init)
            bs = tf.get_variable('bs', [self.embedding_size],
                                 initializer=self.bias_init)
            wt = tf.get_variable('wt',
                                 [self.embedding_size, self.embedding_size],
                                 initializer=self.weight_init)
            bt = tf.get_variable('bt', [self.embedding_size],
                                 initializer=self.bias_init)
        elif self.loss_type == "TOP1":
            W_top1 = tf.get_variable(
                'W_top1', [2 * self.rnn_hidden_size, self.num_items],
                initializer=self.weight_init)
            b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                     initializer=self.bias_init)
        elif self.loss_type == "TOP1_variant":
            bili = tf.get_variable(
                'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
                initializer=self.weight_init)
            W_top1 = tf.get_variable(
                'W_top1', [2 * self.rnn_hidden_size, self.num_items],
                initializer=self.weight_init)
            b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                     initializer=self.bias_init)

        # --- embedding + recurrent encoder ---
        emb = tf.nn.embedding_lookup(Wemb, self.rnn_x)
        emb = tf.nn.dropout(emb, self.keep_prob_input)

        custom_cell = tf.contrib.rnn.GRUCell(num_units=self.rnn_hidden_size)
        outputs, states = tf.nn.dynamic_rnn(
            custom_cell,
            emb,
            sequence_length=self.batch_var_length,
            dtype=tf.float32)

        self.outputs = outputs
        self.last_hidden = states  # 512 x 100
        # Time-major for tf.map_fn over the sequence dimension.
        outputs = tf.transpose(outputs, perm=[1, 0, 2])  # 19x512x100

        # Attention score of every time step against the final hidden state.
        squares = tf.map_fn(lambda x: compute_alpha(
            x, self.last_hidden, W_encoder, W_decoder, Bi_vector),
                            outputs)  # 19x512
        # Softmax with padded positions pushed to ~0 via a large negative
        # additive term ((mask - 1) is -1 on padding).
        weight = tf.nn.softmax(tf.transpose(squares) + 100000000. *
                               (self.mask - 1),
                               axis=1)  # batch_size * max_len
        # Attention-weighted sum of the GRU outputs.
        attention_proj = tf.reduce_sum(outputs *
                                       tf.transpose(weight)[:, :, None],
                                       axis=0)

        # --- prediction + loss head ---
        # num_items x 2*100
        if self.loss_type == 'EMB':
            # Bilinear scoring of [attention, last-state] against embeddings.
            proj = tf.concat([attention_proj, states], 1)
            proj = tf.nn.dropout(proj, self.keep_prob_ho)
            ytem = tf.matmul(Wemb, bili)
            pred = tf.matmul(proj, tf.transpose(ytem))
            self.pred = tf.nn.softmax(pred)
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred,
                                                           labels=self.rnn_y))
        elif self.loss_type == "Trilinear":
            hs = tf.nn.tanh(tf.matmul(attention_proj, ws) +
                            bs)  # batch * hidden
            ht = tf.nn.tanh(tf.matmul(states, wt) + bt)  # batch * hidden
            pred = tf.nn.sigmoid(
                tf.matmul(tf.multiply(ht, hs),
                          tf.transpose(Wemb)))  # batch * n_item
            self.pred = tf.nn.softmax(pred)
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred,
                                                           labels=self.rnn_y))
        elif self.loss_type == "TOP1":
            proj = tf.concat([attention_proj, states], 1)
            proj = tf.nn.dropout(proj, self.keep_prob_ho)
            pred = tf.matmul(proj, W_top1) + b_top1
            self.pred = tf.nn.tanh(pred)
            # loss_fn dispatches on loss_type for the TOP1-family losses.
            self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
        elif self.loss_type == "TOP1_variant":
            proj = tf.concat([attention_proj, states], 1)
            proj = tf.nn.dropout(proj, self.keep_prob_ho)
            ytem = tf.matmul(Wemb, bili)
            pred = tf.matmul(proj, tf.transpose(ytem))
            self.pred = tf.nn.tanh(pred)
            self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)

        self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
Beispiel #15
0
    def prepare_model(self):
        """Build a STAMP-style TF1 graph: attention over the session prefix
        (``rnn_x1``) conditioned on the last item (``rnn_x2``) and the mean
        session embedding, then a loss head chosen by ``self.loss_type``.

        Side effects: defines the input placeholders, ``self.pred``,
        ``self.cost`` and ``self.optimizer``; also exposes the intermediate
        attention tensors (``self.alpha`` etc.) for inspection.
        """
        # --- placeholders ---
        self.rnn_x1 = tf.placeholder(tf.int32, [None, self.maxlen],
                                     name='input1')
        self.rnn_x2 = tf.placeholder(tf.int32, [None, 1], name='input2')
        self.rnn_y = tf.placeholder(tf.int64, [None, self.num_items],
                                    name='output')
        self.mask_x1 = tf.placeholder(tf.float32, [None, self.maxlen],
                                      name='mask_x1')  # batch_size * maxlen
        self.mask_x2 = tf.placeholder(tf.float32, [None, 1], name='mask_x2')
        self.keep_prob_input = tf.placeholder(tf.float32,
                                              name='keep_prob_input')
        self.keep_prob_ho = tf.placeholder(tf.float32, name='keep_prob_ho')
        self.batch_var_length = tf.placeholder(tf.float32,
                                               name="variable_length")

        # --- trainable parameters ---
        Wemb = tf.get_variable('Wemb', [self.num_items, self.embedding_size],
                               initializer=self.embed_init)
        # w0..w3, ba parameterize the attention score (compute_alpha_STAMP).
        w0 = tf.get_variable('w0', [self.embedding_size, 1],
                             initializer=self.weight_init)
        w1 = tf.get_variable('w1', [self.embedding_size, self.embedding_size],
                             initializer=self.weight_init)
        w2 = tf.get_variable('w2', [self.embedding_size, self.embedding_size],
                             initializer=self.weight_init)
        w3 = tf.get_variable('w3', [self.embedding_size, self.embedding_size],
                             initializer=self.weight_init)
        ba = tf.get_variable('ba', [self.embedding_size],
                             initializer=self.bias_init)

        # Loss-head-specific parameters; only the matching branch is built.
        if self.loss_type == 'EMB':
            bili = tf.get_variable(
                'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
                initializer=self.weight_init)
        elif self.loss_type == "Trilinear":
            ws = tf.get_variable('ws',
                                 [self.embedding_size, self.embedding_size],
                                 initializer=self.weight_init)
            bs = tf.get_variable('bs', [self.embedding_size],
                                 initializer=self.bias_init)
            wt = tf.get_variable('wt',
                                 [self.embedding_size, self.embedding_size],
                                 initializer=self.weight_init)
            bt = tf.get_variable('bt', [self.embedding_size],
                                 initializer=self.bias_init)
        elif self.loss_type == "TOP1":
            W_top1 = tf.get_variable(
                'W_top1', [2 * self.rnn_hidden_size, self.num_items],
                initializer=self.weight_init)
            b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                     initializer=self.bias_init)
        elif self.loss_type == "TOP1_variant":
            bili = tf.get_variable(
                'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
                initializer=self.weight_init)
            W_top1 = tf.get_variable(
                'W_top1', [2 * self.rnn_hidden_size, self.num_items],
                initializer=self.weight_init)
            b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                     initializer=self.bias_init)

        # --- embeddings and mean session representation ---
        emb_x1 = tf.nn.embedding_lookup(
            Wemb, self.rnn_x1)  # xi (batch_size * maxlen * num_hidden)
        emb_x2 = tf.squeeze(tf.nn.embedding_lookup(Wemb, self.rnn_x2),
                            axis=1)  # xt (batch_size * num_hidden)
        tiled_mask = tf.tile(tf.expand_dims(self.mask_x1, 2),
                             [1, 1, self.rnn_hidden_size
                              ])  # xt (batch_size * maxlen * num_hidden)
        # ms = masked mean of the prefix embeddings (sum / true length).
        ms = tf.reduce_sum(tf.multiply(emb_x1, tiled_mask),
                           axis=1)  # batch_size * num_hidden
        tiled_var_length = tf.tile(
            tf.reshape(self.batch_var_length, [-1, 1]),
            [1, self.rnn_hidden_size])  # (batch_size * num_hidden)
        ms = tf.reshape(tf.div(ms, tiled_var_length),
                        [-1, self.rnn_hidden_size])  # batch_size * num_hidden

        # --- attention over the prefix ---
        outputs1 = tf.transpose(emb_x1,
                                perm=[1, 0,
                                      2])  # maxlen * batch_size * num_hidden
        unnormalized_alpha = tf.map_fn(
            lambda x: compute_alpha_STAMP(x, emb_x2, ms, w0, w1, w2, w3, ba),
            outputs1)  # maxlen * batch_size
        unnormalized_alpha = tf.multiply(tf.transpose(unnormalized_alpha),
                                         self.mask_x1)  # batch_size * maxlen
        self.unnormalized_alpha = unnormalized_alpha
        # NOTE: the softmax normalization is intentionally disabled here;
        # raw masked scores are used as weights (see commented line below).
        alpha = unnormalized_alpha  # batch_size * maxlen
        #alpha = tf.nn.softmax(unnormalized_alpha + 100000000. * (self.mask_x1 - 1), dim=1)  # batch_size * max_len
        self.alpha = alpha
        tiled_alpha = tf.tile(
            tf.expand_dims(alpha, axis=2),
            [1, 1, self.rnn_hidden_size])  # batch_size * maxlen * hidden_size
        self.tiled_alpha = tiled_alpha
        # ma = attention-weighted sum of prefix embeddings.
        ma = tf.reduce_sum(tf.multiply(emb_x1, tiled_alpha),
                           axis=1)  # batch * hidden
        # assumes ws/bs/wt/bt exist, i.e. loss_type is "Trilinear" — other
        # branches below would hit a NameError here; TODO confirm intended use.
        hs = tf.nn.tanh(tf.matmul(ma, ws) + bs)  # batch * hidden
        ht = tf.nn.tanh(tf.matmul(emb_x2, wt) + bt)  # batch * hidden

        # --- prediction + loss head ---
        if self.loss_type == 'EMB':
            proj = tf.concat([hs, ht], 1)
            proj = tf.nn.dropout(proj, self.keep_prob_ho)
            ytem = tf.matmul(Wemb, bili)
            pred = tf.matmul(proj, tf.transpose(ytem))
            self.pred = tf.nn.softmax(pred)
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred,
                                                           labels=self.rnn_y))
        elif self.loss_type == "Trilinear":
            pred = tf.nn.sigmoid(
                tf.matmul(tf.multiply(ht, hs),
                          tf.transpose(Wemb)))  # batch * n_item
            self.pred = tf.nn.softmax(pred)
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred,
                                                           labels=self.rnn_y))
        elif self.loss_type == "TOP1":
            proj = tf.concat([hs, ht], 1)
            proj = tf.nn.dropout(proj, self.keep_prob_ho)
            pred = tf.matmul(proj, W_top1) + b_top1
            self.pred = tf.nn.tanh(pred)
            self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
        elif self.loss_type == "TOP1_variant":
            pred = tf.nn.sigmoid(
                tf.matmul(tf.multiply(ht, hs),
                          tf.transpose(Wemb)))  # batch * n_item
            self.pred = tf.nn.tanh(pred)
            self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)

        self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
Beispiel #16
0
def validate(ldr_dir, hdr_dir, gen_dir, logs_dir, img_height, img_width):
    """Run HDR-CNN validation over a directory of LDR/HDR image pairs.

    Restores the trained model (from the module-level ``model_dir``),
    predicts an HDR image for every validation sample, writes the alpha
    blend mask, the raw HDR prediction and a tone-mapped preview to
    ``gen_dir``, and logs per-image and average losses to ``logs_dir``.

    Args:
        ldr_dir: directory containing input LDR ``.png`` files.
        hdr_dir: directory containing ground-truth ``.hdr`` files.
        gen_dir: output directory for generated images.
        logs_dir: directory for the text log.
        img_height, img_width: spatial size of the single-image batch.
    """
    X = tf.placeholder(tf.float32, [1, img_height, img_width, 3])
    Y = tf.placeholder(tf.float32, [1, img_height, img_width, 3])

    valid_ldr_path, _ = get_filepath(ldr_dir, '.png')
    valid_hdr_path, valid_hdr_name = get_filepath(hdr_dir, '.hdr')
    num_valid = len(valid_hdr_path)

    # Data loader
    dataset = tf.data.Dataset.from_tensor_slices(
        (valid_ldr_path, valid_hdr_path))
    dataset = dataset.map(valid_parse, num_parallel_calls=4)
    dataset = dataset.batch(1)
    # Renamed from 'iter' to avoid shadowing the Python builtin.
    iterator = dataset.make_one_shot_iterator()
    ldr_img, hdr_img, Hth = iterator.get_next()

    alpha = alpha_msk(X)

    # Prediction
    with tf.name_scope('HDR_CNN'):
        hdr_nn = hdrcnn(X, is_training=False, reuse=False)
    hdr_final = get_final_hdr(X, hdr_nn)

    # Loss functions
    with tf.name_scope('Loss'):
        irloss, dirloss = loss_fn(X, hdr_nn, Y)

    saver = tf.train.Saver(tf.global_variables())

    with tf.Session() as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())

        # Restore weights of model (module-level checkpoint path).
        saver.restore(sess, model_dir)

        # Validation
        log = "\n========== Validation Begin ==========\n"
        write_logs(logs_dir, log, True)
        valid_start = time.time()
        avg_irloss = 0
        avg_dirloss = 0
        for f in valid_hdr_name:
            valid_img_start = time.time()
            # Pull the next image pair from the one-shot iterator, then feed
            # it through the placeholders for prediction and loss evaluation.
            ldr_image, hdr_image, Hth_val = sess.run([ldr_img, hdr_img, Hth])
            alpha_val, hdr_pred, irloss_val, dirloss_val = sess.run(
                [alpha, hdr_final, irloss, dirloss],
                feed_dict={
                    X: ldr_image,
                    Y: hdr_image
                })
            avg_irloss += irloss_val
            avg_dirloss += dirloss_val

            f1, _ = f.split("_")
            img_write(gen_dir, 'alpha_' + f1 + '_HDR.png', alpha_val, 'PNG-FI')

            # Gamma correction
            hdr_pred_save = np.multiply(Hth_val, np.maximum(hdr_pred, 0.0))
            img_write(gen_dir, 'pred_' + f, hdr_pred_save, 'HDR-FI')

            # Tone mapping ('gamma' is a module-level constant).
            hdr_pred_gamma = np.power(np.maximum(hdr_pred, 0.0), gamma)
            ldr_tone = reinhard02(hdr_pred_gamma, a=0.18)
            img_write(gen_dir, 'tm_' + f1 + '_HDR.png', ldr_tone, 'PNG-FI')

            log = "Image {}, Time {:2.5f}, Shape = {}, I/R Loss = {:2.5f}, Direct Loss = {:2.5f}".format(
                f,
                time.time() - valid_img_start, hdr_pred.shape, irloss_val,
                dirloss_val)
            write_logs(logs_dir, log, False)
        log = "\nAverage I/R Loss = {:2.5f}, Average Direct Loss = {:2.5f}".format(
            avg_irloss / num_valid, avg_dirloss / num_valid)
        write_logs(logs_dir, log, False)
        log = "\nValidation Time: {:2.5f}".format(time.time() - valid_start)
        write_logs(logs_dir, log, False)
        log = "\n========== Validation End ==========\n"
        write_logs(logs_dir, log, False)
        # The explicit sess.close() was removed: the 'with' block already
        # closes the session on exit.
Beispiel #17
0

if __name__ == "__main__":
    # Smoke test: build the NER vocab/dataset, run one untrained forward
    # pass on a single batch, and print shapes/loss/metrics.

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BATCH_SIZE = 256

    vocab = build_vocab('data')
    word_vocab, label_vocab = vocab
    train_dataset = NERDataset('data', vocab, type='/train')
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=2,
                              collate_fn=custom_collate,
                              shuffle=True)
    # Grab a single batch for the smoke test.
    sample_data, sample_target, sample_len = next(iter(train_loader))
    sample_data = sample_data.long()

    model = RNN(EMBEDDING_DIM, HIDDEN_DIM, len(word_vocab), len(label_vocab))
    hidden = model.init_hidden(BATCH_SIZE)

    # Inference only — no gradients needed for the shape check.
    with torch.no_grad():
        tag_scores = model(sample_data, hidden)
        print(tag_scores.shape)

    loss = loss_fn(tag_scores, sample_target)
    print(loss.item())
    acc, f1 = accuracy(tag_scores, sample_target)
    print(acc, f1)
Beispiel #18
0
def main(argv=None):
    """Depth-generalization sweep for subject-verb-agreement models.

    For every (model_type, trained depth, test depth, seed) combination,
    loads the corresponding saved checkpoint, evaluates it on one sequence
    of the given test depth, and pickles the accuracy records to
    ``data_points_sva.txt``.
    """
    print("Running on {}".format(device))
    parser = argparse.ArgumentParser(
        description="Train a transformer for a copy task")
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    add_auxiliary_arguments(parser)
    args = parser.parse_args(argv)
    print("args:\n-----\n", args)

    data_points = []
    for model_type in ['rnn', 'lstm', 'transformer']:
        for max_trained_depth in range(1, 12):
            for test_depth in range(1, 21):
                # 10 independently trained checkpoints per configuration.
                for ii in range(10):
                    if model_type == "transformer":
                        model = SequencePredictorRecurrentTransformer(
                            d_model=16,
                            n_classes=5,
                            sequence_length=args.sequence_length,
                            attention_type=args.attention_type,
                            n_layers=args.n_layers,
                            n_heads=args.n_heads,
                            d_query=8,  # used to be d_query
                            dropout=args.dropout,
                            softmax_temp=None,
                            attention_dropout=args.attention_dropout,
                        )
                    else:
                        # NOTE(review): both branches of this conditional
                        # yield 8, so d_model is effectively constant.
                        model = SequencePredictorRNN(
                            d_model=8 if model_type == 'lstm' else 8,
                            n_classes=5,
                            n_layers=args.n_layers,
                            dropout=args.dropout,
                            rnn_type=model_type)
                    print(f"Created model:\n{model}")
                    model.to(device)
                    # Checkpoint path encodes model type, trained depth, seed.
                    model_name = "models_from_colab/agreement_models/model_" + model_type + "_depth_" + str(
                        max_trained_depth) + "_num_" + str(ii) + ".zip"
                    model.load_state_dict(
                        torch.load(model_name,
                                   map_location=device)['model_state'])

                    stack_size = test_depth
                    x, y, m = SubjectVerbAgreement.get_seq(stack_size)
                    model.eval()
                    yhat = model(x.unsqueeze(1))
                    hdn = model.hidden_state  # batch x seq x hdn
                    # assumes loss_fn returns (loss, accuracy) — matches use.
                    loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1))
                    data_points.append({
                        'model_type': model_type,
                        'max_trained_depth': max_trained_depth,
                        'test_depth': test_depth,
                        'accuracy': acc
                    })
    print("data points:")
    print(data_points)

    with open("data_points_sva.txt", "wb") as fp:
        pickle.dump(data_points, fp)
    """
Beispiel #19
0
def main():
    """Train a binary text classifier (MLP/BiLSTM/BiLSTMAttn/CNN).

    Parses CLI arguments, loads word2vec embeddings and the train/valid
    data loaders, trains for ``--epochs`` epochs with Adam, and saves the
    model weights with the best validation accuracy to ``--save-path``.
    """
    # Training settings
    parser = ArgumentParser()
    parser.add_argument('-d',
                        '--device',
                        default=None,
                        type=str,
                        help='indices of GPUs to enable (default: None)')
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=1024,
                        help='number of batch size for training')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--save-path',
                        type=str,
                        default='result/model.pth',
                        help='path to trained model to save')
    parser.add_argument('--model',
                        choices=['MLP', 'BiLSTM', 'BiLSTMAttn', 'CNN'],
                        default='MLP',
                        help='model name')
    parser.add_argument('--env',
                        choices=['local', 'server'],
                        default='server',
                        help='development environment')
    parser.add_argument('--word-dim',
                        type=int,
                        default=128,
                        help='the dimension of embedding')
    parser.add_argument(
        '--word-lim',
        type=int,
        default=None,
        help='If specified, input sequence length is limited from tail.')
    parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)

    # Select device: GPU only when indices were given AND CUDA is available.
    if args.device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = torch.device('cuda:0' if torch.cuda.is_available()
                          and args.device is not None else 'cpu')

    # Pretrained word2vec vectors seed the embedding layer.
    model_w2v = KeyedVectors.load_word2vec_format(W2V_MODEL_FILE[args.env],
                                                  binary=True)
    word_to_id = word2id(model_w2v)
    initial_embedding = load_word_embedding(model_w2v)

    # setup data_loader instances
    train_data_loader = PosNegDataLoader(TRAIN_FILE[args.env],
                                         word_to_id,
                                         args.word_lim,
                                         args.batch_size,
                                         shuffle=True,
                                         num_workers=2)
    valid_data_loader = PosNegDataLoader(VALID_FILE[args.env],
                                         word_to_id,
                                         args.word_lim,
                                         args.batch_size,
                                         shuffle=False,
                                         num_workers=2)

    # build model architecture
    if args.model == 'MLP':
        model = MLP(word_dim=args.word_dim,
                    hidden_size=100,
                    vocab_size=len(word_to_id))
    elif args.model == 'BiLSTM':
        model = BiLSTM(word_dim=args.word_dim,
                       hidden_size=100,
                       vocab_size=len(word_to_id))
    elif args.model == 'BiLSTMAttn':
        model = BiLSTMAttn(word_dim=args.word_dim,
                           hidden_size=100,
                           vocab_size=len(word_to_id))
    elif args.model == 'CNN':
        model = CNN(word_dim=args.word_dim,
                    word_lim=args.word_lim,
                    vocab_size=len(word_to_id))
    else:
        raise ValueError(
            f'model name should be "MLP", "BiLSTM", "BiLSTMAttn", or "CNN", but given {args.model}'
        )

    model.set_initial_embedding(initial_embedding)
    model.to(device)

    # build optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    best_valid_acc = -1

    for epoch in range(1, args.epochs + 1):
        print(f'*** epoch {epoch} ***')
        # train
        model.train()
        total_loss = 0
        total_correct = 0
        for batch_idx, (source, mask, target) in enumerate(train_data_loader):
            source = source.to(device)  # (b, len)
            mask = mask.to(device)  # (b, len)
            target = target.to(device)  # (b)

            # Forward pass
            output = model(source, mask)  # (b, 2)
            loss = loss_fn(output, target)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_correct += metric_fn(output, target)
        print(f'train_loss={total_loss / train_data_loader.n_samples:.3f}',
              end=' ')
        print(
            f'train_accuracy={total_correct / train_data_loader.n_samples:.3f}'
        )

        # validation
        model.eval()
        with torch.no_grad():
            total_loss = 0
            total_correct = 0
            for batch_idx, (source, mask,
                            target) in enumerate(valid_data_loader):
                source = source.to(device)  # (b, len)
                mask = mask.to(device)  # (b, len)
                target = target.to(device)  # (b)

                output = model(source, mask)  # (b, 2)

                # FIX: accumulate a Python float (.item()), consistent with
                # the train loop above; previously this summed tensors.
                total_loss += loss_fn(output, target).item()
                total_correct += metric_fn(output, target)
        valid_acc = total_correct / valid_data_loader.n_samples
        print(f'valid_loss={total_loss / valid_data_loader.n_samples:.3f}',
              end=' ')
        print(f'valid_accuracy={valid_acc:.3f}\n')
        # Checkpoint only on validation-accuracy improvement.
        if valid_acc > best_valid_acc:
            torch.save(model.state_dict(), args.save_path)
            best_valid_acc = valid_acc
Beispiel #20
0
# VAE training loop: optionally warm-start from a pretrained checkpoint,
# then train for num_epochs and save weights after every epoch.
model = VAE(device).to(device)
if args.pretrained != 'None':
    model.load_state_dict(torch.load(args.pretrained))
optimizer = Adam(model.parameters(), lr)
# NOTE(review): 'scheduler' is created but scheduler.step() is never called
# in this loop, so the LR plateau schedule has no effect here.
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=10, eps=1e-4)
# NOTE(review): 'clip_norm' is read but gradient clipping is never applied.
clip_norm = args.clip_norm
# NOTE(review): 'criterion' aliases loss_fn but loss_fn is called directly.
criterion = loss_fn
num_epochs = args.num_epochs
logdir = './logdir'

for epoch in range(args.num_epochs):
    for idx, images in enumerate(train_loader):

        # Forward: reconstruction + latent distribution parameters.
        recon_images, mu, logvar = model(images.to(device))
        #print(recon_images, mu, logvar)
        # assumes loss_fn returns (total, BCE term, KLD term) — matches use.
        loss, bce, kld = loss_fn(recon_images, images.to(device), mu, logvar)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report per-sample losses for the batch.
        to_print = "Epoch[{}/{}] Loss: {:.3f} {:.3f} {:.3f}".format(
            epoch + 1, args.num_epochs, loss.data / args.train_batch_size,
            bce.data / args.train_batch_size, kld.data / args.train_batch_size)
        print(to_print)
    # Overwrites the same weights file at the end of every epoch.
    torch.save(model.state_dict(), args.output_weights)
'''# model runner
runner = SupervisedRunner()

# model training
runner.train(
    model=model,