Example no. 1
  def __init__(self, hparams):
    super(PositionwiseFF, self).__init__()
    self.hparams = hparams

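    # Position-wise feed-forward block: two bias-free projections
    # (d_model -> d_inner -> d_model), with ReLU, dropout and layer norm defined below.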
    self.w_1 = nn.Linear(hparams.d_model, hparams.d_inner, bias=False)
    self.w_2 = nn.Linear(hparams.d_inner, hparams.d_model, bias=False)
    self.dropout = nn.Dropout(hparams.dropout)
    self.relu = nn.ReLU()
    self.layer_norm = LayerNormalization(hparams.d_model, hparams)

    init_param(self.w_1.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.w_2.weight, init_type="uniform", init_range=hparams.init_range)
Example no. 2
    def __init__(self, input_dim, output_dim, use_bias=True, name=''):
        super(SigmoidLinear, self).__init__()
        self.name = name
        self.use_bias = use_bias
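        # NOTE: self.params is assumed to be a dict created by the parent class's __init__.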
        self.W = init_param((input_dim, output_dim))
        self.params[self.name + '/W'] = self.W
        if self.use_bias:
            self.b = init_param((output_dim, ))
            self.params[self.name + '/b'] = self.b

        self.input = None
        self.output = None
Example no. 3
  def __init__(self, hparams):
    super(MultiHeadAttn, self).__init__()

    self.hparams = hparams

    self.attention = ScaledDotProdAttn(hparams)
    self.layer_norm = LayerNormalization(hparams.d_model, hparams)

    # projection of concatenated attn
    n_heads = self.hparams.n_heads
    d_model = self.hparams.d_model
    d_q = self.hparams.d_k
    d_k = self.hparams.d_k
    d_v = self.hparams.d_v
    # d_q == d_k == d_v
    self.q = nn.Linear(d_model, n_heads * d_q, bias=False)
    self.k = nn.Linear(d_model, n_heads * d_k, bias=False)
    self.v = nn.Linear(d_model, n_heads * d_v, bias=False)
    init_param(self.q.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.k.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.v.weight, init_type="uniform", init_range=hparams.init_range)

    # Q, K, V = [], [], []
    # for head_id in range(n_heads):
    #   q = nn.Linear(d_model, d_q, bias=False)
    #   k = nn.Linear(d_model, d_k, bias=False)
    #   v = nn.Linear(d_model, d_v, bias=False)
    #   init_param(q.weight, init_type="uniform", init_range=hparams.init_range)
    #   init_param(k.weight, init_type="uniform", init_range=hparams.init_range)
    #   init_param(v.weight, init_type="uniform", init_range=hparams.init_range)
    #   Q.append(q)
    #   K.append(k)
    #   V.append(v)
    # self.Q = nn.ModuleList(Q)
    # self.K = nn.ModuleList(K)
    # self.V = nn.ModuleList(V)
    if self.hparams.cuda:
      #self.Q = self.Q.cuda()
      #self.K = self.K.cuda()
      #self.V = self.V.cuda()
      self.q = self.q.cuda()
      self.k = self.k.cuda()
      self.v = self.v.cuda()

    self.w_proj = nn.Linear(n_heads * d_v, d_model, bias=False)
    init_param(self.w_proj.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.cuda:
      self.w_proj = self.w_proj.cuda()
Example no. 4
def train(x_train, y_train):
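    # NOTE: `m` is assumed to be defined in an enclosing scope (e.g. a data/label matrix);
    # it is not a parameter of this function.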
    num_of_vectors = m.shape[1]
    vectors = []
    for i in range(num_of_vectors):
        vector = {}
        vector['i'] = i
        vector['wi'] = init_param(x_train[0])
        vector['win'], vector['lose'] = get_win_lose_classes_by_column(m, i)
        vectors.append(vector)

    for i in range(len(y_train)):
        x = x_train[i]
        y = y_train[i]
        for v in vectors:
            v['wi'], lossi = train_custom(v, x, y, i)

    weights = [v['wi'] for v in vectors]
    return weights
Example no. 5
  def __init__(self, hparams):
    super(RelativeMultiHeadAttn, self).__init__()

    self.hparams = hparams

    self.attention = ScaledDotProdAttn(hparams)
    self.layer_norm = LayerNormalization(hparams.d_model, hparams)
    self.temp = np.power(hparams.d_model, 0.5)
    self.softmax = nn.Softmax(dim=-1)
    self.pos_emb = PositionalEmbedding(hparams)
    self.dropout = nn.Dropout(hparams.dropout)
    # projection of concatenated attn
    n_heads = self.hparams.n_heads
    d_model = self.hparams.d_model
    d_q = self.hparams.d_k
    d_k = self.hparams.d_k
    d_v = self.hparams.d_v

    #self.q = nn.Linear(d_model, n_heads * d_q, bias=False)
    #self.k = nn.Linear(d_model, n_heads * d_k, bias=False)
    #self.v = nn.Linear(d_model, n_heads * d_v, bias=False)
    #self.r = nn.Linear(d_model, n_heads * d_v, bias=False)
    #init_param(self.q.weight, init_type="uniform", init_range=hparams.init_range)
    #init_param(self.k.weight, init_type="uniform", init_range=hparams.init_range)
    #init_param(self.v.weight, init_type="uniform", init_range=hparams.init_range)
    #init_param(self.r.weight, init_type="uniform", init_range=hparams.init_range)

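    # Per-head projections; r presumably projects the positional embedding (d_word_vec) to d_k.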
    Q, K, V, R = [], [], [], []
    for head_id in range(n_heads):
      q = nn.Linear(d_model, d_q, bias=False)
      k = nn.Linear(d_model, d_k, bias=False)
      v = nn.Linear(d_model, d_v, bias=False)
      r = nn.Linear(self.hparams.d_word_vec, d_k, bias=False)
      init_param(q.weight, init_type="uniform", init_range=hparams.init_range)
      init_param(k.weight, init_type="uniform", init_range=hparams.init_range)
      init_param(v.weight, init_type="uniform", init_range=hparams.init_range)
      init_param(r.weight, init_type="uniform", init_range=hparams.init_range)
      Q.append(q)
      K.append(k)
      V.append(v)
      R.append(r)
    self.Q = nn.ModuleList(Q)
    self.K = nn.ModuleList(K)
    self.V = nn.ModuleList(V)
    self.R = nn.ModuleList(R)
    if self.hparams.cuda:
      self.Q = self.Q.cuda()
      self.K = self.K.cuda()
      self.V = self.V.cuda()
      self.R = self.R.cuda()
      #self.q = self.q.cuda()
      #self.k = self.k.cuda()
      #self.r = self.r.cuda()
    if self.hparams.relative_pos_c:
      #self.u = nn.Linear(1, d_q, bias=False)
      self.u = nn.Linear(d_q, 1, bias=False)
      init_param(self.u.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.relative_pos_d:
      #self.v = nn.Linear(1, d_q, bias=False)
      self.v = nn.Linear(d_q, 1, bias=False)
      init_param(self.v.weight, init_type="uniform", init_range=hparams.init_range)
    self.w_proj = nn.Linear(n_heads * d_v, d_model, bias=False)
    init_param(self.w_proj.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.cuda:
      self.w_proj = self.w_proj.cuda()
      if self.hparams.relative_pos_c:
        self.u = self.u.cuda()
      if self.hparams.relative_pos_d:
        self.v = self.v.cuda()
Example no. 6
            with open("result.txt", "a") as f:
                f.write("episode{0}\tReward: {1:.2f}".format(
                    i_episode, val_reward))
                f.write("\n")

        RM.reset()
        # break

        with open("total_reward.txt", "a") as f:
            f.write("{0}\t{1}".format(i_episode, total_reward))
            f.write("\n")


if __name__ == "__main__":

    args = parser.parse_args()
    model = DQN().to(device)
    init_param(model)
    train(model)
    # NOTE: this is required for the ``fork`` method to work

    # model = DQN().to(device)
    # num_processes = 2
    # model.share_memory()
    # processes = []
    # for rank in range(num_processes):
    #     p = mp.Process(target=train, args=(model,))
    #     p.start()
    #     processes.append(p)
    # for p in processes:
    #     p.join()
Example no. 7
  def __init__(self, hparams, enc=False, n_layer=-1):
    super(RelativeMultiHeadAttn, self).__init__()

    self.hparams = hparams
    self.set_sep = (n_layer in self.hparams.sep_layer) and enc
    self.enc = enc
    self.n_layer = n_layer
    #self.layer_norm = LayerNormalization(hparams.d_model, hparams)
    self.layer_norm = torch.nn.LayerNorm(hparams.d_model)
    self.temp = np.power(hparams.d_model, 0.5)
    self.softmax = nn.Softmax(dim=2)
    self.pos_emb = PositionalEmbedding(hparams)
    self.dropout = nn.Dropout(hparams.dropout)
    # projection of concatenated attn
    n_heads = self.hparams.n_heads
    d_model = self.hparams.d_model
    d_q = self.hparams.d_k
    d_k = self.hparams.d_k
    d_v = self.hparams.d_v

    self.q = nn.Linear(d_model, n_heads * d_q, bias=False)
    self.k = nn.Linear(d_model, n_heads * d_k, bias=False)
    self.v = nn.Linear(d_model, n_heads * d_v, bias=False)
    init_param(self.q.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.k.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.v.weight, init_type="uniform", init_range=hparams.init_range)

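    # Optional per-language head weights (one nn.Linear per language in lan_size),
    # only created on the encoder side.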
    if self.hparams.sep_head_weight and self.enc:
      self.head_w = []
      for i in range(self.hparams.lan_size):
        h_w = nn.Linear(d_model, n_heads, bias=False)
        init_param(h_w.weight, init_type="uniform", init_range=hparams.init_range)
        self.head_w.append(h_w)
      self.head_w = nn.ModuleList(self.head_w)
      if self.hparams.cuda: self.head_w = self.head_w.cuda()
    
    if self.enc and self.n_layer < self.hparams.max_loc_layer:
      self.r = []

      r = nn.Linear(d_model, n_heads * d_v, bias=False)
      init_param(r.weight, init_type="uniform", init_range=hparams.init_range)
      self.r.append(r)
      self.r = nn.ModuleList(self.r)

    if self.hparams.cuda:
      self.q = self.q.cuda()
      self.k = self.k.cuda()
      self.v = self.v.cuda()
      if self.enc and self.n_layer < self.hparams.max_loc_layer:
        self.r = self.r.cuda()
    if self.hparams.relative_pos_c:
      ub = nn.Linear(d_q, 1, bias=False)
      init_param(ub.weight, init_type="uniform", init_range=hparams.init_range)
      self.ub = ub
    if self.hparams.relative_pos_d and (self.enc and self.n_layer < self.hparams.max_loc_layer):
      self.vb = []
      vb = nn.Linear(d_q, 1, bias=False)
      init_param(vb.weight, init_type="uniform", init_range=hparams.init_range)
      self.vb.append(vb)
      self.vb = nn.ModuleList(self.vb)
      if self.hparams.cuda: self.vb = self.vb.cuda()

    self.w_proj = nn.Linear(n_heads * d_v, d_model, bias=False)
    init_param(self.w_proj.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.cuda:
      self.w_proj = self.w_proj.cuda()
      if self.hparams.relative_pos_c:
        self.ub = self.ub.cuda()
Example no. 8
def train_continuous_mnist(args, model, device, train_loader, test_loader):
    ava_test = []
    weight_lst = utils.weight_lst(model)
    w_mat_lst, m_mat_lst, a_mat_lst, b_mat_lst, avg_psi_mat_lst, e_a_mat_lst, e_b_mat_lst = \
        utils.init_param(weight_lst, args.s_init, device, True, args.alpha)
    for task in range(len(test_loader)):
        for epoch in range(1, args.epochs + 1):
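            # NOTE: every task trains on batches from train_loader[0]; only the test loaders are indexed by task.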
            for batch_idx, (data, target) in enumerate(train_loader[0]):
                model.train()
                data, target = data.to(device), target.to(device)
                data = data.view(-1, 784)
                for mc_iter in range(args.train_mc_iters):
                    # Phi ~ MN(0,I,I)
                    phi_mat_lst = utils.gen_phi(w_mat_lst, device)
                    # W = M +B*Phi*A^t
                    utils.randomize_weights(weight_lst, w_mat_lst, m_mat_lst,
                                            a_mat_lst, b_mat_lst, phi_mat_lst)
                    output = model(data)
                    criterion = nn.CrossEntropyLoss()
                    loss = args.batch_size * criterion(output, target)
                    utils.zero_grad(weight_lst)
                    loss.backward()
                    grad_mat_lst = utils.weight_grad(weight_lst, device)
                    utils.aggregate_grads(args, avg_psi_mat_lst, grad_mat_lst)
                    utils.aggregate_e_a(args, e_a_mat_lst, grad_mat_lst,
                                        b_mat_lst, phi_mat_lst)
                    utils.aggregate_e_b(args, e_b_mat_lst, grad_mat_lst,
                                        a_mat_lst, phi_mat_lst)
                # M = M - B*B^t*avg_Phi*A*A^t
                utils.update_m(m_mat_lst, a_mat_lst, b_mat_lst,
                               avg_psi_mat_lst, args.eta)
                utils.update_a_b(a_mat_lst, b_mat_lst, e_a_mat_lst,
                                 e_b_mat_lst, device, args.use_gsvd)
                utils.zero_matrix(avg_psi_mat_lst, e_a_mat_lst, e_b_mat_lst)
            model.eval()
            with torch.no_grad():
                correct = 0
                for data, target in test_loader[task]:
                    data, target = data.to(device), target.to(device)
                    data = data.view(-1, 784)
                    for mc_iter in range(args.train_mc_iters):
                        phi_mat_lst = utils.gen_phi(w_mat_lst, device)
                        utils.randomize_weights(weight_lst, w_mat_lst,
                                                m_mat_lst, a_mat_lst,
                                                b_mat_lst, phi_mat_lst)
                        output = model(data)
                        pred = output.argmax(
                            dim=1, keepdim=True
                        )  # get the index of the max log-probability
                        correct += pred.eq(target.view_as(pred)).sum().item()
                test_acc = 100. * correct / (len(test_loader[task].dataset) *
                                             args.train_mc_iters)
            print(
                '\nTask num {}, Epoch num {} Test Accuracy: {:.2f}%\n'.format(
                    task, epoch, test_acc))
        test_acc_lst = []
        for i in range(task + 1):
            model.eval()
            with torch.no_grad():
                correct = 0
                for data, target in test_loader[i]:
                    data, target = data.to(device), target.to(device)
                    data = data.view(-1, 784)
                    for mc_iter in range(args.train_mc_iters):
                        phi_mat_lst = utils.gen_phi(w_mat_lst, device)
                        utils.randomize_weights(weight_lst, w_mat_lst,
                                                m_mat_lst, a_mat_lst,
                                                b_mat_lst, phi_mat_lst)
                        output = model(data)
                        pred = output.argmax(
                            dim=1, keepdim=True
                        )  # get the index of the max log-probability
                        correct += pred.eq(target.view_as(pred)).sum().item()
                test_acc = 100. * correct / (len(test_loader[i].dataset) *
                                             args.train_mc_iters)
                test_acc_lst.append(test_acc)
            print('\nTraining task num: {} Test Accuracy of task {}: {:.2f}%\n'.
                  format(task, i, test_acc))
        print(test_acc_lst)
        ava_test.append(np.average(np.asanyarray(test_acc_lst)))
    return ava_test