Code example #1
File: optimizer.py Project: leiloong/kge-rl
 def nll(self,scores,volatile):
     y_pos = util.to_var(np.ones(scores.size()[0],dtype='float32'),volatile=volatile)
     y_neg = util.to_var(-1.*np.ones(scores.size()[0], dtype='float32'),volatile=volatile)
     loss = self.logistic(scores[:, 0],y_pos)
     for i in range(1,scores.size()[1]):
         loss += self.logistic(scores[:, i], y_neg)
     return loss/scores.size()[1]
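
Note: every snippet on this page calls a to_var helper that is defined elsewhere in each project, and the exact signature varies from project to project (some versions take volatile, others requires_grad or to_float). A minimal sketch consistent with the pre-0.4 PyTorch usage above, offered as an assumption rather than any project's actual code:

import numpy as np
import torch
from torch.autograd import Variable

def to_var(x, volatile=False, requires_grad=False):
    # Hypothetical reconstruction: wrap a numpy array or tensor in an
    # autograd Variable and move it to the GPU when one is available.
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile, requires_grad=requires_grad)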
Code example #2
 def logistic(self,scores):
     y_pos = util.to_var(np.ones(scores.size()[0],dtype='float32'),requires_grad=False)
     y_neg = util.to_var(np.zeros(scores.size()[0], dtype='float32'),requires_grad=False)
     loss = self.bce(scores[:, 0],y_pos)
     for i in range(1,scores.size()[1]):
         loss += self.bce(scores[:, i], y_neg)
     return loss/scores.size()[1]
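
Neither self.logistic nor self.bce is defined in the two snippets above. Given the ±1 targets in the first and the 0/1 targets in the second, plausible initializations (assumptions inferred from the call sites, not taken from the source) would be:

import torch.nn as nn

class Losses:  # hypothetical container mirroring the methods above
    def __init__(self):
        self.logistic = nn.SoftMarginLoss()  # expects targets in {-1, +1}
        self.bce = nn.BCEWithLogitsLoss()    # expects targets in {0, 1}; nn.BCELoss() if scores are already probabilities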
Code example #3
def mixup(image, label, num_classes):
    alpha = 1.0

    rand_idx = torch.randperm(label.shape[0])
    image2 = image[rand_idx].clone()
    label2 = label[rand_idx].clone()

    y_one_hot = torch.eye(num_classes, device='cuda')[label].clone()
    y2_one_hot = torch.eye(num_classes, device='cuda')[label2].clone()
    mix_rate = np.random.beta(alpha, alpha, image.shape[0])

    mix_rate2 = None
    if image.ndim == 2:
        mix_rate2 = util.to_var(
            torch.from_numpy(mix_rate.reshape((image.shape[0], 1))).float())
    elif image.ndim == 4:
        mix_rate2 = util.to_var(
            torch.from_numpy(mix_rate.reshape(
                (image.shape[0], 1, 1, 1))).float())

    mix_rate = util.to_var(
        torch.from_numpy(mix_rate.reshape((image.shape[0], 1))).float())

    x_mixed = image.clone() * mix_rate2 + image2.clone() * (1 - mix_rate2)
    y_soft = y_one_hot * mix_rate + y2_one_hot * (1 - mix_rate)

    # util.save_images(x_mixed)

    return x_mixed, y_soft
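
A hypothetical call, assuming a CUDA machine and a batch of 32 ten-class images (the tensors below are placeholders, not data from the source project):

import torch

images = torch.randn(32, 3, 32, 32, device='cuda')
labels = torch.randint(0, 10, (32,), device='cuda')
mixed, soft = mixup(images, labels, num_classes=10)
# mixed: (32, 3, 32, 32) blended images; soft: (32, 10) rows summing to 1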
Code example #4
    def init_hidden(self, num_layers, batch_size, hidden_dim):
        """
        initialize h0, c0
        """
        h = util.to_var(torch.zeros(num_layers, batch_size, hidden_dim))
        c = util.to_var(torch.zeros(num_layers, batch_size, hidden_dim))

        return h, c
Code example #5
 def output(self, entities, rels, is_target):
     entities = self.entities(util.to_var(entities, True)).unsqueeze(2)
     rels = self.rels(util.to_var(rels, True))
     if is_target:
         out = entities + rels
     else:
         out = entities - rels
     return out.view(-1, out.size()[1] * out.size()[2]).data.cpu().numpy()
Code example #6
    def sample_old(self, n=4, z=None, max_length=60):


        if z is None:
            z = to_var(torch.randn([n, self.latent_size]))

        batch_size, l = z.size()

        hidden = self.tohidden(z)
        hidden = hidden.view(2, batch_size, self.hidden_size)

        # required for dynamic stopping of sentence generation
        sequence_idx     = torch.arange(0, batch_size, out=self.tensor()).long() # all idx of batch
        sequence_running = torch.arange(0, batch_size, out=self.tensor()).long() # all idx of batch which are still generating
        sequence_mask    = torch.ones(batch_size, out=self.tensor()).byte()

        running_seqs = torch.arange(0, batch_size, out=self.tensor()).long() # idx of still generating sequences with respect to current loop

        generations = self.tensor(batch_size, max_length).fill_(PAD).long()

        t=0
        while t < max_length and len(running_seqs) > 0:

            if t == 0:
                input_sequence = to_var(torch.Tensor(batch_size).fill_(SOS).long())

            input_sequence = input_sequence.unsqueeze(1)

            input_embedding = self.embedding(input_sequence)

            output, hidden = self.decoder_rnn(input_embedding, hidden)

            logits = self.outputs2vocab(output)

            input_sequence = self._sample_old(logits)

            # save next input
            generations = self._save_sample(generations, input_sequence, sequence_running, t)

            # update global running sequence
            sequence_mask[sequence_running] = (input_sequence != EOS).data
            sequence_running = sequence_idx.masked_select(sequence_mask)

            # update local running sequences
            running_mask = (input_sequence != EOS).data
            running_seqs = running_seqs.masked_select(running_mask)

            # prune input and hidden state according to local update
            if len(running_seqs) > 0:

                input_sequence = input_sequence[running_seqs]
                hidden = hidden[:, running_seqs]

                running_seqs = torch.arange(0, len(running_seqs), out=self.tensor()).long()

            t += 1

        return generations
Code example #7
	def eval_epoch(self, model, data_loader, criterion):
		total_loss = 0.
		total_words = 0.
		for i, (data, target) in enumerate(data_loader):
			x, y = util.to_var(data, volatile=True), util.to_var(target, volatile=True) # x: (None, sequence_len + 1), y: (None, sequence_len + 1); volatile since no backward pass is needed
			logits = model(x) # (None, vocab_size, sequence_len+1)
			loss = criterion(logits, y)
			total_loss += loss.data.cpu()[0]
			total_words += x.size(0) * x.size(1)
		return total_loss / total_words
Code example #8
def train():

    train_file_idxs = np.arange(0, len(TRAIN_FILES))
    np.random.shuffle(train_file_idxs)
    model = get_model()
    print(model)
    dtype = torch.FloatTensor
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    losses = []
    # writer = SummaryWriter()
    global_step = 0
    for epoch in range(num_epochs):
        running_loss = 0
        num_total_btches = 0
        for i in range(len(TRAIN_FILES)):
            train_data, current_labels = util.loadDataFile(
                TRAIN_FILES[train_file_idxs[i]])
            train_data = util.to_var(torch.from_numpy(train_data))
            current_labels = util.to_var(torch.from_numpy(current_labels))
            num_batches = train_data.shape[0] // args.batch_size
            print('Training file: {:5d} |num of batches: {:5d}'.format(
                i, num_batches))
            for btch in range(num_batches):
                # print('Batch [{:5d}/{:5d}]'.format(btch, num_batches))
                optimizer.zero_grad()
                start_idx = btch * args.batch_size
                end_idx = (btch + 1) * args.batch_size
                current_train = train_data[start_idx:end_idx, :, :]
                btch_label = current_labels[start_idx:end_idx, :].type(
                    torch.long).cuda()
                # pdb.set_trace()
                logits = model(current_train)
                # pdb.set_trace()
                loss = criterion(logits, btch_label.view(-1))
                loss.backward()
                optimizer.step()
                preds = F.log_softmax(logits, 1)
                pred_choice = preds.data.max(1)[1]
                correct = pred_choice.eq(btch_label.data).cpu().sum()
                running_loss += loss.item() * args.batch_size
                losses.append(loss.item())
                # writer.add_scalar('loss',loss.item(), global_step)
                # writer.add_graph(model,current_train)
                if btch % args.print_every == 0:
                    print(
                        'Epoch [{:5d}/{:5d}] | loss: {:6.4f} | accuracy:{:6.4f}'
                        .format(epoch + 1, num_epochs, loss.item(),
                                correct.item() / float(args.batch_size)))
                global_step += 1
                num_total_btches += 1
        total_training_samples = num_total_btches * args.batch_size
        print("Training loss {:6.4f}".format(running_loss /
                                             total_training_samples))
Code example #9
 def output(self, entities, rels, is_target):
     entities_i = self.entities_i(util.to_var(entities, True)).unsqueeze(2)
     rels_i = self.rels_i(util.to_var(rels, True))
     entities = self.entities(util.to_var(entities, True)).unsqueeze(2)
     rels = self.rels(util.to_var(rels, True))
     if is_target:
         out = torch.mul(entities,rels) + torch.mul(entities_i,rels) \
               + torch.mul(rels_i,entities) - torch.mul(rels_i,entities_i)
     else:
         out = torch.mul(entities, rels) + torch.mul(entities_i, rels) \
               + torch.mul(rels_i, entities_i) - torch.mul(rels_i, entities)
     return out.squeeze(2).data.cpu().numpy()
Code example #10
	def train_epoch(self, model, data_loader, criterion, optim):
		total_loss = 0.
		total_words = 0.
		for i, (data, target) in enumerate(data_loader):
			optim.zero_grad()
			x, y = util.to_var(data), util.to_var(target) # x: (None, sequence_len + 1), y: (None, sequence_len + 1)
			logits = model(x) # (None, vocab_size, sequence_len+1)
			loss = criterion(logits, y)
			total_loss += loss.data.cpu()[0]
			total_words += x.size(0) * x.size(1)
			loss.backward()
			optim.step()
		return total_loss / total_words
Code example #11
    def update(self):
        # Do not update if too few transitions have been stored
        if self.memory_counter <= 5000:
            return

        # Copy the evaluate network's parameters into the target network
        self.softCopy()

        # Choose the batch of input data
        if self.memory_counter > self.memory_size:
            sample_idx = np.random.choice(self.memory_size,
                                          size=self.batch_size)
        else:
            sample_idx = np.random.choice(self.memory_counter,
                                          size=self.batch_size)

        # Fetch the training data from the replay memory
        batch_data = self.memory[sample_idx, :]
        batch_s = batch_data[:, :self.n_state]
        batch_a = batch_data[:, self.n_state:self.n_state + self.n_action]
        batch_r = batch_data[:, -self.n_state - 1:-self.n_state]
        batch_s_ = batch_data[:, -self.n_state:]

        # Convert to PyTorch variables
        batch_s = to_var(batch_s)
        batch_a = to_var(batch_a)
        batch_r = to_var(batch_r)
        batch_s_ = to_var(batch_s_)

        # Compute the target Q value with the target network
        next_q_target = self.target_critic(batch_s_,
                                           self.target_actor(batch_s_))
        q_target = batch_r + self.gamma * next_q_target

        # Update the critic
        self.critic_optimizer.zero_grad()
        q_batch = self.eval_critic(batch_s, batch_a)
        value_loss = F.mse_loss(input=q_batch, target=q_target)
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor
        self.actor_optimizer.zero_grad()
        policy_loss = -self.eval_critic(batch_s,
                                        self.eval_actor(batch_s)).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Decay the range of random action exploration
        self.var *= .9995
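
For orientation, a hypothetical Gym-style driver loop for the DDPG agent these methods belong to (env, agent, and storeTransition are all assumptions; none of them appear on this page):

import gym

env = gym.make('Pendulum-v0')  # actions in [-2, 2], matching the clip in chooseAction (code example #18 below)
s = env.reset()
for step in range(20000):
    a = agent.chooseAction(s)           # agent: instance of the DDPG class above, construction not shown
    s_, r, done, _ = env.step(a)
    agent.storeTransition(s, a, r, s_)  # assumed helper that writes into self.memory
    agent.update()                      # no-op until 5000 transitions are stored
    s = env.reset() if done else s_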
Code example #12
	def train_gan(self, backend):

		rollout = Rollout(self.generator, self.discriminator, self.update_rate)
		print('\nStart Adversarial Training......')
		gen_optim, dis_optim = torch.optim.Adam(self.generator.parameters(), self.lr), torch.optim.Adam(self.discriminator.parameters(), self.lr)
		dis_criterion = util.to_cuda(nn.BCEWithLogitsLoss(size_average=False))
		gen_criterion = util.to_cuda(nn.CrossEntropyLoss(size_average=False, reduce=True))

		for epoch in range(self.gan_epochs):

			start = time.time()
			for _ in range(1):
				samples = self.generator.sample(self.batch_size, self.sequence_len) # (batch_size, sequence_len)
				zeros = util.to_var(torch.zeros(self.batch_size, 1).long()) # (batch_size, 1)
				inputs = torch.cat([samples, zeros], dim=1)[:, :-1] # (batch_size, sequence_len)
				rewards = rollout.reward(samples, 16) # (batch_size, sequence_len)
				rewards = util.to_var(torch.from_numpy(rewards))
				logits = self.generator(inputs) # (None, vocab_size, sequence_len)
				pg_loss = self.pg_loss(logits, samples, rewards)
				gen_optim.zero_grad()
				pg_loss.backward()
				gen_optim.step()

			print('generator updated via policy gradient......')

			if epoch % 10 == 0:
				util.generate_samples(self.generator, self.batch_size, self.sequence_len, self.generate_sum, self.eval_file)
				eval_data = GenData(self.eval_file)
				eval_data_loader = DataLoader(eval_data, batch_size=self.batch_size, shuffle=True, num_workers=8)
				loss = self.eval_epoch(self.target_lstm, eval_data_loader, gen_criterion)
				print('epoch: [{0:d}], true loss: [{1:.4f}]'.format(epoch, loss))



			for _ in range(1):
				util.generate_samples(self.generator, self.batch_size, self.sequence_len, self.generate_sum, self.fake_file)
				dis_data = DisData(self.real_file, self.fake_file)
				dis_data_loader = DataLoader(dis_data, batch_size=self.batch_size, shuffle=True, num_workers=8)
				for _ in range(1):
					loss = self.train_epoch(self.discriminator, dis_data_loader, dis_criterion, dis_optim)

			print('discriminator updated via gan loss......')

			rollout.update_params()

			end = time.time()

			print('time: [{:.3f}s/epoch] in {}'.format(end-start, backend))
Code example #13
File: encoder.py Project: zoey-wong/conv-emotion
    def init_h(self, batch_size=None, hidden=None):
        """Return RNN initial state"""
        if hidden is not None:
            return hidden

        if self.use_lstm:
            return (to_var(
                torch.zeros(self.num_layers * self.num_directions, batch_size,
                            self.hidden_size)),
                    to_var(
                        torch.zeros(self.num_layers * self.num_directions,
                                    batch_size, self.hidden_size)))
        else:
            return to_var(
                torch.zeros(self.num_layers * self.num_directions, batch_size,
                            self.hidden_size))
Code example #14
    def sample(self, batch_size, sequence_len, x=None):

        flag = False

        if x is None:
            x = util.to_var(torch.zeros(batch_size, 1).long())
            flag = True

        h, c = self.init_hidden(self.num_layers, batch_size, self.hidden_dim)
        samples = []
        if flag:
            for _ in range(sequence_len):
                logits, h, c = self.step(x, h, c)
                probs = F.softmax(logits, dim=1)
                sample = probs.multinomial(1)  # (batch_size, 1)
                samples.append(sample)
        else:
            given_len = x.size(1)
            lis = x.chunk(x.size(1), dim=1)
            for i in range(given_len):
                logits, h, c = self.step(lis[i], h, c)
                samples.append(lis[i])
            x = F.softmax(logits, dim=1).multinomial(1)
            for i in range(given_len, sequence_len):
                samples.append(x)
                logits, h, c = self.step(x, h, c)
                x = F.softmax(logits, dim=1).multinomial(1)
        output = torch.cat(samples, 1)
        return output  # (batch_size, sequence_len)
Code example #15
File: optimizer.py Project: leiloong/kge-rl
 def softmax(self,scores,volatile):
     # Need to recreate y every time because batch sizes may vary, though one could cache it per batch size
     # Even with a very large batch size the effect is negligible
     y = np.zeros(scores.size()[0],dtype='int')
     y = util.to_var(y,volatile=volatile)
     loss = self.cross_ent(scores,y)
     return loss
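
self.cross_ent is not shown either; since the positive triple's score sits in column 0 and y is all zeros, it is presumably a softmax cross-entropy over the score columns. An assumed initialization:

import torch.nn as nn

# Assumed (stored as self.cross_ent in the class above): softmax over
# score columns with class 0, the positive triple, as the label.
cross_ent = nn.CrossEntropyLoss()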
Code example #16
    def update(self):
        # Do not update if too few transitions have been stored
        if self.memory_counter < self.batch_size:
            return

        # Copy all parameters from the evaluate network into the target network
        if self.learn_step_counter % self.update_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Randomly pick batch_size transitions from the replay memory for the update
        if self.had_fill_memory:
            sample_idx = np.random.choice(self.memory_size, self.batch_size)
        else:
            sample_idx = np.random.choice(self.memory_counter, self.batch_size)

        # Randomly sample transitions from the replay memory (experience replay)
        batch_memory = self.memory[sample_idx, :]
        batch_s = to_var(batch_memory[:, :self.n_state])
        batch_a = to_var(batch_memory[:, self.n_state:self.n_state +
                                      1].astype(int),
                         to_float=False)
        batch_r = to_var(batch_memory[:, self.n_state + 1:self.n_state + 2])
        batch_s_ = to_var(batch_memory[:, -self.n_state:])

        # Move to the GPU
        if torch.cuda.is_available():
            batch_s = batch_s.cuda()
            batch_a = batch_a.cuda()
            batch_r = batch_r.cuda()
            batch_s_ = batch_s_.cuda()

        # ------------------------------------------------
        #                      Update
        # 1. Predict with eval_net and take the V value at time t+1
        # 2. Predict with target_net to get the V value at time t+2
        # 3. Assume the t+2 value is the correct answer and plug it into the DQN update formula
        # 4. Update the parameters
        # ------------------------------------------------
        q_eval = self.eval_net(batch_s).gather(1, batch_a)  # 1
        q_next = self.target_net(batch_s_).detach()  # 2
        q_target = batch_r + self.gamma * q_next.max(1)[0].view(
            self.batch_size, 1)  # 3
        loss = self.criterion(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()  # 4
        self.optimizer.step()
Code example #17
 def output(self, entities, rels, is_target):
     '''
     Given source entities and rels, output the target vectors; or given target entities and rels, output the source vectors
     :param entities: source or target entity ids
     :param rels: rel ids
     :param is_target: True for predicting targets
     :return: 
     '''
     entities = self.entities(util.to_var(entities, True)).unsqueeze(2)
     rels = self.rels(util.to_var(rels, True))
     # Reshape rels
     rels = rels.view(-1, self.dim, self.dim)
     if is_target:
         out = torch.bmm(torch.transpose(entities, 1, 2), rels)
     else:
         out = torch.bmm(rels, entities)
     out = out.view(-1, out.size()[1] * out.size()[2])
     return out.data.cpu().numpy()
Code example #18
 def chooseAction(self, s):
     """
          Given an input state, the evaluate actor outputs a real-valued action in [-1, 1]
     """
     s = to_var(s)
     a = self.eval_actor(s)
     a = a.cpu().data.numpy()
     if self.var > 0:
         a = np.clip(np.random.normal(a, self.var), -2, 2)
     return a
Code example #19
def random_noise(image):
    noise_scale = 0.001
    noise = np.random.randn(
        np.array(image.data).shape[0],
        np.array(image.data).shape[1],
        np.array(image.data).shape[2],
        np.array(image.data).shape[3])  # generate the noise

    image = image + util.to_var(torch.from_numpy(noise_scale * noise).float())

    # util.save_images(image)

    return image
Code example #20
 def chooseAction(self, x):
     """
         Decide the action with the network;
         only very rarely pick it at random
     """
     x = to_var(x)
     x = x.cuda() if torch.cuda.is_available() else x
     if np.random.uniform() > self.epsilon:  # use the DQN to choose the action
         action_value = self.eval_net(x)
         _, action = torch.max(action_value, 0)
         action = action[0].cpu().data.numpy()[0]
     else:  # choose an action at random
         action = np.random.randint(0, self.n_action)
     return action
Code example #21
    def sample(self, batch_size, sequence_len):

        x = util.to_var(torch.zeros(batch_size, 1).long())
        h, c = self.init_hidden(self.num_layers, batch_size, self.hidden_dim)

        samples = []
        for _ in range(sequence_len):
            logits, h, c = self.step(x, h, c)
            probs = F.softmax(logits, dim=1)
            sample = probs.multinomial(1)  # (batch_size, 1)
            samples.append(sample)

        output = torch.cat(samples, 1)
        return output  # (batch_size, sequence_len)
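
A hypothetical call on a generator exposing this method (the constructor is project-specific and omitted; gen stands in for an instance):

# gen: instance of the LSTM generator class these methods belong to
tokens = gen.sample(batch_size=8, sequence_len=20)
print(tokens.size())  # torch.Size([8, 20]) of sampled token ids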
Code example #22
    def sample(self, z=None, n=4, max_length=60, temperature=1.0):
        """

        :param z: Batch of latent vectors
        :param n: Ignored if z is given
        :param max_length:
        :param temperature:
        :return:
        """

        if z is None:
            z = to_var(torch.randn([n, self.latent_size]))

        b, l = z.size()

        hidden = self.tohidden(z)

        input = self.tensor(b, max_length).fill_(PAD).long()
        input[:, 0] = SOS

        for t in range(max_length - 1):

            input_embedding = self.embedding(input)

            # input_embedding = rnn_utils.pack_padded_sequence(input_embedding, [t+1] * b,
            #                                             batch_first=True)

            output, _ = self.decoder_rnn(input_embedding, hidden.unsqueeze(0))

            # output = rnn_utils.pad_packed_sequence(output, batch_first=True)[0]

            logits = self.outputs2vocab(output)

            current = logits[:, t, :] # logits for the current step
            input[:, t+1] = util.sample_logits(current, temperature)

        return input
Code example #23
File: models.py Project: zoey-wong/conv-emotion
    def forward(self, input_sentences, input_sentence_length,
                input_conversation_length, input_masks):
        """
        Args:
            input_sentences: (Variable, LongTensor) [num_sentences, seq_len]
            target_sentences: (Variable, LongTensor) [num_sentences, seq_len]
        Return:
            decoder_outputs: (Variable, FloatTensor)
                - train: [batch_size, seq_len, vocab_size]
                - eval: [batch_size, seq_len]
        """
        num_sentences = input_sentences.size(0)
        max_len = input_conversation_length.max().item()

        # encoder_outputs: [num_sentences, max_source_length, hidden_size * direction]
        # encoder_hidden: [num_layers * direction, num_sentences, hidden_size]
        # encoder_outputs, encoder_hidden = self.encoder(input_sentences,
        #                                                input_sentence_length)
        all_encoder_layers, _ = self.encoder(input_sentences,
                                             token_type_ids=None,
                                             attention_mask=input_masks)

        bert_output = []
        for idx in range(self.config.num_bert_layers):
            layer = all_encoder_layers[idx]
            bert_output.append(layer[:, 0, :])
        bert_output = torch.stack(bert_output, dim=1)
        bert_output = torch.mean(bert_output, dim=1, keepdim=False)

        # encoder_hidden: [num_sentences, num_layers * direction * hidden_size]
        encoder_hidden = bert_output

        # pad and pack encoder_hidden
        start = torch.cumsum(
            torch.cat((to_var(input_conversation_length.data.new(1).zero_()),
                       input_conversation_length[:-1])), 0)

        # encoder_hidden: [batch_size, max_len, num_layers * direction * hidden_size]
        encoder_hidden = torch.stack([
            pad(encoder_hidden.narrow(0, s, l), max_len) for s, l in zip(
                start.data.tolist(), input_conversation_length.data.tolist())
        ], 0)

        # context_outputs: [batch_size, max_len, context_size]
        context_outputs, context_last_hidden = self.context_encoder(
            encoder_hidden, input_conversation_length)

        # flatten outputs
        # context_outputs: [num_sentences, context_size]
        context_outputs = torch.cat([
            context_outputs[i, :l, :]
            for i, l in enumerate(input_conversation_length.data)
        ])

        context_outputs = self.dropoutLayer(context_outputs)

        # project context_outputs to decoder init state
        decoder_init = self.context2decoder(context_outputs)

        output = self.decoder2output(decoder_init)

        return output
Code example #24
 def max_margin(self,scores):
     y = util.to_var(np.ones(scores.size()[0],dtype='float32'), requires_grad=False)
     loss = self.mm(scores[:,0],scores[:,1],y)
     for i in range(2,scores.size()[1]):
         loss += self.mm(scores[:,0],scores[:,i],y)
     return loss/(scores.size()[1]-1.)
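
self.mm is not defined in this snippet, but its call signature matches torch's margin ranking criterion, so a plausible initialization (an assumption; the margin value is also assumed) is:

import torch.nn as nn

# Assumed: with y = +1, the criterion pushes the positive score in
# scores[:, 0] above each negative column by at least the margin.
mm = nn.MarginRankingLoss(margin=1.0)  # stored as self.mm in the class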
Code example #25
File: solver.py Project: zoey-wong/conv-emotion
    def evaluate(self, data_loader, mode=None):
        assert (mode is not None)

        self.model.eval()
        batch_loss_history, predictions, ground_truth = [], [], []
        for batch_i, (conversations, labels, conversation_length,
                      sentence_length, type_ids,
                      masks) in enumerate(data_loader):
            # conversations: (batch_size) list of conversations
            #   conversation: list of sentences
            #   sentence: list of tokens
            # conversation_length: list of int
            # sentence_length: (batch_size) list of conversation list of sentence_lengths

            input_conversations = conversations

            # flatten input and target conversations
            input_sentences = [
                sent for conv in input_conversations for sent in conv
            ]
            input_labels = [label for conv in labels for label in conv]
            input_sentence_length = [
                l for len_list in sentence_length for l in len_list
            ]
            input_conversation_length = [l for l in conversation_length]
            input_masks = [mask for conv in masks for mask in conv]
            orig_input_labels = input_labels

            with torch.no_grad():
                # transferring the input to cuda
                input_sentences = to_var(torch.LongTensor(input_sentences))
                input_labels = to_var(torch.LongTensor(input_labels))
                input_sentence_length = to_var(
                    torch.LongTensor(input_sentence_length))
                input_conversation_length = to_var(
                    torch.LongTensor(input_conversation_length))
                input_masks = to_var(torch.LongTensor(input_masks))

            sentence_logits = self.model(input_sentences,
                                         input_sentence_length,
                                         input_conversation_length,
                                         input_masks)

            present_predictions = list(
                np.argmax(sentence_logits.detach().cpu().numpy(), axis=1))

            loss_function = nn.CrossEntropyLoss()
            batch_loss = loss_function(sentence_logits, input_labels)

            predictions += present_predictions
            ground_truth += orig_input_labels

            assert not isnan(batch_loss.item())
            batch_loss_history.append(batch_loss.item())

        epoch_loss = np.mean(batch_loss_history)

        print_str = f'{mode} loss: {epoch_loss:.3f}\n'

        w_f1_score = self.print_metric(ground_truth, predictions, mode)
        return epoch_loss, w_f1_score, predictions
Code example #26
def main(args):

    # cfg_file = os.path.join(args.example_config_path, args.primitive) + ".yaml"
    cfg = get_vae_defaults()
    # cfg.merge_from_file(cfg_file)
    cfg.freeze()

    batch_size = args.batch_size
    dataset_size = args.total_data_size

    if args.experiment_name is None:
        experiment_name = args.model_name
    else:
        experiment_name = args.experiment_name

    if not os.path.exists(os.path.join(args.log_dir, experiment_name)):
        os.makedirs(os.path.join(args.log_dir, experiment_name))

    description_txt = input('Please enter experiment notes: \n')
    if isinstance(description_txt, str):
        with open(
                os.path.join(args.log_dir, experiment_name,
                             experiment_name + '_description.txt'), 'w') as f:
            f.write(description_txt)

    writer = SummaryWriter(os.path.join(args.log_dir, experiment_name))

    # torch_seed = np.random.randint(low=0, high=1000)
    # np_seed = np.random.randint(low=0, high=1000)
    torch_seed = 0
    np_seed = 0

    torch.manual_seed(torch_seed)
    np.random.seed(np_seed)

    trained_model_path = os.path.join(args.model_path, args.model_name)
    if not os.path.exists(trained_model_path):
        os.makedirs(trained_model_path)

    if args.task == 'contact':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7

        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7

        if args.skill_type == 'pull':
            # + 7 because single arm palm pose
            input_dim = start_dim + goal_dim + 7
        else:
            # + 14 because both arms palm pose
            input_dim = start_dim + goal_dim + 14
        output_dim = 7
        decoder_input_dim = start_dim + goal_dim

        vae = VAE(input_dim,
                  output_dim,
                  args.latent_dimension,
                  decoder_input_dim,
                  hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                  lr=args.learning_rate)
    elif args.task == 'goal':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7

        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7

        input_dim = start_dim + goal_dim
        output_dim = goal_dim
        decoder_input_dim = start_dim
        vae = GoalVAE(input_dim,
                      output_dim,
                      args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    elif args.task == 'transformation':
        input_dim = args.input_dimension
        output_dim = args.output_dimension
        decoder_input_dim = args.input_dimension - args.output_dimension
        vae = GoalVAE(input_dim,
                      output_dim,
                      args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    else:
        raise ValueError('training task not recognized')

    if torch.cuda.is_available():
        vae.encoder.cuda()
        vae.decoder.cuda()

    if args.start_epoch > 0:
        start_epoch = args.start_epoch
        num_epochs = args.num_epochs
        fname = os.path.join(
            trained_model_path,
            args.model_name + '_epoch_%d.pt' % args.start_epoch)
        torch_seed, np_seed = load_seed(fname)
        load_net_state(vae, fname)
        load_opt_state(vae, fname)
        args = load_args(fname)
        args.start_epoch = start_epoch
        args.num_epochs = num_epochs
        torch.manual_seed(torch_seed)
        np.random.seed(np_seed)

    data_dir = args.data_dir
    data_loader = DataLoader(data_dir=data_dir)

    data_loader.create_random_ordering(size=dataset_size)

    dataset = data_loader.load_dataset(start_rep=args.start_rep,
                                       goal_rep=args.goal_rep,
                                       task=args.task)

    total_loss = []
    start_time = time.time()
    print('Saving models to: ' + trained_model_path)
    kl_weight = 1.0
    print('Starting on epoch: ' + str(args.start_epoch))

    for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs):
        print('Epoch: ' + str(epoch))
        epoch_total_loss = 0
        epoch_kl_loss = 0
        epoch_pos_loss = 0
        epoch_ori_loss = 0
        epoch_recon_loss = 0
        kl_coeff = 1 - kl_weight
        kl_weight = args.kl_anneal_rate * kl_weight
        print('KL coeff: ' + str(kl_coeff))
        for i in range(0, dataset_size, batch_size):
            vae.optimizer.zero_grad()

            input_batch, decoder_input_batch, target_batch = \
                data_loader.sample_batch(dataset, i, batch_size)
            input_batch = to_var(torch.from_numpy(input_batch))
            decoder_input_batch = to_var(torch.from_numpy(decoder_input_batch))

            z, recon_mu, z_mu, z_logvar = vae.forward(input_batch,
                                                      decoder_input_batch)
            kl_loss = vae.kl_loss(z_mu, z_logvar)

            if args.task == 'contact':
                output_r, output_l = recon_mu
                if args.skill_type == 'grasp':
                    target_batch_right = to_var(
                        torch.from_numpy(target_batch[:, 0]))
                    target_batch_left = to_var(
                        torch.from_numpy(target_batch[:, 1]))

                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch_right[:, :3])
                    ori_loss_right = vae.rotation_loss(
                        output_r[:, 3:], target_batch_right[:, 3:])

                    pos_loss_left = vae.mse(output_l[:, :3],
                                            target_batch_left[:, :3])
                    ori_loss_left = vae.rotation_loss(output_l[:, 3:],
                                                      target_batch_left[:, 3:])

                    pos_loss = pos_loss_left + pos_loss_right
                    ori_loss = ori_loss_left + ori_loss_right
                elif args.skill_type == 'pull':
                    target_batch = to_var(
                        torch.from_numpy(target_batch.squeeze()))

                    #TODO add flags for when we're training both arms
                    # output = recon_mu[0]  # right arm is index [0]
                    # output = recon_mu[1]  # left arm is index [1]

                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch[:, :3])
                    ori_loss_right = vae.rotation_loss(output_r[:, 3:],
                                                       target_batch[:, 3:])

                    pos_loss = pos_loss_right
                    ori_loss = ori_loss_right

            elif args.task == 'goal':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                if args.goal_rep == 'pose':
                    pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                    ori_loss = vae.rotation_loss(output[:, 3:],
                                                 target_batch[:, 3:])
                elif args.goal_rep == 'keypoints':
                    pos_loss = vae.mse(output, target_batch)
                    ori_loss = torch.zeros(pos_loss.shape)

            elif args.task == 'transformation':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                ori_loss = vae.rotation_loss(output[:, 3:], target_batch[:,
                                                                         3:])

            recon_loss = pos_loss + ori_loss

            loss = kl_coeff * kl_loss + recon_loss
            loss.backward()
            vae.optimizer.step()

            epoch_total_loss = epoch_total_loss + loss.data
            epoch_kl_loss = epoch_kl_loss + kl_loss.data
            epoch_pos_loss = epoch_pos_loss + pos_loss.data
            epoch_ori_loss = epoch_ori_loss + ori_loss.data
            epoch_recon_loss = epoch_recon_loss + recon_loss.data

            writer.add_scalar('loss/train/ori_loss', ori_loss.data, i)
            writer.add_scalar('loss/train/pos_loss', pos_loss.data, i)
            writer.add_scalar('loss/train/kl_loss', kl_loss.data, i)

            if (i / batch_size) % args.batch_freq == 0:
                if args.skill_type == 'pull' or args.task == 'goal' or args.task == 'transformation':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tPos: %f\t Ori: %f'
                        % (epoch, i, dataset_size,
                           100.0 * i / dataset_size / batch_size, loss.item(),
                           kl_loss.item(), pos_loss.item(), ori_loss.item()))
                elif args.skill_type == 'grasp' and args.task == 'contact':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tR Pos: %f\t R Ori: %f\tL Pos: %f\tL Ori: %f'
                        % (epoch, i, dataset_size, 100.0 * i / dataset_size /
                           batch_size, loss.item(), kl_loss.item(),
                           pos_loss_right.item(), ori_loss_right.item(),
                           pos_loss_left.item(), ori_loss_left.item()))
        print(' --average loss: ')
        print(epoch_total_loss / (dataset_size / batch_size))
        loss_dict = {
            'epoch_total': epoch_total_loss / (dataset_size / batch_size),
            'epoch_kl': epoch_kl_loss / (dataset_size / batch_size),
            'epoch_pos': epoch_pos_loss / (dataset_size / batch_size),
            'epoch_ori': epoch_ori_loss / (dataset_size / batch_size),
            'epoch_recon': epoch_recon_loss / (dataset_size / batch_size)
        }
        total_loss.append(loss_dict)

        if epoch % args.save_freq == 0:
            print('\n--Saving model\n')
            print('time: ' + str(time.time() - start_time))

            save_state(net=vae,
                       torch_seed=torch_seed,
                       np_seed=np_seed,
                       args=args,
                       fname=os.path.join(
                           trained_model_path,
                           args.model_name + '_epoch_' + str(epoch) + '.pt'))

            np.savez(os.path.join(
                trained_model_path,
                args.model_name + '_epoch_' + str(epoch) + '_loss.npz'),
                     loss=np.asarray(total_loss))

    print('Done!')
    save_state(net=vae,
               torch_seed=torch_seed,
               np_seed=np_seed,
               args=args,
               fname=os.path.join(
                   trained_model_path,
                   args.model_name + '_epoch_' + str(epoch) + '.pt'))
Code example #27
 def relation_vectors(self, ids):
     var = util.to_var(ids, volatile=True)
     vector = self.rels(var).data.cpu().numpy()
     return vector
Code example #28
def train():

    train_file_idxs = np.arange(0, len(TRAIN_FILES))
    test_file_idxs = np.arange(0, len(TEST_FILES))
    np.random.shuffle(train_file_idxs)
    model = get_model()
    print(model)
    dtype = torch.FloatTensor
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
    losses = []
    # writer = SummaryWriter()
    global_step =0
    # pdb.set_trace()
    # for i in range(len(TEST_FILES)):
    #     test_data, gt = util.loadDataFile(TEST_FILES[test_file_idxs[i]])
    #     print(test_data.shape)
    #     print(gt[0])
    if args.continue_latest==1:
        model = load_model(args.checkpoint_dir,'latest.pth',model)
        optimizer = load_optimizer(args.checkpoint_dir,'latest.pth',optimizer)
    for epoch in range(num_epochs):
        running_loss = 0
        num_total_btches=0
        total_correct =0
        total_training_samples=0
        for i in range(len(TRAIN_FILES)):
            train_data, current_labels = util.loadDataFile(TRAIN_FILES[train_file_idxs[i]])
            train_data = util.to_var(torch.from_numpy(train_data))
            current_labels = util.to_var(torch.from_numpy(current_labels))
            num_batches = train_data.shape[0] // args.batch_size
            print('Training file: {:5d} |num of batches: {:5d}'.format(i ,num_batches))
            for btch in range(num_batches):
                optimizer.zero_grad()
                start_idx = btch*args.batch_size
                end_idx = (btch+1)*args.batch_size
                current_train = train_data[start_idx:end_idx, :, :]
                btch_label = current_labels[start_idx:end_idx,:].type(torch.long)
                # pdb.set_trace()
                logits = model(current_train)
                # pdb.set_trace()
                loss = criterion(logits, btch_label.view(-1))
                loss.backward()
                optimizer.step()
                preds = F.log_softmax(logits, 1)
                pred_choice = preds.data.max(1)[1]
                correct = pred_choice.eq(btch_label.view(-1).data).cpu().sum()
                total_correct+=correct.item()
                running_loss+= loss.item()*args.batch_size  
                losses.append(loss.item())
                total_training_samples+=btch_label.shape[0]
                # writer.add_scalar('loss',loss.item(), global_step)
                # writer.add_graph(model,current_train)
                # pdb.set_trace()
                if btch % args.print_every==0:
                    print('Epoch [{:5d}/{:5d}] | loss: {:6.4f} | accuracy:{:6.4f}'.format(epoch+1, num_epochs, loss.item(),
                                                                    correct.item()/float(args.batch_size)))
                global_step+=1
                num_total_btches+=1
        print(num_total_btches*args.batch_size, total_training_samples)      
        print("Epoch {} : Total training loss {:6.4f} and accuracy {:6.4f}".format(epoch, running_loss/total_training_samples, total_correct/total_training_samples))
        log_value('training_loss',running_loss/total_training_samples,epoch)
        log_value('accuracy',total_correct/total_training_samples,epoch)
        if (epoch % evaluation_epoch==0 and epoch!=0):
            model.eval()
            pred_score = 0
            test_loss = 0
            total_test_samples=0
            num_test_batches=0
            with torch.no_grad():
                for i in range(len(TEST_FILES)):
                    test_data, gt = util.loadDataFile(TEST_FILES[test_file_idxs[i]])
                    test_data = util.to_var(torch.from_numpy(test_data))
                    gt = util.to_var(torch.from_numpy(gt)).type(torch.long)
                    num_batches = test_data.shape[0] // args.batch_size
                    for btch in range(num_batches):
                        start_indx = btch*args.batch_size
                        end_indx = (btch+1)*args.batch_size
                        current_test = test_data[start_indx:end_indx, :, :]
                        logits = model(current_test)
                        gt_btch = gt[start_indx:end_indx,:]
                        loss = criterion(logits, gt_btch.view(-1))
                        test_loss+=loss.item()*args.batch_size
                        preds = F.log_softmax(logits, 1)
                        predictions = preds.data.max(1)[1]
                        actuals = predictions.eq(gt_btch.view(-1).data).cpu().sum()
                        pred_score+=actuals.item()
                        num_test_batches+=1
                        total_test_samples+=gt_btch.shape[0]
            # pdb.set_trace()
            # print(num_test_batches*args.batch_size, total_test_samples)
            print('Evaluation loss {:6.4f} | Accuracy {:6.4f}'.format(test_loss/total_test_samples,pred_score/total_test_samples))
            log_value('evaluation_accuracy',pred_score/total_test_samples, epoch)      
            model.train()   
        
    # writer.close()
    model.eval()
    pred_score = 0
    test_loss = 0
    total_test_samples=0
    num_test_batches=0
    with torch.no_grad():
        for i in range(len(TEST_FILES)):
            test_data, gt = util.loadDataFile(TEST_FILES[test_file_idxs[i]])
            test_data = util.to_var(torch.from_numpy(test_data))
            gt = util.to_var(torch.from_numpy(gt)).type(torch.long)
            num_batches = test_data.shape[0] // args.batch_size
            for btch in range(num_batches):
                start_indx = btch*args.batch_size
                end_indx = (btch+1)*args.batch_size
                current_test = test_data[start_indx:end_indx, :, :]
                logits = model(current_test)
                gt_btch = gt[start_indx:end_indx,:]
                loss = criterion(logits, gt_btch.view(-1))
                test_loss+=loss.item()*args.batch_size
                preds = F.log_softmax(logits, 1)
                predictions = preds.data.max(1)[1]
                actuals = predictions.eq(gt_btch.view(-1).data).cpu().sum()
                pred_score+=actuals.item()
                num_test_batches+=1
                total_test_samples+=gt_btch.shape[0]
    print('Final test loss {:6.4f} | accuracy {:6.4f}'.format(test_loss/total_test_samples,pred_score/total_test_samples))
    save_model(args.checkpoint_dir,'latest.pth',model, num_epochs) 
    save_optimizer(args.checkpoint_dir,'latest.pth',optimizer,num_epochs)
Code example #29
File: solver.py Project: zoey-wong/conv-emotion
    def train(self):
        min_val_loss = np.inf
        patience_counter = 0
        best_epoch = -1

        for epoch_i in range(self.epoch_i, self.config.n_epoch):
            self.epoch_i = epoch_i

            batch_loss_history = []
            predictions, ground_truth = [], []
            self.model.train()
            n_total_words = 0
            before_gradient = None

            for batch_i, (conversations, labels, conversation_length,
                          sentence_length, type_ids, masks) in enumerate(
                              tqdm(self.train_data_loader, ncols=80)):
                # conversations: (batch_size) list of conversations
                #   conversation: list of sentences
                #   sentence: list of tokens
                # conversation_length: list of int
                # sentence_length: (batch_size) list of conversation list of sentence_lengths

                input_conversations = conversations

                # flatten input and target conversations
                input_sentences = [
                    sent for conv in input_conversations for sent in conv
                ]
                input_labels = [label for utt in labels for label in utt]
                input_sentence_length = [
                    l for len_list in sentence_length for l in len_list
                ]
                input_conversation_length = [l for l in conversation_length]
                input_masks = [mask for conv in masks for mask in conv]
                orig_input_labels = input_labels

                # transferring the input to cuda
                input_sentences = to_var(torch.LongTensor(input_sentences))
                input_labels = to_var(torch.LongTensor(input_labels))
                input_sentence_length = to_var(
                    torch.LongTensor(input_sentence_length))
                input_conversation_length = to_var(
                    torch.LongTensor(input_conversation_length))
                input_masks = to_var(torch.LongTensor(input_masks))

                # reset gradient
                self.optimizer.zero_grad()

                sentence_logits = self.model(input_sentences,
                                             input_sentence_length,
                                             input_conversation_length,
                                             input_masks)

                present_predictions = list(
                    np.argmax(sentence_logits.detach().cpu().numpy(), axis=1))

                loss_function = nn.CrossEntropyLoss()
                batch_loss = loss_function(sentence_logits, input_labels)

                predictions += present_predictions
                ground_truth += orig_input_labels

                assert not isnan(batch_loss.item())
                batch_loss_history.append(batch_loss.item())

                if batch_i % self.config.print_every == 0:
                    tqdm.write(
                        f'Epoch: {epoch_i+1}, iter {batch_i}: loss = {batch_loss.item()}'
                    )

                # Back-propagation
                batch_loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.config.clip)

                # Run optimizer
                self.optimizer.step()

            epoch_loss = np.mean(batch_loss_history)
            self.epoch_loss = epoch_loss

            print_str = f'Epoch {epoch_i+1} loss average: {epoch_loss:.3f}'
            print(print_str)

            self.w_train_f1 = self.print_metric(ground_truth, predictions,
                                                "train")

            self.validation_loss, self.w_valid_f1, valid_predictions = self.evaluate(
                self.valid_data_loader, mode="valid")
            self.test_loss, self.w_test_f1, test_predictions = self.evaluate(
                self.test_data_loader, mode="test")

            print(self.epoch_loss, self.w_train_f1, self.w_valid_f1,
                  self.w_test_f1)

            IMPROVED = False
            if self.validation_loss < min_val_loss:
                IMPROVED = True
                min_val_loss = self.validation_loss
                best_test_loss = self.test_loss
                best_test_f1_w = self.w_test_f1
                best_epoch = (self.epoch_i + 1)

            if (not IMPROVED):
                patience_counter += 1
            else:
                patience_counter = 0
            print(f'Patience counter: {patience_counter}')
            if (patience_counter > self.config.patience):
                break

        return best_test_loss, best_test_f1_w, best_epoch
Code example #30
def main(args):
    langs = args.langs
    embedding_path = args.mono_embedding_path
    bilingual_dict_path = args.bilingual_dict_path
    prefix = args.mono_emb_prefix
    char_prefix = args.mono_char_prefix
    model_path = args.model_path
    output_file = args.common_emb_eval
    output_file_best = args.common_emb_best
    linguistic_vec_path = args.linguistic_vec_path
    mono_dict_path = args.mono_dict_path

    # initialize model parameters
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    save_step = args.save_step
    log_step = save_step
    emb_size = args.word_embedding_size
    common_size = args.common_embedding_size
    kernel_num = args.kernel_num
    patience = args.patience
    max_word_length = args.max_word_length
    char_vec_size = emb_size
    filter_withs = args.filter_widths
    num_workers = args.num_workers
    top_k = args.top_k
    lg = args.lg

    # using dev sets in multilingual eval repro to select the best parameters
    eval_data_path = args.eval_data_path
    trans_path = os.path.join(eval_data_path,
                              "word_translation/wiktionary.da+en+it.dev")
    word_sim_path = os.path.join(eval_data_path, "wordsim/en+it-mws353-dev")
    mono_sim_path = os.path.join(eval_data_path, "wordsim/EN-MEN-TR-3k")
    mono_qvec_path = os.path.join(eval_data_path, "qvec/dev-en")
    mono_qvec_cca_path = os.path.join(eval_data_path, "qvec/dev-en")
    multi_qvec_cca_path = os.path.join(eval_data_path, "qvec/dev-en-da-it")

    # create model directory
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    lang_matrixs = load_linguistic_vector(langs, linguistic_vec_path)

    context_langs = {}
    for lang in langs:
        context_langs[lang] = load_top_k(
            os.path.join(mono_dict_path, lang + ".top50.dict"))

    # Load vocabulary wrapper.
    vocab_langs = {}
    vectors_langs = {}
    embedding_langs = {}
    char_vocab_langs = {}
    char_vectors_langs = {}
    char_embedding_langs = {}
    word2char_langs = {}
    for lang in langs:
        vocab, vectors, char_vocab, char_vectors, word_2_char = \
            build_vocab(os.path.join(embedding_path,lang+prefix), os.path.join(embedding_path, lang+char_prefix),
                        emb_size, max_word_length, char_vec_size, head=True)
        embedding = nn.Embedding(len(vectors), len(vectors[0]))
        char_embedding = nn.Embedding(len(char_vectors), len(char_vectors[0]))
        vectors_langs[lang] = vectors
        vectors = convert2tensor(vectors)
        char_vectors = convert2tensor(char_vectors)
        embedding.weight = nn.Parameter(vectors)
        embedding.weight.requires_grad = False
        char_embedding.weight = nn.Parameter(char_vectors)
        if torch.cuda.is_available():
            embedding.cuda()
            char_embedding.cuda()
        vocab_langs[lang] = vocab
        embedding_langs[lang] = embedding
        char_vocab_langs[lang] = char_vocab
        char_vectors_langs[lang] = char_vectors
        char_embedding_langs[lang] = char_embedding
        word2char_langs[lang] = word_2_char

    # Build the models
    projectors = {}
    for lang in langs:
        projector = ProjLanguage(emb_size, common_size, kernel_num,
                                 char_vec_size, filter_withs)
        if torch.cuda.is_available():
            projector.cuda()
        projectors[lang] = projector

    # Loss and Optimizer
    criterion = nn.CosineEmbeddingLoss(margin=0)
    params = []
    for lang in langs:
        params += list(projectors[lang].parameters())
    optimizer = torch.optim.Adadelta(params, lr=learning_rate)

    start = time.time()

    best_score = 0
    best_sim_score = 0
    best_model_dict = {}

    # Build data loader
    print("start to load data ... ")
    data_loader_set = get_loader_bilingual_context_char(
        bilingual_dict_path,
        langs,
        vocab_langs,
        batch_size,
        word2char_langs,
        shuffle=True,
        num_workers=num_workers,
        top_k=top_k)
    print("finish loading data. \nstart to train models ")
    total_step = 0
    for new_data_loader in data_loader_set:
        (lang1, lang2, data_loader) = new_data_loader
        total_step = len(data_loader)
    print("total step ", total_step)

    data_loader_mono_set = {}
    for lang in langs:
        vocab_lang = vocab_langs[lang]
        context_lang = context_langs[lang]
        word2char_lang = word2char_langs[lang]
        data_loader = get_loader_mono_context_char(os.path.join(
            embedding_path, lang + prefix),
                                                   vocab_lang,
                                                   context_lang,
                                                   word2char_lang,
                                                   head=True,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=num_workers,
                                                   top_k=top_k)
        data_loader_mono_set[lang] = data_loader

    i0 = 0
    current_patience = 0
    for epoch in range(num_epochs):
        if learning_rate < 0.01:
            break
        epoch_start = time.time()

        (lang01, lang02, data_loader_0) = data_loader_set[0]
        (matrix01_orig, matrix02_orig) = lang_matrixs[lang01 + "#" + lang02]

        for ids01, ids02, ids01_context, ids02_context, char_ids01, char_ids02 in data_loader_0:

            # Set mini-batch dataset
            ids01 = torch.FloatTensor(ids01).long()
            ids02 = torch.FloatTensor(ids02).long()
            ids01_context = torch.FloatTensor(ids01_context).long()
            ids02_context = torch.FloatTensor(ids02_context).long()
            char_ids01 = torch.FloatTensor(char_ids01).long()
            char_ids02 = torch.FloatTensor(char_ids02).long()

            if len(matrix01_orig) > batch_size:
                gap = len(matrix01_orig) - batch_size
                rand = random.randint(0, gap)
                matrix01 = matrix01_orig[rand:rand + batch_size][:]
                matrix02 = matrix02_orig[rand:rand + batch_size][:]
            else:
                matrix01 = matrix01_orig[:][:]
                matrix02 = matrix02_orig[:][:]

            matrix01 = torch.from_numpy(matrix01)
            matrix01 = matrix01.float()
            matrix02 = torch.from_numpy(matrix02)
            matrix02 = matrix02.float()

            if torch.cuda.is_available():
                ids01 = to_var(ids01)
                ids02 = to_var(ids02)
                ids01_context = to_var(ids01_context)
                ids02_context = to_var(ids02_context)
                char_ids01 = to_var(char_ids01)
                char_ids02 = to_var(char_ids02)
                matrix01 = to_var(matrix01)
                matrix02 = to_var(matrix02)

            for langTmp in langs:
                projectors[langTmp].zero_grad()

            input01 = embedding_langs[lang01](ids01)
            input02 = embedding_langs[lang02](ids02)
            input01_context = embedding_langs[lang01](ids01_context)
            input02_context = embedding_langs[lang02](ids02_context)
            input01_context = torch.mean(input01_context, 1)
            input02_context = torch.mean(input02_context, 1)

            char_ids01_tmp = char_ids01.view(
                char_ids01.size(0) * char_ids01.size(1))
            char_ids02_tmp = char_ids02.view(
                char_ids02.size(0) * char_ids02.size(1))
            input_char01 = char_embedding_langs[lang01](char_ids01_tmp)
            input_char02 = char_embedding_langs[lang02](char_ids02_tmp)
            input_char01 = input_char01.view(char_ids01.size(0),
                                             char_ids01.size(1), -1)
            input_char02 = input_char02.view(char_ids02.size(0),
                                             char_ids02.size(1), -1)

            # Forward, Backward and Optimize
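            # projectors[lang01] is deliberately run twice: first without a
            # cross feature to obtain features01, then again with features02
            # so that both directions yield cross-reconstructions.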
            features01, output_char01, decoded_input01, decoded_input01_context = \
                projectors[lang01].forward(input01, input01_context, input_char01)
            features02, output_char02, decoded_input02, decoded_input02_context, \
                cross_decoded_input02, cross_decoded_input02_context = \
                projectors[lang02].forward(input02, input02_context, input_char02, features01)
            features01, output_char01, decoded_input01, decoded_input01_context, \
                cross_decoded_input01, cross_decoded_input01_context = \
                projectors[lang01].forward(input01, input01_context, input_char01, features02)

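            # Same scheme for the linguistic feature matrices: encode, decode,
            # and cross-decode from the other language's encoding.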
            linguistic_encoded_01, linguistic_decoded_01 = projectors[
                lang01].forward(matrix01)
            linguistic_encoded_02, linguistic_decoded_02, cross_linguistic_decoded_02 = \
                projectors[lang02].forward(matrix02, cross_encoded=linguistic_encoded_01)
            linguistic_encoded_01, linguistic_decoded_01, cross_linguistic_decoded_01 = \
                projectors[lang01].forward(matrix01, cross_encoded=linguistic_encoded_02)

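            # Targets of +1 mark pairs that should be similar (as in, e.g.,
            # CosineEmbeddingLoss; the actual criterion is defined earlier).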
            linguistic_label0 = Variable(
                torch.ones(linguistic_encoded_01.size(0)))
            label00 = Variable(torch.ones(features01.size(0)))

            if torch.cuda.is_available():
                # The projector outputs already live on the GPU when the
                # models do; only the freshly created label Variables still
                # need to be moved.
                label00 = label00.cuda()
                linguistic_label0 = linguistic_label0.cuda()

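            # Total loss: cross-lingual feature alignment, per-language
            # reconstruction, cross-lingual reconstruction (for both words
            # and contexts), character-level alignment, and the linguistic
            # term weighted by lg.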
            loss = 0

            loss += criterion(features01, features02, label00)
            loss += criterion(input01, decoded_input01, label00)
            loss += criterion(input02, decoded_input02, label00)
            loss += criterion(input01, cross_decoded_input01, label00)
            loss += criterion(input02, cross_decoded_input02, label00)

            loss += criterion(input01_context, decoded_input01_context,
                              label00)
            loss += criterion(input01_context, cross_decoded_input01_context,
                              label00)
            loss += criterion(input02_context, decoded_input02_context,
                              label00)
            loss += criterion(input02_context, cross_decoded_input02_context,
                              label00)

            char_loss = 0
            char_loss += criterion(output_char01, output_char02, label00)

            linguistic_loss = 0
            linguistic_loss += criterion(linguistic_encoded_01,
                                         linguistic_encoded_02,
                                         linguistic_label0)

            loss = loss + char_loss + lg * linguistic_loss
            loss.backward()
            optimizer.step()

            # Print log info
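            # Every log_step steps (after the first epoch), dump the projected
            # embeddings of every language to output_file and score them on
            # the word-similarity, translation, qvec and cvec benchmarks.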
            if epoch > 0 and i0 % log_step == 0:
                # Mode "w" truncates an existing file, so no explicit
                # os.remove is needed.
                out = open(output_file, "w")
                for langTmp in langs:
                    data_loader = data_loader_mono_set[langTmp]
                    for ids, context_ids, char_ids in data_loader:
                        ids = torch.LongTensor(ids)
                        context_ids = torch.LongTensor(context_ids)
                        char_ids = torch.LongTensor(char_ids)
                        # As above, to_var is assumed to handle the GPU move
                        # itself, so it is called unconditionally.
                        ids = to_var(ids)
                        context_ids = to_var(context_ids)
                        char_ids = to_var(char_ids)

                        proj = projectors[langTmp]

                        input1 = embedding_langs[langTmp](ids)
                        input1_contexts = embedding_langs[langTmp](context_ids)

                        input1_contexts = torch.mean(input1_contexts, 1)

                        char_ids_tmp = char_ids.view(
                            char_ids.size(0) * char_ids.size(1))
                        input_char = char_embedding_langs[langTmp](
                            char_ids_tmp)
                        input_char = input_char.view(char_ids.size(0),
                                                     char_ids.size(1), -1)

                        features, output_char, decoded_input, decoded_input_context = \
                            proj.forward(input1, input1_contexts, input_char)
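                        # The dumped vector is the concatenation of the
                        # word-level and character-level features.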
                        features = torch.cat((features, output_char), 1)

                        vocab = vocab_langs[langTmp]
                        features = features.data.cpu().numpy()
                        ids = ids.data.cpu().numpy()
                        for j in range(0, len(ids)):
                            word = vocab.idx2word[ids[j]]
                            out.write(langTmp + ":" + word)
                            for m in range(len(features[j])):
                                out.write(" " + str(features[j][m]))
                            out.write("\n")
                out.close()

                mono_sim_score, mono_sim_coverage = evaluate_word_sim(
                    mono_sim_path, output_file)
                multi_sim_score, multi_sim_coverage = evaluate_word_sim(
                    word_sim_path, output_file)
                multi_trans_score, multi_trans_coverage = evaluate(
                    trans_path, output_file)
                mono_qvec_score, mono_qvec_coverage = evaluate_qvec(
                    mono_qvec_path, output_file)
                multi_qvec_score, multi_qvec_coverage = evaluate_qvec(
                    multi_qvec_cca_path, output_file)
                mono_cvec_score, mono_cvec_coverage = evaluate_cvec(
                    mono_qvec_cca_path, output_file)
                multi_cvec_score, multi_cvec_coverage = evaluate_cvec(
                    multi_qvec_cca_path, output_file)

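                # Aggregate model-selection score; note that multi_qvec_score
                # is printed below but not included in the sum.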
                score = mono_sim_score + multi_sim_score + multi_trans_score + \
                        mono_qvec_score + mono_cvec_score + multi_cvec_score

                print(
                    "mono_sim: %.4f, multi_sim: %.4f, multi_trans: %.4f, mono_qvec: %.4f, multi_qvec: %.4f, "
                    "mono_cvec: %.4f, multi_cvec: %.4f" %
                    (mono_sim_score, multi_sim_score, multi_trans_score,
                     mono_qvec_score, multi_qvec_score, mono_cvec_score,
                     multi_cvec_score))

                print("\n")

                if score > best_score:
                    shutil.copyfile(output_file, output_file_best)
                    current_patience = 0
                    best_score = score
                    for tmp in langs:
                        # state_dict() holds references to the live parameter
                        # tensors, so clone them; otherwise the "best"
                        # snapshot would silently track later updates.
                        best_model_dict[tmp] = {
                            k: v.clone()
                            for k, v in projectors[tmp].state_dict().items()
                        }
                else:
                    current_patience += 1

                if current_patience > patience:
                    # Halve the learning rate and push it into the optimizer;
                    # reassigning the local variable alone would not change
                    # the updates.
                    learning_rate = learning_rate * 0.5
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
                    current_patience = 0

                epoch_end = time.time()
                epoch_time = epoch_end - epoch_start
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Best Score: %.4f, Best WordSim: %.4f, Learning_Rate: '
                    '%.4f, CurrentPatience: %.4f, Perplexity: %5.4f, Time: %d'
                    % (epoch, num_epochs, i0, total_step, loss.data[0],
                       best_score, best_sim_score, learning_rate,
                       current_patience, np.exp(loss.data[0]), epoch_time))

                epoch_start = time.time()

            # Save intermediate checkpoints every save_step epochs. Note that
            # this check sits inside the batch loop, so a qualifying epoch
            # writes a checkpoint at every step (filenames include i0, so
            # nothing is overwritten).
            if (epoch + 1) % save_step == 0:
                for tmp in langs:
                    torch.save(
                        projectors[tmp].state_dict(),
                        os.path.join(
                            model_path,
                            tmp + '-encoder-%d-%d.pkl' % (epoch + 1, i0 + 1)))
            i0 += 1

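    # Persist the best projector weights captured during training.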
    end = time.time()
    all_time = end - start
    print('Overall training time %d' % all_time)
    for lang in langs:
        torch.save(best_model_dict[lang],
                   os.path.join(model_path, lang + '-best-encoder.pkl'))