Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-read_train_dir', required=True)
    parser.add_argument('-read_dev_dir', required=True)
    parser.add_argument('-read_test_dir', required=True)
    parser.add_argument('-read_vocab_file', required=True)
    parser.add_argument('-load_model_file', required=True)
    parser.add_argument('-save_model_dir', required=True)
    # the epoch of the initialized model is 0; after 1 epoch of training, epoch is 1
    # if continuing training, curr_epoch should be model.epoch + 1
    parser.add_argument('-epoch', type=int, default=50)
    parser.add_argument('-optim_start_lr', type=float, default=0.001)
    parser.add_argument('-optim_soft_coefficient', type=float, default=1000)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-use_gpu', action='store_true')
    parser.add_argument('-save_interval', type=int, default=10)
    opt = parser.parse_args()

    print('[PROCEDURE] prepare training.')

    checkpoint = torch.load(opt.load_model_file,
                            map_location=lambda storage, loc: storage)
    model = checkpoint['model']
    model_options = checkpoint['model_options']
    print('[INFO] loading model with parameters:\n\t{}'.format(model_options))

    print('[INFO] reading training data...')
    train_data = initialize_batch_loader(opt.read_train_dir + '/feats.scp',
                                         opt.read_train_dir + '/text',
                                         opt.read_vocab_file, opt.batch_size)

    print('[INFO] reading dev data...')
    dev_data = initialize_batch_loader(opt.read_dev_dir + '/feats.scp',
                                       opt.read_dev_dir + '/text',
                                       opt.read_vocab_file, opt.batch_size)

    print('[INFO] reading test data...')
    test_data = initialize_batch_loader(opt.read_test_dir + '/feats.scp',
                                        opt.read_test_dir + '/text',
                                        opt.read_vocab_file, opt.batch_size)
    print('[INFO] batch loader is initialized')

    vocab_size = len(torch.load(opt.read_vocab_file))
    crit = get_criterion(vocab_size)
    print('[INFO] using cross entropy loss.')

    optimizer = ScheduledOptim(optim.Adam(model.parameters(),
                                          betas=(0.9, 0.98),
                                          eps=1e-09),
                               start_lr=opt.optim_start_lr,
                               soft_coefficient=opt.optim_soft_coefficient)
    print('[INFO] using adam as optimizer.')

    print('[PROCEDURE] training start...')
    if opt.use_gpu:
        model = model.cuda()
        crit = crit.cuda()
    train(model, train_data, dev_data, test_data, crit, optimizer, opt,
          model_options)
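A minimal sketch (an assumption, not shown in the source) of how a checkpoint compatible with the loading code above could be written: the only requirement taken from the example is the pair of keys, 'model' and 'model_options', that it reads back with torch.load.

import torch

def save_checkpoint(model, model_options, path):
    # Store the full module plus its construction options so training can be
    # resumed by the loader shown above.
    checkpoint = {'model': model, 'model_options': model_options}
    torch.save(checkpoint, path)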
Example #2
def driver(_config, _run):
    
    if (_config["selectively_omitted_index"] != -1):
        ex.add_config({"omitted_feature_name":_config["selective_audio_visual_feature_omission"][_config["selectively_omitted_index"]]["name"]})
   
    output = open('config_file.pkl', 'wb')
    pickle.dump(_config, output)
    ex.add_artifact('config_file.pkl')
    output.close()
    
        
    set_random_seed()
    #print("inside driver")
    #X_train, y_train, X_valid, y_valid, X_test, y_test = load_saved_data()
    #print(X_train, y_train, X_valid, y_valid, X_test, y_test)
    train_data_loader, dev_data_loader, test_data_loader = set_up_data_loader()
    
    
    
    multimodal_context_config = _config["multimodal_context_configs"]
    
    model = Contextual_MFN(_config, my_logger).to(_config["device"])
    # For now, we will use the same scheduler for the entire model.
    # Later, if necessary, we may use the default optimizer of MFN.
    # TODO: may have to use separate schedulers for the transformer and MFN
    # We are using the optimizer and scheduler of MFN as a last resort.
    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, model.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        multimodal_context_config["d_model"], multimodal_context_config["n_warmup_steps"])
    
    #TODO: May have to change the criterion
    #criterion = nn.L1Loss()
    criterion = nn.BCEWithLogitsLoss()
    criterion = criterion.to(_config["device"])
    # optimizer =  optim.Adam(
    #         filter(lambda x: x.requires_grad, model.parameters()),lr = _config["config"]["lr"],
    #         betas=(0.9, 0.98), eps=1e-09)
    # #torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    # #optimizer = ReduceLROnPlateau(optimizer,mode='min',patience=100,factor=0.5,verbose=False)
    # scheduler = ReduceLROnPlateau(optimizer,mode='min',patience=100,factor=0.5,verbose=True)

    train(model, train_data_loader, dev_data_loader, optimizer, criterion)

    #test_accuracy =  test_score_from_model(model,test_data_loader,criterion)
    
    test_accuracy = test_score_from_file(test_data_loader, criterion)
    ex.log_scalar("test.accuracy", test_accuracy)
    results = dict()
    # I believe the hyperparameter search will try to minimize this value. Let's see how it plays out.
    results["optimization_target"] = 1 - test_accuracy
    
    stat_file = open("all_accuracies_for_stat.txt", "a")

    stat_file.write(str(_config["experiment_config_index"]) + "," + str(test_accuracy) + "\n")
    stat_file.close()
    return results
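A minimal sketch of reading back the config artifact that driver() pickles above; the file name follows the example, and Sacred's ex.add_artifact attaches the same file to the run record.

import pickle

with open('config_file.pkl', 'rb') as pkl_file:
    saved_config = pickle.load(pkl_file)  # the _config dict dumped by driver()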
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-read_feats_scp_file', required=True)
    parser.add_argument('-read_text_file', required=True)
    parser.add_argument('-read_vocab_file', required=True)
    parser.add_argument('-load_model_file', required=True)

    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-epoch', type=int, default=10)
    # the epoch of the initialized model is 0; after 1 epoch of training, epoch is 1
    # if continuing training, curr_epoch should be model.epoch + 1
    parser.add_argument('-curr_epoch', type=int, default=1)

    parser.add_argument('-batch_size', type=int, default=64)

    parser.add_argument('-save_model_perfix', required=True)
    parser.add_argument('-use_gpu', action='store_true')
    opt = parser.parse_args()


    print('--------------------[PROCEDURE]--------------------')
    print('[PROCEDURE] prepare training.')


    train_data = initialize_batch_loader(opt.read_feats_scp_file, opt.read_text_file, opt.read_vocab_file, opt.batch_size)
    eval_data = initialize_batch_loader(opt.read_feats_scp_file, opt.read_text_file, opt.read_vocab_file, opt.batch_size)
    print('[INFO] batch loader is initialized')


    checkpoint = torch.load(opt.load_model_file)
    model = checkpoint['model']
    model_options = checkpoint['model_options']
    print('[INFO] loading model with parameters: {}'.format(model_options))


    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)
    vocab_size = len(torch.load(opt.read_vocab_file))
    crit = get_criterion(vocab_size)
    print('[INFO] using cross entropy loss.')


    optimizer = ScheduledOptim(
        optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        model_options.d_model * 2, opt.n_warmup_steps)
    print('[INFO] using adam as optimizer.')

    print('--------------------[PROCEDURE]--------------------')
    print('[PROCEDURE] training start...')
    if opt.use_gpu:
        train(model.cuda(), train_data, eval_data, crit.cuda(), optimizer, opt, model_options)
    else:
        train(model, train_data, eval_data, crit, optimizer, opt, model_options)
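A minimal sketch, assuming a recent PyTorch version: the PAD-masking intent of get_criterion above can also be expressed with ignore_index and reduction='sum' instead of a zeroed class weight and the deprecated size_average flag.

import torch.nn as nn

def get_criterion_ignore_pad(pad_idx):
    # Padding positions contribute nothing to the summed loss, as in the
    # zero-weight version above.
    return nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='sum')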
Example #4
def skyline_iteration_provider(transformer):
    opt = model_config()
    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    def iteration(src_seq, src_pos, tgt_seq, tgt_pos, gold):
        optimizer.zero_grad()
        loss = transformer(src_seq, src_pos, tgt_seq, tgt_pos, gold)
        loss.backward()
        optimizer.step_and_update_lr()

    return iteration
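For reference, a sketch of the warmup schedule that ScheduledOptim wrappers of this kind typically apply on each step_and_update_lr() call. This assumes the standard "Attention Is All You Need" schedule; the exact class used in these examples may add an extra multiplier (such as the 2.0 passed in some of the other snippets).

def transformer_lr(step, d_model, n_warmup_steps, lr_mul=1.0):
    # lr = lr_mul * d_model^-0.5 * min(step^-0.5, step * n_warmup_steps^-1.5)
    step = max(step, 1)  # avoid dividing by zero on the very first step
    return lr_mul * d_model ** -0.5 * min(step ** -0.5,
                                          step * n_warmup_steps ** -1.5)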
Example #5
def prepare_for_training(_config):
    train_data_loader, dev_data_loader, test_data_loader = set_up_data_loader()

    if _config["model"] == "trans_mfn":
        model = Transformed_mfn(_config).to(_config["device"])
    else:
        model = Multimodal_Video_transformer(_config,
                                             my_logger).to(_config["device"])

    # For now, we will use the same scheduler for the entire model.
    # Later, if necessary, we may use the default optimizer of MFN.
    # TODO: may have to use separate schedulers for the transformer and MFN
    # We are using the optimizer and scheduler of MFN as a last resort.
    if (_config["optim"] == "transformer"):
        optimizer = ScheduledOptim(
            optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                       betas=(0.9, 0.98),
                       eps=1e-09),
            _config["sentence_transformer_configs"]["d_model"] * 5,
            _config["sentence_transformer_configs"]["n_warmup_steps"])
    elif (_config["optim"] == "paul"):
        print("initializing paul trans")
        optimizer_adam = optim.Adam(filter(lambda x: x.requires_grad,
                                           model.parameters()),
                                    lr=_config["lr"],
                                    betas=(0.9, 0.98),
                                    eps=1e-09)

        scheduler = ReduceLROnPlateau(optimizer_adam,
                                      mode='min',
                                      patience=100,
                                      factor=0.5,
                                      verbose=True)
        optimizer = Optimizer_Scheduler(optimizer_adam, scheduler)

    #We are multiplying by 3 as there are three different transformer units

    #TODO: May have to change the criterion
    #since the scores are in float format, we are using the L1Loss
    if (_config["loss_function"] == "ll1"):
        criterion = nn.L1Loss()
    else:
        criterion = nn.BCEWithLogitsLoss()
    criterion = criterion.to(_config["device"])

    return train_data_loader, dev_data_loader, test_data_loader, model, optimizer, criterion
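A minimal sketch (an assumption, not the project's actual class) of what the Optimizer_Scheduler wrapper used in the "paul" branch might look like: it exposes the same zero_grad / step_and_update_lr surface as ScheduledOptim so train() can treat both interchangeably. ReduceLROnPlateau would normally be stepped with a validation metric.

class OptimizerSchedulerSketch:
    def __init__(self, optimizer, scheduler):
        self.optimizer = optimizer
        self.scheduler = scheduler

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step_and_update_lr(self, metric=None):
        self.optimizer.step()
        if metric is not None:
            # ReduceLROnPlateau adjusts the learning rate from a monitored metric.
            self.scheduler.step(metric)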
Example #6
    def train(self, niter=1):
        optimizer = ScheduledOptim(
            optim.Adam(self.module.parameters(), betas=(0.9, 0.98), eps=1e-09),
            2.0, self.opt.d_model, self.opt.n_warmup_steps)
        for _ in range(niter):
            optimizer.zero_grad()
            pred = self.module(*self.example_inputs)

            loss, n_correct, n_word = cal_performance(
                pred, self.gold, self.opt.trg_pad_idx, smoothing=self.opt.label_smoothing)
            loss.backward()
            optimizer.step_and_update_lr()
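A sketch (an assumption about this codebase family) of what a cal_performance helper like the one consumed above computes: a padding-aware summed loss plus token-level accuracy counts, matching the (loss, n_correct, n_word) tuple that the benchmark loop unpacks.

import torch
import torch.nn.functional as F

def cal_performance_sketch(pred, gold, trg_pad_idx):
    # pred: (n_tokens, vocab_size) logits; gold: target token ids
    gold = gold.contiguous().view(-1)
    loss = F.cross_entropy(pred, gold, ignore_index=trg_pad_idx, reduction='sum')
    non_pad_mask = gold.ne(trg_pad_idx)
    n_correct = pred.argmax(dim=1).eq(gold).masked_select(non_pad_mask).sum().item()
    n_word = non_pad_mask.sum().item()
    return loss, n_correct, n_word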
Example #7
def main():
    ''' Main function '''

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    trn_data, val_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = trn_data.dataset.src_vocab_size
    opt.tgt_vocab_size = trn_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert trn_data.dataset.src_word2idx == trn_data.dataset.tgt_word2idx,\
            ('The src/tgt word2idx table are different but asked to share '
             'word embedding.')

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, trn_data, val_data, optimizer, device, opt)
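A hypothetical invocation of a main() of this shape; the script name and data path are placeholders, and the flags follow the argparse definitions shown in the later examples.

# python train.py -data ./data.pt -epoch 10 -batch_size 64 \
#     -proj_share_weight -label_smoothing -save_mode best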
Example #8
def prep_for_training(_config):
    encoder_config = _config["encoder"]
    model = Multi_Transformer(
        n_src_features=encoder_config["n_source_features"],
        len_max_seq=encoder_config["max_token_seq_len"],
        _config=_config,
        tgt_emb_prj_weight_sharing=encoder_config["proj_share_weight"],
        emb_src_tgt_weight_sharing=encoder_config["embs_share_weight"],
        d_k=encoder_config["d_k"],
        d_v=encoder_config["d_v"],
        d_model=encoder_config["d_model"],
        d_word_vec=encoder_config["d_word_vec"],
        d_inner=encoder_config["d_inner_hid"],
        n_layers=encoder_config["n_layers"],
        n_head=encoder_config["n_head"],
        dropout=encoder_config["dropout"]).to(_config["device"])

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), encoder_config["d_model"],
        encoder_config["n_warmup_steps"])

    if (_config["loss_function"] == "bce"):
        print("using bce loss")
        criterion = nn.BCEWithLogitsLoss()
    else:
        criterion = nn.L1Loss()
    criterion = criterion.to(_config["device"])

    # optimizer =  optim.Adam(
    #         filter(lambda x: x.requires_grad, transformer.parameters()),lr = _config["learning_rate"],
    #         betas=(0.9, 0.98), eps=1e-09)
    #torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    #optimizer = ReduceLROnPlateau(optimizer,mode='min',patience=100,factor=0.5,verbose=False)
    #scheduler = ReduceLROnPlateau(optimizer,mode='min',patience=100,factor=0.5,verbose=True)
    return model, optimizer, criterion
Example #9
    def __init__(self,
                 source_dataset,
                 batch_size,
                 epochs,
                 window_size,
                 device,
                 plot_file,
                 train_data,
                 test_data,
                 valid_data,
                 target_column,
                 target_min,
                 target_max,
                 d_inner,
                 n_layers,
                 n_head_,
                 d_k,
                 d_v,
                 n_warmup_steps,
                 criterion,
                 target_name,
                 d_model,
                 model_file=None,
                 load_data=False,
                 load_model=False):
        self.data_frame = self.read_dataset(source_dataset)
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = device
        self.target_column = target_column
        self.window = window_size
        self.plot_file = plot_file
        self.n_layers = n_layers
        self.n_head = n_head_
        self.d_inner = d_inner
        self.warmup_step = n_warmup_steps
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.target_name = target_name
        self.input_mask = torch.ones([self.batch_size, 1, self.window],
                                     dtype=torch.int,
                                     device=device)
        self.target_max = target_max
        self.target_min = target_min
        self.model_file = model_file
        self.prev_epoch = 0
        if load_data:
            self.train_df = pd.read_csv(train_data)
            self.test_df = pd.read_csv(test_data)
            self.valid_df = pd.read_csv(valid_data)
        else:
            self.train_df, self.valid_df, self.test_df = self.organize_dataset(
                train_data, test_data, valid_data)

        pad_col = [
            'col' + str(i) for i in range(self.train_df.shape[1], self.d_model)
        ]
        for col in pad_col:
            self.train_df[col] = 0
            self.test_df[col] = 0
            self.valid_df[col] = 0
        self.columns = self.train_df.shape[1]
        self.model = Encoder(n_position=200,
                             d_word_vec=self.columns,
                             d_model=self.columns,
                             d_inner=d_inner,
                             n_layers=n_layers,
                             n_head=n_head_,
                             d_k=d_k,
                             d_v=d_v,
                             dropout=0).to(device)

        if load_model:
            self.model = torch.load(self.model_file)['model']
            self.model.eval()
            self.model = self.model.to(device)
            self.prev_epoch = torch.load(self.model_file)['epoch']

        self.criterion = criterion
        self.optimizer = ScheduledOptim(
            optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09),
            2.0,
            self.columns,
            n_warmup_steps,
            n_step=self.prev_epoch * (math.floor(
                self.train_df.shape[0] / (self.window * self.batch_size))))
        self.loss_list = []
        self.lr_list = []
Example #10
class Dataset:
    def __init__(self,
                 source_dataset,
                 batch_size,
                 epochs,
                 window_size,
                 device,
                 plot_file,
                 train_data,
                 test_data,
                 valid_data,
                 target_column,
                 target_min,
                 target_max,
                 d_inner,
                 n_layers,
                 n_head_,
                 d_k,
                 d_v,
                 n_warmup_steps,
                 criterion,
                 target_name,
                 d_model,
                 model_file=None,
                 load_data=False,
                 load_model=False):
        self.data_frame = self.read_dataset(source_dataset)
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = device
        self.target_column = target_column
        self.window = window_size
        self.plot_file = plot_file
        self.n_layers = n_layers
        self.n_head = n_head_
        self.d_inner = d_inner
        self.warmup_step = n_warmup_steps
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.target_name = target_name
        self.input_mask = torch.ones([self.batch_size, 1, self.window],
                                     dtype=torch.int,
                                     device=device)
        self.target_max = target_max
        self.target_min = target_min
        self.model_file = model_file
        self.prev_epoch = 0
        if load_data:
            self.train_df = pd.read_csv(train_data)
            self.test_df = pd.read_csv(test_data)
            self.valid_df = pd.read_csv(valid_data)
        else:
            self.train_df, self.valid_df, self.test_df = self.organize_dataset(
                train_data, test_data, valid_data)

        pad_col = [
            'col' + str(i) for i in range(self.train_df.shape[1], self.d_model)
        ]
        for col in pad_col:
            self.train_df[col] = 0
            self.test_df[col] = 0
            self.valid_df[col] = 0
        self.columns = self.train_df.shape[1]
        self.model = Encoder(n_position=200,
                             d_word_vec=self.columns,
                             d_model=self.columns,
                             d_inner=d_inner,
                             n_layers=n_layers,
                             n_head=n_head_,
                             d_k=d_k,
                             d_v=d_v,
                             dropout=0).to(device)

        if load_model:
            self.model = torch.load(self.model_file)['model']
            self.model.eval()
            self.model = self.model.to(device)
            self.prev_epoch = torch.load(self.model_file)['epoch']

        self.criterion = criterion
        self.optimizer = ScheduledOptim(
            optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09),
            2.0,
            self.columns,
            n_warmup_steps,
            n_step=self.prev_epoch * (math.floor(
                self.train_df.shape[0] / (self.window * self.batch_size))))
        self.loss_list = []
        self.lr_list = []

    def read_dataset(self, source_dataset):
        return pd.read_csv(source_dataset)

    def organize_dataset(self, train_data, test_data, valid_data):
        train_df = self.data_frame
        valid_df = self.data_frame
        test_df = self.data_frame
        return train_df, valid_df, test_df

    def train(self):
        train_tensor = torch.tensor(self.train_df.values,
                                    dtype=torch.float,
                                    device=self.device)
        train_rows = self.train_df.shape[0]
        section_size = self.window * self.batch_size
        avg_loss = 0
        for i in range(self.epochs):
            chosen_idx = np.random.choice(train_rows,
                                          replace=True,
                                          size=math.floor(train_rows / 10))
            imputing_df = self.train_df.copy()
            imputing_df.iloc[[j in chosen_idx for j in range(train_rows)],
                             self.target_column] = 0
            imputing_tensor = torch.tensor(imputing_df.values,
                                           dtype=torch.float,
                                           device=self.device)

            avg_loss = 0
            lr = 0

            for j in range(math.floor(train_rows / section_size)):
                batch_imputing_tensor = imputing_tensor[j *
                                                        section_size:(j + 1) *
                                                        section_size, :]
                batch_train_tensor = train_tensor[j * section_size:(j + 1) *
                                                  section_size, :]

                input_tensor = self.unsqueeze(batch_imputing_tensor)

                self.optimizer.zero_grad()

                imputed_tensor = self.squeeze(
                    self.model(input_tensor, self.input_mask)[0])

                imputing_idx = [
                    k in chosen_idx
                    for k in range(j * section_size, (j + 1) * section_size)
                ]
                imputing_idx_tensor = torch.tensor(imputing_idx)

                imputed_label_tensor = imputed_tensor[imputing_idx_tensor,
                                                      self.target_column]
                true_label_tensor = batch_train_tensor[imputing_idx_tensor,
                                                       self.target_column]

                loss = torch.sqrt(
                    self.criterion(imputed_label_tensor, true_label_tensor))
                # loss = self.criterion(imputed_label_tensor, true_label_tensor)

                if imputed_label_tensor.shape[0] > 0:

                    loss.backward()  #here compute engine
                    lr = self.optimizer.step_and_update_lr()

                    avg_loss = (j * avg_loss + loss) / (j + 1)

            self.loss_list.append(avg_loss *
                                  (self.target_max - self.target_min))
            self.lr_list.append(10000 * lr)

            self.save_model(i)

            print(avg_loss * (self.target_max - self.target_min))

        self.draw_plots(avg_loss * (self.target_max - self.target_min))

    def validate(self):
        valid_tensor = torch.tensor(self.valid_df.values,
                                    dtype=torch.float,
                                    device=self.device)
        valid_rows = self.valid_df.shape[0]
        section_size = self.window * self.batch_size

        chosen_idx = np.random.choice(valid_rows,
                                      replace=True,
                                      size=math.floor(valid_rows / 10))
        imputing_df = self.valid_df.copy()
        imputing_df.iloc[[j in chosen_idx for j in range(valid_rows)],
                         self.target_column] = 0
        imputing_tensor = torch.tensor(imputing_df.values,
                                       dtype=torch.float,
                                       device=self.device)
        avg_loss = 0

        imputed_list = []

        for j in range(math.floor(valid_rows / section_size)):
            batch_imputing_tensor = imputing_tensor[j * section_size:(j + 1) *
                                                    section_size, :]
            batch_valid_tensor = valid_tensor[j * section_size:(j + 1) *
                                              section_size, :]

            input_tensor = self.unsqueeze(batch_imputing_tensor)

            imputed_tensor = self.squeeze(
                self.model(input_tensor, self.input_mask)[0])

            imputing_idx = [
                k in chosen_idx
                for k in range(j * section_size, (j + 1) * section_size)
            ]
            imputing_idx_tensor = torch.tensor(imputing_idx)

            imputed_label_tensor = imputed_tensor[imputing_idx_tensor,
                                                  self.target_column]
            true_label_tensor = batch_valid_tensor[imputing_idx_tensor,
                                                   self.target_column]

            imputed_list = imputed_list + imputed_tensor[:, self.target_column].tolist()

            # loss = torch.sqrt(self.criterion(imputed_label_tensor, true_label_tensor))
            loss = self.criterion(imputed_label_tensor, true_label_tensor)

            if imputed_label_tensor.shape[0] > 0:
                avg_loss = (j * avg_loss + loss) / (j + 1)

        print(avg_loss * (self.target_max - self.target_min))

        valid_list = valid_tensor[:, self.target_column].tolist()
        imputed_list = [(imputed_list[i] * (i in chosen_idx) + valid_list[i] *
                         (i not in chosen_idx))
                        for i in range(len(imputed_list))]

        plt.plot(imputed_list, 'r', label="Imputed")
        plt.plot(valid_list, 'b', label="True")
        plt.legend(loc="upper right")
        plt.show()

    def unsqueeze(self, batch_tensor):
        temp_tensor = torch.zeros((self.batch_size, self.window, self.columns),
                                  dtype=torch.float,
                                  device=self.device)
        for i in range(self.batch_size):
            temp_tensor[i, :, :] = batch_tensor[i * self.window:(i + 1) *
                                                self.window, :]
        return temp_tensor

    def squeeze(self, predict_tensor):
        temp_tensor = torch.zeros(
            (self.batch_size * self.window, self.columns),
            dtype=torch.float,
            device=self.device)
        for i in range(self.batch_size):
            temp_tensor[i * self.window:(i + 1) *
                        self.window, :] = predict_tensor[i, :, :]
        return temp_tensor

    def draw_plots(self, avg_loss):
        plt.plot(self.loss_list, 'r', label="Loss")
        plt.plot(self.lr_list, 'b', label="10000 * Learning Rate")
        title = 'n_layers: ' + str(self.n_layers) + '\n' + 'n_heads: ' + str(
            self.n_head
        ) + '\n' + 'd_inner: ' + str(
            self.d_inner
        ) + '\n' + 'warmup_step: ' + str(
            self.warmup_step
        ) + '\n' + 'd_v: ' + str(self.d_v) + '\n' + 'd_k: ' + str(
            self.d_k
        ) + '\n' + 'd_model: ' + str(self.d_model) + '\n' + 'window: ' + str(
            self.window
        ) + '\n' + 'target_column: ' + self.target_name + '\n' + 'Loss_function: ' + str(
            self.criterion) + '\n' + 'avg_loss: ' + str(float(avg_loss.data))
        plt.legend(loc="upper right", title=title)
        timestr = time.strftime("%Y%m%d-%H%M%S")
        plt.savefig(self.plot_file + timestr, quality=90)

    def save_model(self, epoch):
        checkpoint = {
            'epoch': epoch,
            'lr_list': self.lr_list,
            'loss_list': self.loss_list,
            'model': self.model
        }
        if self.model_file:
            torch.save(checkpoint, self.model_file)
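Hypothetical usage of the Dataset class above; every path and hyperparameter is a placeholder. The criterion is passed in exactly as train() expects (train() wraps it in a square root, so an MSE criterion yields an RMSE training loss).

import torch
import torch.nn as nn

ds = Dataset(source_dataset='data.csv', batch_size=16, epochs=10, window_size=20,
             device=torch.device('cuda'), plot_file='loss_', train_data='train.csv',
             test_data='test.csv', valid_data='valid.csv', target_column=0,
             target_min=0.0, target_max=1.0, d_inner=256, n_layers=2, n_head_=4,
             d_k=32, d_v=32, n_warmup_steps=4000, criterion=nn.MSELoss(),
             target_name='target', d_model=64)
ds.train()     # imputation training loop driven by the scheduled optimizer
ds.validate()  # plots imputed vs. true values on the validation split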
Example #11
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    # parser.add_argument(
    #     '-data', default='/data/nfsdata/data/sunzijun/transformer/burry4/data.pt')
    parser.add_argument('-data', default='./mini_data.pt')
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=3)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='trained')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='all')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    debug = False
    if debug:
        opt.batch_size = 2
        opt.dropout = 0
        opt.epoch = 300
        opt.log = '/home/sunzijun/data/CRNN_trans'

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = my_prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # build the id2word (index-to-word) dictionary
    idx2word = {idx: word for word, idx in data['dict']['tgt'].items()}

    # ========= Preparing Model =========#
    print("************ prepare model ****************")
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)
    device_ids = [1, 3]
    device = torch.device('cuda', device_ids[0])

    transformer = CRNN_Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)

    transformer = transformer.cuda(device_ids[0])  # move the model onto the first CUDA device
    transformer = torch.nn.DataParallel(
        transformer, device_ids=device_ids)  # re-wrap with DataParallel

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt,
          idx2word)
Example #12
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=None)
    parser.add_argument('-step', type=int, default=None)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    # NOTE(keshav2): This just refers to the learning rate schedule,
    #                nothing performance related.
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('--checkpoint_dir',
                        type=str,
                        default='/lfs/1/keshav2/checkpoints/transformer')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='all')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    parser.add_argument('--dist-url',
                        default='env://',
                        type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend',
                        default='nccl',
                        type=str,
                        help='Distributed backend')
    parser.add_argument('--local_rank', default=0, type=int, help='Local rank')
    parser.add_argument('--rank', default=None, type=int, help='Rank')
    parser.add_argument('--world_size',
                        default=None,
                        type=int,
                        help='World size')
    parser.add_argument('--master_addr',
                        default=None,
                        type=str,
                        help='Master address to use for distributed run')
    parser.add_argument('--master_port',
                        default=None,
                        type=int,
                        help='Master port to use for distributed run')

    parser.add_argument('--throughput_estimation_interval',
                        type=int,
                        default=None,
                        help='Steps between logging steps completed')
    parser.add_argument('--max_duration',
                        type=int,
                        default=None,
                        help='Maximum duration in seconds')
    parser.add_argument('--enable_gavel_iterator',
                        action='store_true',
                        default=False,
                        help='If set, use Gavel iterator')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    torch.cuda.set_device(opt.local_rank)

    if opt.epoch is not None and opt.step is not None:
        raise ValueError('Only one of epoch and step may be set')
    elif opt.epoch is None and opt.step is None:
        raise ValueError('One of epoch and step must be set')

    opt.distributed = False
    if opt.master_addr is not None:
        opt.distributed = True
        os.environ['MASTER_ADDR'] = opt.master_addr
        os.environ['MASTER_PORT'] = str(opt.master_port)
        dist.init_process_group(backend=opt.dist_backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(
        data, opt, opt.master_addr is not None)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    if opt.distributed:
        transformer = DDP(transformer,
                          device_ids=[opt.local_rank],
                          output_device=opt.local_rank)

    if opt.enable_gavel_iterator:
        training_data = GavelIterator(training_data, opt.checkpoint_dir,
                                      load_checkpoint, save_checkpoint)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
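A hypothetical single-worker launch for the script above; the address, port, and paths are placeholders. Setting --master_addr is what enables the distributed (DDP) branch in main().

# python train.py -data ./data.pt -epoch 10 \
#     --master_addr 127.0.0.1 --master_port 29500 --rank 0 --world_size 1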
Example #13
def main():
    parser = argparse.ArgumentParser(description='main_train.py')
    # dir = "../data/jd/big"
    # dir = "../data/jd/middle"
    dir = "../data/jd/pure"
    parser.add_argument('-data_dir', default=dir)
    parser.add_argument('-epoch', type=int, default=30)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    # parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-ct_layers', type=int, default=1)  # ContextLayers
    parser.add_argument('-n_layers', type=int, default=3)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight',
                        action='store_true',
                        default=True)
    parser.add_argument('-proj_share_weight',
                        action='store_true',
                        default=True)
    parser.add_argument('-label_smoothing', action='store_true', default=True)
    parser.add_argument('-log', default="log")
    # parser.add_argument('-save_model', default="model")
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')
    parser.add_argument(
        '-device',
        action='store_true',
        default=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))
    args = parser.parse_args()
    args.model_name = str(args.ct_layers) + '_' + str(args.n_layers) + '_'
    if not os.path.exists(args.log):
        os.mkdir(args.log)

    print("加载词汇表")
    reader = torch.load(args.data_dir + "/reader.data")
    args.max_token_seq_len = reader['settings']["max_token_seq_len"]
    args.max_word_seq_len = reader['settings']["max_word_seq_len"]

    print("加载验证集数据")

    valid_src = read_file(path=args.data_dir + "/valid_src.txt")
    valid_tgt = read_file(path=args.data_dir + "/valid_tgt.txt")
    valid_ctx = read_file(path=args.data_dir + "/valid_attr.txt")
    valid_src, valid_ctx, valid_tgt = \
        digitalize(src=valid_src, tgt=valid_tgt, ctx=valid_ctx, max_sent_len=args.max_token_seq_len - 2,
                   word2idx=reader['dict']['src'], index2freq=reader["dict"]["frequency"], topk=3)
    # training_data, validation_data = prepare_dataloaders(reader, data, args)
    validation_data = torch.utils.data.DataLoader(SeqDataset(
        src_word2idx=reader['dict']['src'],
        tgt_word2idx=reader['dict']['tgt'],
        ctx_word2idx=reader['dict']['ctx'],
        src_insts=valid_src,
        ctx_insts=valid_ctx,
        tgt_insts=valid_tgt),
                                                  num_workers=4,
                                                  pin_memory=False,
                                                  batch_size=args.batch_size,
                                                  collate_fn=tri_collate_fn)
    del valid_src, valid_ctx, valid_tgt
    print("加载训练集数据")
    begin, end = 0, sys.maxsize
    # begin, end = 0, 100
    train_src = read_file(path=args.data_dir + "/train_src.txt",
                          begin=begin,
                          end=end)
    train_tgt = read_file(path=args.data_dir + "/train_tgt.txt",
                          begin=begin,
                          end=end)
    train_ctx = read_file(path=args.data_dir + "/train_attr.txt",
                          begin=begin,
                          end=end)
    train_src, train_ctx, train_tgt = \
        digitalize(src=train_src, tgt=train_tgt, ctx=train_ctx, max_sent_len=args.max_token_seq_len - 2,
                   word2idx=reader['dict']['src'], index2freq=reader["dict"]["frequency"], topk=0)

    training_data = torch.utils.data.DataLoader(SeqDataset(
        src_word2idx=reader['dict']['src'],
        tgt_word2idx=reader['dict']['tgt'],
        ctx_word2idx=reader['dict']['ctx'],
        src_insts=train_src,
        ctx_insts=train_ctx,
        tgt_insts=train_tgt),
                                                num_workers=4,
                                                pin_memory=False,
                                                batch_size=args.batch_size,
                                                collate_fn=tri_collate_fn,
                                                shuffle=True)
    del train_src, train_ctx, train_tgt
    args.src_vocab_size = training_data.dataset.src_vocab_size
    args.tgt_vocab_size = training_data.dataset.tgt_vocab_size
    args.ctx_vocab_size = training_data.dataset.ctx_vocab_size
    args.idx2word = {idx: word for word, idx in reader['dict']['src'].items()}

    print("---准备模型---")
    if args.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx tables are different but asked to share word embedding.'

    print(args)

    args.model_path = "log/" + args.model_name + ".model"
    if os.path.exists(args.model_path):
        checkpoint = torch.load(args.model_path, map_location=args.device)
        model_opt = checkpoint['settings']
        transformer = ContextTransformer(
            model_opt.ctx_vocab_size,
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            ct_layers=model_opt.ct_layers,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)
        if (args.ct_layers < 0):
            transformer = Transformer(
                args.src_vocab_size,
                args.tgt_vocab_size,
                args.max_token_seq_len,
                tgt_emb_prj_weight_sharing=args.proj_share_weight,
                emb_src_tgt_weight_sharing=args.embs_share_weight,
                d_k=args.d_k,
                d_v=args.d_v,
                d_model=args.d_model,
                d_word_vec=args.d_word_vec,
                d_inner=args.d_inner_hid,
                n_layers=args.n_layers,
                n_head=args.n_head,
                dropout=args.dropout).to(args.device)
        transformer.load_state_dict(checkpoint['model'])
        transformer = transformer.to(args.device)
        print('[Info] Model loaded; resuming training')
    else:
        transformer = ContextTransformer(
            args.ctx_vocab_size,
            args.src_vocab_size,
            args.tgt_vocab_size,
            args.max_token_seq_len,
            tgt_emb_prj_weight_sharing=args.proj_share_weight,
            emb_src_tgt_weight_sharing=args.embs_share_weight,
            d_k=args.d_k,
            d_v=args.d_v,
            d_model=args.d_model,
            d_word_vec=args.d_word_vec,
            d_inner=args.d_inner_hid,
            ct_layers=args.ct_layers,
            n_layers=args.n_layers,
            n_head=args.n_head,
            dropout=args.dropout).to(args.device)
        if (args.ct_layers < 0):
            transformer = Transformer(
                args.src_vocab_size,
                args.tgt_vocab_size,
                args.max_token_seq_len,
                tgt_emb_prj_weight_sharing=args.proj_share_weight,
                emb_src_tgt_weight_sharing=args.embs_share_weight,
                d_k=args.d_k,
                d_v=args.d_v,
                d_model=args.d_model,
                d_word_vec=args.d_word_vec,
                d_inner=args.d_inner_hid,
                n_layers=args.n_layers,
                n_head=args.n_head,
                dropout=args.dropout).to(args.device)

    optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad,
                                        transformer.parameters()),
                                 betas=(0.9, 0.98),
                                 eps=1e-09)
    args_optimizer = ScheduledOptim(optimizer, args.d_model,
                                    args.n_warmup_steps)
    printModel(transformer)

    train(transformer, training_data, validation_data, args_optimizer, args)
Example #14
def train(model, training_data, validation_data, device, opt, training_mode):
    ''' Start training '''

    log_train_file, log_valid_file = None, None

    try:
        os.makedirs(opt.save_folder)
    except FileExistsError:
        pass

    if opt.log:
        if training_mode == TRAIN_BASE:
            log_train_file = os.path.join(opt.save_folder,
                                          opt.log + '.train.base.log')
            log_valid_file = os.path.join(opt.save_folder,
                                          opt.log + '.valid.base.log')
        elif training_mode == TRAIN_ENCODER:
            log_train_file = os.path.join(
                opt.save_folder, opt.log + '.train.encoder.highway.log')
            log_valid_file = os.path.join(
                opt.save_folder, opt.log + '.valid.encoder.highway.log')
        elif training_mode == TRAIN_DECODER:
            log_train_file = os.path.join(
                opt.save_folder, opt.log + '.train.decoder.highway.log')
            log_valid_file = os.path.join(
                opt.save_folder, opt.log + '.valid.decoder.highway.log')

        print('[Info] Training performance will be written to file: {} and {}'.
              format(log_train_file, log_valid_file))

        with open(log_train_file, 'w') as log_tf, open(log_valid_file,
                                                       'w') as log_vf:
            log_tf.write('epoch,loss,ppl,accuracy\n')
            log_vf.write('epoch,loss,ppl,accuracy\n')

    def print_performances(header, loss, accu, start_time):
        print('  - {header:12} ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, '\
              'elapse: {elapse:3.3f} min'.format(
                  header=f"({header})", ppl=math.exp(min(loss, 100)),
                  accu=100*accu, elapse=(time.time()-start_time)/60))

    no_decay = ["bias", "LayerNorm.weight"]
    if training_mode == TRAIN_BASE:
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if ("encoder_highway" not in n) and (
                        "decoder_highway" not in n) and (not any(
                            nd in n for nd in no_decay))
                ],
                "weight_decay":
                opt.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if ("encoder_highway" not in n) and
                    ("decoder_highway" not in n) and (any(nd in n
                                                          for nd in no_decay))
                ],
                "weight_decay":
                0.0,
            },
        ]

    elif training_mode == TRAIN_ENCODER:
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if ("encoder_highway" in n) and (not any(
                        nd in n for nd in no_decay))
                ],
                "weight_decay":
                opt.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if ("encoder_highway" in n) and (any(nd in n
                                                         for nd in no_decay))
                ],
                "weight_decay":
                0.0,
            },
        ]

    elif training_mode == TRAIN_DECODER:
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if ("decoder_highway" in n) and (not any(
                        nd in n for nd in no_decay))
                ],
                "weight_decay":
                opt.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if ("decoder_highway" in n) and (any(nd in n
                                                         for nd in no_decay))
                ],
                "weight_decay":
                0.0,
            },
        ]

    optimizer = ScheduledOptim(
        optim.Adam(optimizer_grouped_parameters, betas=(0.9, 0.98), eps=1e-09),
        2.0, opt.d_model, opt.n_warmup_steps)

    if training_mode == TRAIN_BASE:
        training_epoch = opt.base_epoch
    elif training_mode == TRAIN_ENCODER:
        training_epoch = opt.highway_encoder_epoch
    elif training_mode == TRAIN_DECODER:
        training_epoch = opt.highway_decoder_epoch

    #valid_accus = []
    valid_losses = []
    for epoch_i in range(training_epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(model,
                                             training_data,
                                             optimizer,
                                             opt,
                                             device,
                                             smoothing=opt.label_smoothing,
                                             training_mode=training_mode)
        print_performances('Training', train_loss, train_accu, start)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device,
                                            opt, training_mode)
        print_performances('Validation', valid_loss, valid_accu, start)

        valid_losses += [valid_loss]

        checkpoint = {
            'epoch': epoch_i,
            'settings': opt,
            'model': model.state_dict()
        }

        if opt.save_model:
            if opt.save_mode == 'all':
                if training_mode == TRAIN_BASE:
                    model_name = os.path.join(
                        opt.save_folder,
                        opt.save_model + '_accu_{accu:3.3f}.chkpt'.format(
                            accu=100 * valid_accu))
                elif training_mode == TRAIN_ENCODER:
                    model_name = os.path.join(
                        opt.save_folder, opt.save_model +
                        '_loss_{loss:3.3f}_encoder_highway.chkpt'.format(
                            loss=valid_loss))
                elif training_mode == TRAIN_DECODER:
                    model_name = os.path.join(
                        opt.save_folder, opt.save_model +
                        '_accu_{accu:3.3f}_decoder_highway.chkpt'.format(
                            accu=100 * valid_accu))
                torch.save(checkpoint, model_name)

            elif opt.save_mode == 'best':
                if training_mode == TRAIN_BASE:
                    model_name = os.path.join(opt.save_folder,
                                              opt.save_model + '.chkpt')
                elif training_mode == TRAIN_ENCODER:
                    model_name = os.path.join(
                        opt.save_folder,
                        opt.save_model + '_encoder_highway.chkpt')
                elif training_mode == TRAIN_DECODER:
                    model_name = os.path.join(
                        opt.save_folder,
                        opt.save_model + '_decoder_highway.chkpt')

                if training_mode == TRAIN_BASE and valid_loss <= min(
                        valid_losses):
                    torch.save(checkpoint, model_name)
                elif training_mode == TRAIN_ENCODER or training_mode == TRAIN_DECODER:
                    torch.save(checkpoint, model_name)
                print('    - [Info] The checkpoint file has been updated.')

        if log_train_file and log_valid_file:
            with open(log_train_file,
                      'a') as log_tf, open(log_valid_file, 'a') as log_vf:
                log_tf.write(
                    '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                        epoch=epoch_i,
                        loss=train_loss,
                        ppl=math.exp(min(train_loss, 100)),
                        accu=100 * train_accu))
                log_vf.write(
                    '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                        epoch=epoch_i,
                        loss=valid_loss,
                        ppl=math.exp(min(valid_loss, 100)),
                        accu=100 * valid_accu))
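A minimal sketch (a refactoring suggestion, not part of the original) of the weight-decay grouping that the three training modes above repeat, factored into a single helper:

def grouped_parameters(model, weight_decay, include,
                       no_decay=("bias", "LayerNorm.weight")):
    # 'include' selects the parameters for the current training mode,
    # e.g. lambda n: "encoder_highway" in n for TRAIN_ENCODER.
    named = [(n, p) for n, p in model.named_parameters() if include(n)]
    return [
        {"params": [p for n, p in named
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in named
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]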
Example #15
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
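Nearly every example here wraps Adam in ScheduledOptim(optimizer, opt.d_model, opt.n_warmup_steps) without showing the wrapper itself. A rough sketch of the inverse-square-root warmup schedule that the reference Transformer training code uses for this wrapper (an assumption; the exact class, and the variants that take lr_mul or soft_coefficient, may differ):

class ScheduledOptimSketch:
    """lr = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5), applied before each step."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def zero_grad(self):
        self._optimizer.zero_grad()

    def step_and_update_lr(self):
        self.n_steps += 1
        lr = (self.d_model ** -0.5) * min(self.n_steps ** -0.5,
                                          self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self._optimizer.param_groups:
            group['lr'] = lr
        self._optimizer.step()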
Ejemplo n.º 16
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='default')
    parser.add_argument('-tensorboard', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    global global_counter
    global_counter = 0

    writer = None
    if opt.tensorboard:
        writer = SummaryWriter(os.path.join('./logs', opt.tensorboard))

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    global idx2char
    idx2char = {v: k for k, v in data['dict']['src'].items()}

    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, unique_char_len = prepare_dataloaders(
        data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    try:
        transformer.load_state_dict(torch.load('./checkpoints/model.pt'))
        print("Model loaded successfully.......")
    except (FileNotFoundError, RuntimeError):
        # No compatible checkpoint found; train from scratch.
        pass

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt,
          unique_char_len, writer)
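The try/except above expects a bare state_dict at './checkpoints/model.pt'. A minimal matching save sketch (the directory handling is an addition; the path mirrors the hard-coded one above):

import os
import torch

def save_plain_state_dict(model, path='./checkpoints/model.pt'):
    """Save only the weights so the load_state_dict() call above can restore them."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model.state_dict(), path)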
Ejemplo n.º 17
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', default='./data/preprocessedData')

    parser.add_argument('-epoch', type=int, default=50)
    parser.add_argument('-batch_size', type=int, default=64)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='log')  # None
    parser.add_argument('-save_model', default='trained')  # None
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true', default=True)  # default=True keeps label smoothing on even without the flag

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Loading Dataset
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # Preparing Model
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    # device = torch.device('cpu')

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout)

    discriminator = Discriminator(opt.d_model, 1024, opt.max_token_seq_len,
                                  device)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        transformer = nn.DataParallel(transformer)
    transformer.to(device)
    discriminator.to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)
    optimizer_d = optim.RMSprop(discriminator.parameters(), lr=5e-4)

    train(transformer, discriminator, training_data, validation_data,
          optimizer, optimizer_d, device, opt)
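When the transformer above is wrapped in nn.DataParallel, every key in its state_dict gains a 'module.' prefix, which breaks later single-device loading. A small, hedged helper for that case (the checkpoint layout in the usage comment is illustrative):

def strip_dataparallel_prefix(state_dict):
    """Drop the 'module.' prefix added by nn.DataParallel so an unwrapped model can load the weights."""
    return {k[len('module.'):] if k.startswith('module.') else k: v
            for k, v in state_dict.items()}

# Usage sketch:
# checkpoint = torch.load('trained.chkpt', map_location='cpu')
# transformer.load_state_dict(strip_dataparallel_prefix(checkpoint['model']))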
Ejemplo n.º 18
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=100)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='all')

    parser.add_argument('-no_cuda', action='store_true')

    parser.add_argument('-multi_gpu', action='store_true')

    parser.add_argument('-use_ctx', action='store_true')

    parser.add_argument(
        '-external_validation_script',
        type=str,
        default=None,
        metavar='PATH',
        nargs='*',
        help=
        "location of validation script (to run your favorite metric for validation) (default: %(default)s)"
    )

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        ctx_insts=(data['train']['ctx'] if opt.use_ctx else None),
        batch_size=opt.batch_size,
        cuda=opt.cuda,
        is_train=True,
        sort_by_length=True)

    validation_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'],
        tgt_insts=data['valid']['tgt'],
        ctx_insts=(data['valid']['ctx'] if opt.use_ctx else None),
        batch_size=opt.batch_size,
        shuffle=False,
        cuda=opt.cuda,
        is_train=False,
        sort_by_length=True)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print(
            '[Warning]',
            'The src/tgt word2idx table are different but asked to share word embedding.'
        )

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              proj_share_weight=opt.proj_share_weight,
                              embs_share_weight=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner_hid=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout,
                              use_ctx=opt.use_ctx)

    #print(transformer)

    # optimizer = ScheduledOptim(
    #     optim.Adam(
    #         transformer.get_trainable_parameters(),
    #         betas=(0.9, 0.98), eps=1e-09),
    #     opt.d_model, opt.n_warmup_steps)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        #return nn.CrossEntropyLoss(weight, size_average=False)
        return nn.NLLLoss(weight, reduction='sum')

    crit = get_criterion(training_data.tgt_vocab_size)
    logsoftmax = nn.LogSoftmax(dim=-1)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()
        logsoftmax = logsoftmax.cuda()

    if opt.multi_gpu:
        transformer = nn.DataParallel(transformer)
        crit = nn.DataParallel(crit)
        logsoftmax = nn.DataParallel(logsoftmax)

    train(transformer, training_data, validation_data, crit, logsoftmax,
          optimizer, opt)
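The criterion above pairs a PAD-zero-weight NLLLoss with a separate LogSoftmax. As a sketch, the same per-token loss can be computed from raw logits with CrossEntropyLoss and ignore_index, which is the more common modern spelling (an equivalent alternative, not the code used here):

import torch.nn as nn

PAD = 0  # assumption: Constants.PAD is index 0, as in the reference code

def get_padded_criterion(vocab_size):
    """Sum of cross-entropy over non-PAD targets; vocab_size kept only for signature parity."""
    return nn.CrossEntropyLoss(ignore_index=PAD, reduction='sum')

# For logits of shape (n_tokens, vocab) and gold of shape (n_tokens,), this equals
# NLLLoss(weight_with_zero_at_PAD, reduction='sum') applied to log_softmax(logits, dim=-1).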
Ejemplo n.º 19
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-mined_data', required=True)
    parser.add_argument('-snippet_model', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', type=bool, default=True)  # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument('-save_model_dir', default=None, required=True)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='all')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    # For bleu eval
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")

    parser.add_argument('-test_epoch',
                        type=int,
                        default=5,
                        help='Test every x epochs')
    parser.add_argument('-resume_from_epoch',
                        type=int,
                        default=0,
                        help='Warm restart')

    # Not really needed
    parser.add_argument('-alpha',
                        type=float,
                        default=1.0,
                        help='Weighting loss')
    parser.add_argument('-loss_weight',
                        type=float,
                        default=0.1,
                        help='Mined loss weight')
    parser.add_argument('-lr', type=float, default=1e-3, help='Learning rate')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Snippet model sentencepiece
    sp.Load(opt.snippet_model)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    mined_data = torch.load(opt.mined_data)

    opt.inp_seq_max_len = 4 * data['settings'].train_max_input_len
    opt.out_seq_max_len = 4 * data['settings'].train_max_output_len

    opt.max_token_seq_len = int(opt.out_seq_max_len / 4)

    training_data, validation_data, test_data, mined_data = prepare_dataloaders(
        data, mined_data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    print(opt.inp_seq_max_len, opt.out_seq_max_len, opt.src_vocab_size,
          opt.tgt_vocab_size)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.inp_seq_max_len,
                              opt.out_seq_max_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09,
                   lr=opt.lr), opt.d_model, opt.n_warmup_steps)

    save_params(opt)

    opt = check_restart_conditions(opt)
    if opt.resume_from_epoch >= 1:
        print('Loading Old model')
        print('Loading model files from folder: %s' % opt.save_model_dir)
        transformer = load_models(transformer, opt, opt.resume_from_epoch)

    train(transformer, training_data, validation_data, test_data, mined_data,
          optimizer, device, opt)
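sp.Load(opt.snippet_model) above loads a SentencePiece model whose later use is not shown in this excerpt. A minimal usage sketch with the standard sentencepiece Python API (model path and text are illustrative):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('snippet.model')  # illustrative path

ids = sp.EncodeAsIds('def add(a, b): return a + b')  # text -> subword ids
text = sp.DecodeIds(ids)                             # subword ids -> text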
Ejemplo n.º 20
0
def main():
    ''' 
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -embs_share_weight -proj_share_weight -label_smoothing -output_dir output -b 256 -warmup 128000
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field

    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup','--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-seed', type=int, default=None)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-scale_emb_or_prj', type=str, default='prj')

    parser.add_argument('-output_dir', type=str, default=None)
    parser.add_argument('-use_tb', action='store_true')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # https://pytorch.org/docs/stable/notes/randomness.html
    # For reproducibility
    if opt.seed is not None:
        torch.manual_seed(opt.seed)
        torch.backends.cudnn.benchmark = False
        # torch.set_deterministic(True)
        np.random.seed(opt.seed)
        random.seed(opt.seed)

    if not opt.output_dir:
        print('No experiment result will be saved.')
        raise ValueError('An -output_dir is required so results can be saved.')

    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end after only a little data has been seen.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise ValueError('Provide either -train_path/-val_path or -data_pkl.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.trg_vocab_size,
        src_pad_idx=opt.src_pad_idx,
        trg_pad_idx=opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout,
        scale_emb_or_prj=opt.scale_emb_or_prj).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.lr_mul, opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Ejemplo n.º 21
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-epoch', type=int, default=1)
    parser.add_argument('-batch_size', type=int, default=4)
    parser.add_argument('-context_width', type=int, default=1)
    parser.add_argument('-frame_rate', type=int, default=30)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=400)

    parser.add_argument('-dropout', type=float, default=0.1)

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='./exp')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    opt = parser.parse_args()

    cfg_path = './config/transformer.cfg'
    config = configparser.ConfigParser()
    config.read(cfg_path)

    #========= Preparing DataLoader =========#
    training_data = DataLoader('train',
                               config,
                               DEVICE,
                               batch_size=opt.batch_size,
                               context_width=opt.context_width,
                               frame_rate=opt.frame_rate)
    validation_data = DataLoader('dev',
                                 config,
                                 DEVICE,
                                 batch_size=opt.batch_size,
                                 context_width=opt.context_width,
                                 frame_rate=opt.frame_rate)
    test_data = DataLoader('test',
                           config,
                           DEVICE,
                           batch_size=opt.batch_size,
                           context_width=opt.context_width,
                           frame_rate=opt.frame_rate)

    #========= Preparing Model =========#

    print(opt)

    input_dim = training_data.features_dim
    output_dim = training_data.vocab_size
    n_inputs_max_seq = max(training_data.inputs_max_seq_lengths,
                           validation_data.inputs_max_seq_lengths,
                           test_data.inputs_max_seq_lengths)
    n_outputs_max_seq = max(training_data.outputs_max_seq_lengths,
                            validation_data.outputs_max_seq_lengths,
                            test_data.outputs_max_seq_lengths)
    print('*************************')
    print('The max length of inputs is %d' % n_inputs_max_seq)
    print('The max length of targets is %d' % n_outputs_max_seq)

    transformer = Transformer(input_dim,
                              output_dim,
                              n_inputs_max_seq,
                              n_outputs_max_seq,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_inner_hid=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout,
                              device=DEVICE)

    # print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    def get_criterion(output_dim):
        ''' With PAD token zero weight '''
        weight = torch.ones(output_dim)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, reduction='sum')

    crit = get_criterion(training_data.vocab_size)

    transformer = transformer.to(DEVICE)
    crit = crit.to(DEVICE)

    train(transformer, training_data, validation_data, crit, optimizer, opt)
Ejemplo n.º 22
0
def main():
    '''
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000
    '''
    global C
    global shapes
    global Beta
    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl',
                        default=None)  # all-in-1 data pickle or bpe field
    # Note: argparse's type=bool treats any non-empty string as True, so '-srn False' still enables it.
    parser.add_argument('-srn', type=bool, default=False)
    parser.add_argument('-optimize_c', type=bool, default=False)
    parser.add_argument('-Beta', type=float, default=1.0)
    parser.add_argument("-lr", type=float, default=1e-1)
    parser.add_argument("-scheduler_mode", type=str, default=None)
    parser.add_argument("-scheduler_factor", type=float, default=0.5)
    parser.add_argument('-train_path', default=None)  # bpe encoded data
    parser.add_argument('-val_path', default=None)  # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    Beta = opt.Beta

    if not opt.log and not opt.save_model:
        print('No experiment result will be saved.')
        raise ValueError('Specify -log and/or -save_model so results can be saved.')

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end after only a little data has been seen.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(
            opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise ValueError('Provide either -train_path/-val_path or -data_pkl.')

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.trg_vocab_size,
                              src_pad_idx=opt.src_pad_idx,
                              trg_pad_idx=opt.trg_pad_idx,
                              trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_trg_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)
    if opt.srn:
        transformer = migrate_to_srn(transformer)
        transformer = transformer.to(device)
    if opt.optimize_c:
        srn_modules = [
            module for module in transformer.modules()
            if isinstance(module, (SRNLinear, SRNConv2d))
        ]
        sranks = []
        shapes = []

        for module in srn_modules:
            W = module.weight.detach()
            shape_w = W.shape
            W = W.view(shape_w[0], -1)
            sranks.append(stable_rank(W).item())
            shapes.append(W.shape)

        # A rule of thumb: initialize each target stable rank from the module's current stable rank.
        C = [
            Parameter((torch.ones(1) * sranks[i] / min(shapes[i])).view(()))
            for i in range(len(srn_modules))
        ]
        for i, module in enumerate(srn_modules):
            C[i].data = C[i].data.to(device)  # move in place; a bare C[i].to(device) is discarded
            module.c = C[i]
        criteria = criteria_
    else:
        criteria = cal_performance
    optimizer = ScheduledOptim(optim.Adam(transformer.parameters(),
                                          lr=1e-2,
                                          betas=(0.9, 0.98),
                                          eps=1e-09),
                               opt.lr,
                               opt.d_model,
                               opt.n_warmup_steps,
                               mode=opt.scheduler_mode,
                               factor=opt.scheduler_factor,
                               patience=3)

    train(transformer,
          training_data,
          validation_data,
          optimizer,
          device,
          opt,
          loss=criteria)
    print("~~~~~~~~~~~~~C~~~~~~~~~~~~~")
    print(C)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("-----------Model-----------")
    print(transformer)
    print("---------------------------")
    with torch.no_grad():
        for pname, p in transformer.named_parameters():
            if len(p.shape) > 1:
                print("...Parameter ", pname, ", srank=",
                      stable_rank(p.view(p.shape[0], -1)).item())
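stable_rank above is not defined in this excerpt. A sketch of the usual definition (squared Frobenius norm over squared spectral norm), which matches how it is applied to the flattened 2-D weights here; this is an assumption about the helper, not its actual source:

import torch

def stable_rank(weight_2d):
    """Stable rank ||W||_F^2 / ||W||_2^2 of a 2-D matrix; always <= its rank."""
    singular_values = torch.linalg.svdvals(weight_2d)  # returned in descending order
    return (singular_values ** 2).sum() / (singular_values[0] ** 2)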
Ejemplo n.º 23
0
def main():
    ''' 
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl',
                        default=None)  # all-in-1 data pickle or bpe field

    parser.add_argument('-train_path', default=None)  # bpe encoded data
    parser.add_argument('-val_path', default=None)  # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', default=True, action='store_true')  # default=True forces CPU even when the flag is omitted
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    if not opt.log and not opt.save_model:
        print('No experiment result will be saved.')
        raise ValueError('Specify -log and/or -save_model so results can be saved.')

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end after only a little data has been seen.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(
            opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise ValueError('Provide either -train_path/-val_path or -data_pkl.')

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.trg_vocab_size,
                              src_pad_idx=opt.src_pad_idx,
                              trg_pad_idx=opt.trg_pad_idx,
                              trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_trg_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        2.0, opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
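Several of these scripts pass -label_smoothing into a cal_performance/cal_loss helper that is not shown. A hedged sketch of the usual label-smoothed, PAD-masked cross entropy (modeled on the reference training code; the exact helper used here may differ):

import torch
import torch.nn.functional as F

PAD = 0  # assumption: PAD index 0

def cal_loss_sketch(pred, gold, smoothing=0.1):
    """Label-smoothed cross entropy summed over non-PAD target tokens.

    pred: (n_tokens, vocab_size) logits; gold: (n_tokens,) target indices.
    """
    gold = gold.contiguous().view(-1)
    n_class = pred.size(1)

    # Smoothed target distribution: 1 - eps on the gold class, eps spread over the rest.
    smoothed = torch.full_like(pred, smoothing / (n_class - 1))
    smoothed.scatter_(1, gold.view(-1, 1), 1.0 - smoothing)

    log_prb = F.log_softmax(pred, dim=1)
    loss = -(smoothed * log_prb).sum(dim=1)
    return loss.masked_select(gold.ne(PAD)).sum()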
Ejemplo n.º 24
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    #---------------------- All arguments (and their defaults) are defined here ----------------------
    parser.add_argument('-data', required=False)

    parser.add_argument('-epoch', type=int, default=1)  # set to 1 for now, just to get the pipeline running
    parser.add_argument('-batch_size', type=int, default=32)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='/transformer_my')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing',
                        action='store_true')  # store_true: the flag becomes True when passed

    opt = parser.parse_args()

    opt.d_word_vec = opt.d_model
    '''
    Hard-code the parameters below for convenience.
    '''
    opt.saved_weight = '/trained.chkpt'  # path to the previously trained model
    opt.data = 'yunixng_bash/data/multi30k.atok.low.pt'  # dataset location
    opt.save_model = 'trained'  # name under which to save the model
    opt.save_mode = 'best'  # checkpoint save mode
    opt.proj_share_weight = True  # share decoder embedding and projection weights
    opt.label_smoothing = True  # enable label smoothing
    opt.cuda = False
    opt.batch_size = 200
    opt.epoch = 30

    print(opt)
    #========= Loading Dataset =========#
    # The data is already encoded; the encoding tables live inside `data`, which is a dict.
    # The src and tgt vocabularies differ, so embs_share_weight must stay False here.
    # The whole dataset (multi30k.atok.low.pt in the project root) is only ~3 MB: roughly
    # 30k sentence pairs with a ~3k vocabulary of common English words, word-level only
    # (no word-piece), so out-of-vocabulary tokens are common for arbitrary sentences,
    # but it is very convenient and fast for testing.
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len
    # Length preprocessing: just adds padding.
    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'
    print('All configured parameters:')
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(  # 准备网络模型.
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Ejemplo n.º 25
0
def main():
    h = logging.StreamHandler()
    formatter = logging.Formatter("[%(asctime)s][%(levelname)s]%(message)s",
                                  datefmt="%Y-%m-%d %H:%M:%S")
    h.setFormatter(formatter)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.addHandler(h)

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_path', default="../Data/dataset")
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=512)

    parser.add_argument('-d_model', type=int, default=15)
    parser.add_argument('-d_inner_hid', type=int, default=256)
    parser.add_argument('-d_k', type=int, default=15)
    parser.add_argument('-d_v', type=int, default=15)

    parser.add_argument('-n_head', type=int, default=1)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup',
                        '--n_warmup_steps',
                        type=int,
                        default=100000)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-seed', type=int, default=None)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-scale_emb_or_prj', type=str, default='prj')

    parser.add_argument('-output_dir', type=str, default='./checkpoint/')
    parser.add_argument('-summary_dir', type=str, default='./summary')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    logging.info(opt)

    writer = SummaryWriter(log_dir=str(opt.summary_dir))

    if opt.seed is not None:
        torch.manual_seed(opt.seed)
        torch.backends.cudnn.benchmark = False
        np.random.seed(opt.seed)
        random.seed(opt.seed)

    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#
    pkl_files = os.listdir(opt.data_path)
    pwd = os.getcwd()
    pkl_files = [
        os.path.join(pwd, opt.data_path, file) for file in pkl_files
        if 'train' in file
    ]
    data_list = [data for data in pkl_files if '.pkl' in data]
    random.shuffle(data_list)
    logging.info(data_list)

    transformer = Transformer(trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_trg_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout,
                              scale_emb_or_prj=opt.scale_emb_or_prj).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.lr_mul, opt.d_model, opt.n_warmup_steps)

    test(transformer, data_list, optimizer, device, opt, writer)
Ejemplo n.º 26
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', default="./pssp-data/data.pt")

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=17)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=256)
    parser.add_argument('-d_inner_hid', type=int, default=512)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default="model")
    parser.add_argument('-save_plot', default="loss")  # '.png' is appended when saving
    parser.add_argument('-save_mode', type=str,
                        choices=['all', 'best'], default='best')


    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, test_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    opt.vocab_src = training_data.dataset.src_word2idx
    opt.vocab_tgt = training_data.dataset.tgt_word2idx

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)

    transformer = DataParallel(transformer, range(0, torch.cuda.device_count())).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    weight_mask = None
    
    crossEntropy = nn.CrossEntropyLoss(weight_mask, reduction='sum', ignore_index=Constants.PAD)

    train_loss, val_loss = train(
        transformer, training_data, validation_data, optimizer, device, opt, crossEntropy)
    print("Starting Test...")
    test(transformer, test_data, device, opt, crossEntropy)
    print("Making loss graph...")
    plt = plot(train_loss, val_loss)
    plt.savefig(opt.save_plot + ".png")
    print("Finished!")
Ejemplo n.º 27
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(data['dict']['src'],
                               data['dict']['tgt'],
                               src_insts=data['train']['src'],
                               tgt_insts=data['train']['tgt'],
                               batch_size=opt.batch_size,
                               cuda=opt.cuda)

    validation_data = DataLoader(data['dict']['src'],
                                 data['dict']['tgt'],
                                 src_insts=data['valid']['src'],
                                 tgt_insts=data['valid']['tgt'],
                                 batch_size=opt.batch_size,
                                 shuffle=False,
                                 test=True,
                                 cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print(
            '[Warning]',
            'The src/tgt word2idx table are different but asked to share word embedding.'
        )

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              proj_share_weight=opt.proj_share_weight,
                              embs_share_weight=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner_hid=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout)

    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, reduction='sum')

    crit = get_criterion(training_data.tgt_vocab_size)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()

    print("===>TRAIN\n")
    train(transformer, training_data, validation_data, crit, optimizer, opt)
Ejemplo n.º 28
0
def main():
    """ Main function """
    parser = argparse.ArgumentParser()

    parser.add_argument("-data", required=True)

    parser.add_argument("-epoch", type=int, default=10)
    parser.add_argument("-batch_size", type=int, default=64)

    # parser.add_argument("-d_word_vec", type=int, default=512)
    parser.add_argument("-d_model", type=int, default=512)
    parser.add_argument("-d_inner_hid", type=int, default=2048)
    parser.add_argument("-d_k", type=int, default=64)
    parser.add_argument("-d_v", type=int, default=64)

    parser.add_argument("-n_head", type=int, default=8)
    parser.add_argument("-n_layers", type=int, default=6)
    parser.add_argument("-n_warmup_steps", type=int, default=4000)

    parser.add_argument("-dropout", type=float, default=0.1)
    parser.add_argument("-embs_share_weight", action="store_true")
    parser.add_argument("-proj_share_weight", action="store_true")

    parser.add_argument("-log", default=None)
    parser.add_argument("-save_model", default=None)
    parser.add_argument("-save_mode", type=str, choices=["all", "best"], default="best")

    parser.add_argument("-no_cuda", action="store_true")
    parser.add_argument("-label_smoothing", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data["settings"].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            "The src/tgt word2idx table are different but asked to share word embedding."

    print(opt)

    device = torch.device("cuda" if opt.cuda else "cpu")
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Ejemplo n.º 29
0
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-data_all',
                        default='data/csv/data_train_2_sort.torch')
    parser.add_argument('-save_model', default='module/2018-7-30.pt')
    parser.add_argument('-start_time', default='2018-07-01')
    parser.add_argument('-end_time', default='2018-08-30')

    parser.add_argument('-epoch', type=int, default=16)
    parser.add_argument('-batch_size', type=int, default=128)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=32)
    parser.add_argument('-d_v', type=int, default=32)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=2)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.3)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='log/logs.log')

    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('-batch_x', default=32)
    parser.add_argument('-batch_y', default=32)
    parser.add_argument('-train_type', default='name')

    opt = parser.parse_args()
    opt.cuda = torch.cuda.is_available()
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    # opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, voc_name, data_val_ofpa = ld.get_data_loader(
        opt, device)
    opt.src_vocab_size = voc_name
    opt.tgt_vocab_size = opt.src_vocab_size
    if opt.train_type == 'time':
        voc = ld.get_time_vac(opt)
        opt.tgt_vocab_size = voc if voc > 500 else 728

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert opt.src_vocab_size == opt.tgt_vocab_size, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.batch_x,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)
    if opt.train_type == 'time':
        print("train time dim ")
        # train(transformer, train_time, val_time, optimizer, device, opt)
    else:
        train(transformer, training_data, validation_data, optimizer, device,
              opt, data_val_ofpa)
        print('  - (Training)   accuracy: {accu:3.3f} %, '\
              'elapse: {elapse:3.3f} min'.format(
                  accu=100*train_accu,
                  elapse=(time.time()-start)/60))

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, predicates)
        print('  - (Validation)  accuracy: {accu:3.3f} %, '\
                'elapse: {elapse:3.3f} min'.format(
                    accu=100*valid_accu,
                    elapse=(time.time()-start)/60))

        valid_accus += [valid_accu]

device = torch.device('cpu')


word2idx,ints,en1_pos,en2_pos,predicates,relation2idx = data.build_sentences()

training_data, validation_data = prepare_dataloaders(word2idx,ints,en1_pos,en2_pos,predicates)
model = Transformer(
    n_src_vocab=len(word2idx),
    len_max_seq=config.max_seq_len).to(device)

optimizer = ScheduledOptim(
    optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        betas=(0.9, 0.98), eps=1e-09),
    512, 1000)

train(model, training_data, validation_data, optimizer,predicates)