def __preprocess(self):
    """ Preprocess the data into lists of token indices. """
    print('\nProcessing data ... ')

    if self.tokenizer_dir:
        self.__src_tokenizer = load_pkl(
            get_file_path(data_dir, 'tokenizer', self.tokenizer_dir, 'tokenizer.pkl'))
        self.__tar_tokenizer = self.__src_tokenizer

    elif self.M.checkpoint_params['load_model']:
        load_model_params = self.M.checkpoint_params['load_model']
        tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                            load_model_params[0], load_model_params[1],
                                            'tokenizer.pkl')
        self.__src_tokenizer = self.__tar_tokenizer = read_cache(tokenizer_path)

    else:
        self.__src_tokenizer = utils.pipeline(
            self.M.tokenizer_pl,
            self.__tokenizer_data_src,
            self.__tokenizer_data_tar,
            self.M.data_params,
        )
        self.__tar_tokenizer = self.__src_tokenizer

    del self.__tokenizer_data_src
    del self.__tokenizer_data_tar

    params = {
        **self.M.data_params,
        'tokenizer': self.__src_tokenizer,
        'src_tokenizer': self.__src_tokenizer,
        'tar_tokenizer': self.__tar_tokenizer,
    }

    self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__train_src, self.__train_tar, params)
    self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__val_src, self.__val_tar, params)
    self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

    # get vocabulary sizes
    self.__src_vocab_size = self.__src_tokenizer.vocab_size
    self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

    print('\nFinish preprocessing ')
def __preprocess(self):
    """ Preprocess the data into lists of token indices. """
    print('\nProcessing data ... ')

    load_model_params = Model.checkpoint_params['load_model']
    if load_model_params:
        tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                            load_model_params[0], load_model_params[1],
                                            'tokenizer.pkl')
        self.__src_tokenizer = self.__tar_tokenizer = read_cache(tokenizer_path)

        self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
            Model.encode_pipeline, self.__train_src, self.__train_tar, {
                **Model.data_params,
                'tokenizer': self.__src_tokenizer,
                'src_tokenizer': self.__src_tokenizer,
                'tar_tokenizer': self.__tar_tokenizer,
            })

    else:
        self.__train_src_encode, self.__train_tar_encode, \
            self.__src_tokenizer, self.__tar_tokenizer = utils.pipeline(
                Model.preprocess_pipeline,
                self.__train_src,
                self.__train_tar,
                Model.data_params,
            )

    params = {
        **Model.data_params,
        'tokenizer': self.__src_tokenizer,
        'src_tokenizer': self.__src_tokenizer,
        'tar_tokenizer': self.__tar_tokenizer,
    }

    self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
        Model.encode_pipeline, self.__test_src, self.__test_tar, params)

    # get vocabulary sizes
    self.__src_vocab_size = self.__src_tokenizer.vocab_size
    self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

    print('\nFinish preprocessing ')
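# A hedged sketch of the caching helpers used above (read_cache / load_pkl);
# the real lib.utils implementations are not shown in this section, but a
# pickle-based version like the one below matches how they are called here
# (loading a previously fitted tokenizer so it does not have to be refit).
import os
import pickle

def read_cache(path):
    """Load a pickled object (e.g. a fitted tokenizer) if the file exists."""
    if not os.path.isfile(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)

def write_cache(path, obj):
    """Counterpart (hypothetical name): pickle an object so later runs can reuse it."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)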
def log(self, kwargs):
    string = '\n'.join(
        list(map(lambda x: '{}: {}'.format(x[0], x[1]), list(kwargs.items()))))

    data = (self.model.name, self.model.TIME, string,
            self.model.data_params, self.model.model_params, self.model.train_params,
            self.__train_time, self.__test_train_time, self.__test_test_time)

    string = '\n---------------------------------------------------' \
             '\nmodel_name: {}\nmodel_time: {}\n{}\n' \
             'data_params: {}\nmodel_params: {}\ntrain_params: {}\n' \
             'train_time: {}\ntest_train_time: {}\ntest_test_time: {}\n\n'.format(*data)
    print(string)

    with open(os.path.join(create_dir_in_root('runtime', 'log'),
                           '{}.log'.format(self.model.name)), 'ab') as f:
        f.write(string.encode('utf-8'))
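# A small standalone illustration of what the map/lambda chain in log()
# produces; the metric names and values below are hypothetical.
example_kwargs = {'train_loss': 1.83, 'val_loss': 2.10, 'test_bleu': 0.21}
example_string = '\n'.join('{}: {}'.format(k, v) for k, v in example_kwargs.items())
# example_string ->
#   train_loss: 1.83
#   val_loss: 2.10
#   test_bleu: 0.21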
def train(self, train_x, train_y, val_x=None, val_y=None):
    """ TODO: you need to override this train function

    :params: train_x (tuple): a tuple with two elements, each one a np.array
        (encoder_input, decoder_input), e.g., (train_src_encode, train_tar_encode[:, :-1])
    :params: train_y (np.array): decoder_output (ground truth), e.g., train_tar_encode[:, 1:]
    :params: val_x (tuple): used for validation, could be None. If None, no validation;
        otherwise it has the same structure as train_x
    :params: val_y (np.array): used for validation, could be None. If None, no validation;
        otherwise it has the same structure as train_y
    no returns
    """
    # if we want to load a trained model
    if self.checkpoint_params['load_model']:
        model_dir = create_dir_in_root(
            *(['runtime', 'models'] + self.checkpoint_params['load_model']))
        self.load_model(model_dir)

    if not self.__finish_train:
        # TODO
        # you should write down the train process here.
        # This process should also provide the following functions:
        #   calculate the loss
        #   update model weights
        #   early stop
        #   validation
        #   something like tensorboard (save tensorboard files to self.tb_dir)
        #   print out the results to console for each epoch
        #   save the best model to self.model_dir
        #   ...
        #
        # for step in range(steps):
        #     ...
        # load the best model so that it could be tested

        # from torchtext.data import Field, BucketIterator, TabularDataset
        # import load.jr_en as loader
        # JP_TEXT = Field(tokenize=jr_en.jr_tokenizer)
        # EN_TEXT = Field(tokenize=jr_en.en_tokenizer)
        # print("Preparing to load data to batchify data")
        # dataLoader = loader.Loader()
        # jr_data, en_data = dataLoader.data()
        # print("Successfully split the data")
        # JP_TEXT.build_vocab(jr_data, val)
        # EN_TEXT.build_vocab(en_data, val)

        from torchtext.data import Field

        JP_TEXT = Field(tokenize=jr_en.jr_tokenizer)
        EN_TEXT = Field(tokenize=jr_en.en_tokenizer)

        # batchify the data
        def batchify(data, batchsize, TEXT):
            data = TEXT.numericalize([data.examples[0].text])
            # Divide the dataset into batchsize parts.
            nbatch = data.size(0) // batchsize
            # Trim off any extra elements that wouldn't cleanly fit (remainders).
            data = data.narrow(0, 0, nbatch * batchsize)
            # Evenly divide the data across the batchsize batches.
            data = data.view(batchsize, -1).t().contiguous()
            return data.to(device)

        jr_data = batchify(train_x, self.train_params['batch_size'], JP_TEXT)
        en_data = batchify(train_y, self.train_params['batch_size'], EN_TEXT)

        emb_dim = self.model_params["emb_dim"]

        # use emb_dim as the backprop-through-time sequence length
        def get_batch(source, i):
            seq_len = emb_dim
            data = source[i:i + seq_len]
            target = source[i + 1:i + 1 + seq_len].view(-1)
            return data, target

        import time

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(self.model.parameters(),
                                    lr=self.train_params['learning_rate'])
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

        for epoch in range(1, self.checkpoint_params["epoch"] + 1):
            # optimizer.zero_grad()
            # output = self.model(train_x)
            # loss = criterion(output.view(-1, self.checkpoint_params["epoch"]), train_y)
            # loss.backward()
            # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            # optimizer.step()
            #
            # # load the best model so that it could be tested
            # self.load_model()
            #
            # self.__finish_train = True

            self.model.train()  # turn on train mode
            data, targets = get_batch(jr_data, epoch)
            total_loss = 0.
            start_time = time.time()

            optimizer.zero_grad()
            output = self.model(data)
            # flatten the time dimension so CrossEntropyLoss sees (tokens, vocab_size)
            loss = criterion(output.view(-1, output.size(-1)), targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            log_interval = 200
            # if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, 1, len(jr_data) // emb_dim,
                      scheduler.get_lr()[0],
                      elapsed * 1000 / log_interval,
                      cur_loss, math.exp(cur_loss)))
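# A hedged usage sketch of the calling convention described in the train()
# docstring above. The arrays below are toy stand-ins for the real encoded
# data, and `MyModel` is a hypothetical concrete subclass that overrides train().
import numpy as np

train_src_encode = np.random.randint(1, 100, size=(32, 20))  # toy encoder input ids
train_tar_encode = np.random.randint(1, 100, size=(32, 21))  # toy decoder sequence ids

# model = MyModel()
# model.train(
#     train_x=(train_src_encode, train_tar_encode[:, :-1]),  # (encoder_input, decoder_input)
#     train_y=train_tar_encode[:, 1:],                        # decoder_output (ground truth)
#     val_x=None,                                             # None -> skip validation
#     val_y=None,
# )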
sub_root_dir = os.path.split(sub_sub_root_dir)[0]
root_dir = os.path.split(sub_root_dir)[0]

sys.path.append(sub_sub_root_dir)
sys.path.append(sub_root_dir)
sys.path.append(root_dir)

import shutil
from lib import utils

"""
Cleaning the useless model files and tensorboard files
"""

save_best_model_num = 1

model_dir = utils.create_dir_in_root('runtime', 'models')
tb_dir = utils.create_dir_in_root('runtime', 'tensorboard')
tokenizer_dir = utils.create_dir_in_root('runtime', 'tokenizer')

for model_name in os.listdir(model_dir):
    tmp_model_dir = os.path.join(model_dir, model_name)
    print(f'\nchecking {tmp_model_dir} ...')

    for _date in os.listdir(tmp_model_dir):
        date_dir = os.path.join(tmp_model_dir, _date)
        if not os.path.isdir(date_dir):
            continue

        model_list = os.listdir(date_dir)
        print(f'\tchecking {_date}')
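# A hedged sketch (the rest of the cleanup script is not shown here) of one way
# save_best_model_num could be applied inside the per-date loop: keep only the
# most recently modified checkpoint files in each date directory.
import os

def prune_checkpoints(date_dir, keep_num=1):
    """Remove all but the `keep_num` most recently modified files in date_dir."""
    files = [os.path.join(date_dir, f) for f in os.listdir(date_dir)]
    files = [f for f in files if os.path.isfile(f)]
    files.sort(key=os.path.getmtime, reverse=True)
    for old_file in files[keep_num:]:
        os.remove(old_file)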
def create_dir(self):
    # create tensorboard paths
    self.tb_dir_word_translate = utils.create_dir_in_root(
        'runtime', 'tensorboard', self.name, 'word_translate', self.TIME)
    self.tb_dir_cdlm_translate = utils.create_dir_in_root(
        'runtime', 'tensorboard', self.name, 'cdlm_translate', self.TIME)
    self.tb_dir_cdlm_ner = utils.create_dir_in_root(
        'runtime', 'tensorboard', self.name, 'cdlm_ner', self.TIME)
    self.tb_dir_cdlm_pos = utils.create_dir_in_root(
        'runtime', 'tensorboard', self.name, 'cdlm_pos', self.TIME)
    self.tb_dir_cdlm_synonym = utils.create_dir_in_root(
        'runtime', 'tensorboard', self.name, 'cdlm_synonym', self.TIME)

    # create model paths
    self.model_dir_word_translate = utils.create_dir_in_root(
        'runtime', 'models', self.name, 'word_translate', self.TIME)
    self.model_dir_cdlm_translate = utils.create_dir_in_root(
        'runtime', 'models', self.name, 'cdlm_translate', self.TIME)
    self.model_dir_cdlm_ner = utils.create_dir_in_root(
        'runtime', 'models', self.name, 'cdlm_ner', self.TIME)
    self.model_dir_cdlm_pos = utils.create_dir_in_root(
        'runtime', 'models', self.name, 'cdlm_pos', self.TIME)
    self.model_dir_cdlm_synonym = utils.create_dir_in_root(
        'runtime', 'models', self.name, 'cdlm_synonym', self.TIME)

    # checkpoint paths for each task
    self.checkpoint_path_word_translate = os.path.join(
        self.model_dir_word_translate,
        self.name + self.checkpoint_params['extend_name'])
    self.checkpoint_path_cdlm_translate = os.path.join(
        self.model_dir_cdlm_translate,
        self.name + self.checkpoint_params['extend_name'])
    self.checkpoint_path_cdlm_ner = os.path.join(
        self.model_dir_cdlm_ner,
        self.name + self.checkpoint_params['extend_name'])
    self.checkpoint_path_cdlm_pos = os.path.join(
        self.model_dir_cdlm_pos,
        self.name + self.checkpoint_params['extend_name'])
    self.checkpoint_path_cdlm_synonym = os.path.join(
        self.model_dir_cdlm_synonym,
        self.name + self.checkpoint_params['extend_name'])

    self.tokenizer_dir = utils.create_dir_in_root(
        'runtime', 'tokenizer', self.name, self.TIME)
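# A hedged refactor sketch (not part of the original code): the same paths can
# be built with one loop over the task names. Shown as a standalone helper;
# `create_dir_in_root` is the same lib.utils function used above, and
# `build_task_dirs` is a hypothetical name introduced only for illustration.
import os
from lib import utils

def build_task_dirs(name, time_str, extend_name):
    """Return {attr_name: path} for every tensorboard, model, and checkpoint path."""
    tasks = ['word_translate', 'cdlm_translate', 'cdlm_ner', 'cdlm_pos', 'cdlm_synonym']
    paths = {}
    for task in tasks:
        tb_dir = utils.create_dir_in_root('runtime', 'tensorboard', name, task, time_str)
        model_dir = utils.create_dir_in_root('runtime', 'models', name, task, time_str)
        paths[f'tb_dir_{task}'] = tb_dir
        paths[f'model_dir_{task}'] = model_dir
        paths[f'checkpoint_path_{task}'] = os.path.join(model_dir, name + extend_name)
    return paths

# Inside create_dir() one could then write, e.g.:
#   for attr, path in build_task_dirs(self.name, self.TIME,
#                                     self.checkpoint_params['extend_name']).items():
#       setattr(self, attr, path)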