def __init__(self, params, batch_size):
    split_per = 0.95
    self.batch_size = batch_size
    self.lang_set = params['lang_set']
    # The embedding file is a stream of pickled batches; read until EOF.
    self.token_embd_list = []
    with open(corpus_embd_file_path_format.format('_'.join(self.lang_set)),
              "rb") as f:
        while True:
            try:
                self.token_embd_list.append(pickle.load(f))
            except EOFError:
                break
    self.corpus_df = pd.read_csv(corpus_csv_path, sep='\001', encoding='utf-8')
    # Keep rows whose task_type matches ANY of the requested tasks.
    # (AND-ing equality tests against different task names would always
    # produce an empty frame when more than one task is given.)
    df_filter = pd.Series([False] * len(self.corpus_df))
    for task in params['task_name']:
        df_filter = df_filter | (self.corpus_df['task_type'] == task)
    self.corpus_df = self.corpus_df[df_filter]
    self.corpus_label = self.corpus_df[self.corpus_df['data_set'] != 'test']
    self.corpus_unlabel = self.corpus_df[self.corpus_df['data_set'] == 'test']
    self.corpus_df_train = self.corpus_label[:int(
        len(self.corpus_label) * split_per)]
    self.corpus_df_dev = self.corpus_label[
        -int(len(self.corpus_label) * (1 - split_per)):]
    self.corpus_df_test = self.corpus_unlabel
    logger.info("load {} data of {}".format(len(self.token_embd_list),
                                            "/".join(self.lang_set)))
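# Illustrative counterpart (an assumption, not part of the original pipeline):
# the loader above expects the embedding file to be a stream of pickled
# batches written back-to-back, so each pickle.load() returns one batch.
# A minimal hypothetical writer, with the helper name invented here:
def dump_token_embeddings(embd_batches, lang_set):
    path = corpus_embd_file_path_format.format('_'.join(lang_set))
    with open(path, "wb") as f:
        for batch in embd_batches:
            pickle.dump(batch, f)  # one pickle record per batch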
def run_train(self, epochs=1):
    # epoch loop: continue numbering from any previously completed epochs
    for epoch_i in range(self.params['epoch_num'] + 1,
                         self.params['epoch_num'] + epochs + 1):
        logger.info("epoch: {}".format(epoch_i))
        # train-batch phase
        self.model.train()
        iter_i = 0
        for iter_i, data_train in enumerate(
                self.data_stream(data_set='train'), self.iter_count + 1):
            # data_train = [batch_size, (input_batch, target_batch)]
            self.run_train_batch(data_train)
            if iter_i % 100 == 0:
                self.model.eval()
                score_tmp = self.score  # train metric, before run_eval() overwrites it
                self.run_eval()
                logger.info(
                    "step:{} loss: {:.4f} train_metric: {:.4f} dev_metric: {:.4f}"
                    .format(iter_i, self.loss, score_tmp, self.score))
                # switch back to train mode, otherwise dropout/batch-norm
                # stay in eval behaviour for the remaining batches
                self.model.train()
        model_name = "_".join([
            self.params['model_name_prefix'],
            '_'.join(self.params['task_name']),
            str(epoch_i)
        ])
        self.save_model(model_name=model_name)
        self.iter_count = iter_i
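# Hedged sketch of the per-batch step that run_train() drives. The
# repository's actual run_train_batch is not shown here, and the helper
# name below is illustrative:
def train_one_batch(model, optimizer, loss_func, batch):
    inputs, label = batch            # matches the (inputs, label) pairs yielded by the data stream
    optimizer.zero_grad()
    output = model(*inputs)          # e.g. inputs = (token_embd_batch, keyword_idx)
    loss = loss_func(output, label)
    loss.backward()
    optimizer.step()
    return loss.item(), output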
def __init__(self, params, model, data_stream, optimizer=None,
             loss_func=None, eval_func=None, USE_CUDA=True):
    # USE_CUDA is retained for API compatibility; check_gpu() decides at runtime
    self.params = params
    self.model = check_gpu(model)
    self.data_stream = data_stream
    self.optimizer = self.build_optimizer() if optimizer is None else optimizer
    self.loss_func = self.build_loss_func() if loss_func is None else loss_func
    self.eval_func = self.build_eval_func() if eval_func is None else eval_func
    self.iter_count = 0
    self.loss = 0
    self.score = 0
    # element_size() gives bytes per element directly; this is more robust
    # than parsing the trailing digits of the dtype string
    total_storage_space = sum(
        p.numel() * p.element_size()
        for p in self.model.parameters()) / pow(2, 20)
    logger.info(
        "Total Model Storage Space: {:.0f} MB".format(total_storage_space))
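# A minimal sketch of what the fallback builders referenced above might
# look like (assumptions; build_optimizer/build_loss_func/build_eval_func
# are not shown in this file):
#
#   def build_optimizer(self):
#       # only optimize parameters that load_model_params() left unfrozen
#       return optim.Adam(
#           filter(lambda p: p.requires_grad, self.model.parameters()),
#           lr=self.params.get('lr', 1e-3))
#
#   def build_loss_func(self):
#       return nn.CrossEntropyLoss()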
def __init__(self, params, data_set, batch_size):
    # `batch_size` added to the signature: the body stores it, but the
    # original signature never accepted it
    self.corpus_df = pd.read_csv(os.path.join(
        data_path, 'SemEval2021_Task2_corpus.csv'),
        sep='\001', encoding='utf-8')
    self.data_set = data_set
    if self.data_set == 'train':
        self.corpus_df = self.corpus_df[
            (self.corpus_df['data_set'] == 'training')
            | (self.corpus_df['data_set'] == 'dev')]
    elif self.data_set == 'eval':
        self.corpus_df = self.corpus_df[self.corpus_df['data_set'] == 'trial']
    elif self.data_set == 'predict':
        self.corpus_df = self.corpus_df[self.corpus_df['data_set'] == 'test']
    else:
        # fail fast instead of silently continuing with the full corpus
        raise ValueError('Invalid data_set: {}'.format(data_set))
    # shuffle training data only
    if self.data_set == 'train':
        self.corpus_df = self.corpus_df.sample(frac=1).reset_index(drop=True)
    self.params = params
    self.batch_size = batch_size
    self.n_corpus = len(self.corpus_df)
    logger.info("data num : {}".format(self.n_corpus))
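# Usage sketch (the class name `CorpusData` is illustrative; only the
# constructor is shown above):
#
#   train_data = CorpusData(params, data_set='train', batch_size=32)
#   eval_data = CorpusData(params, data_set='eval', batch_size=32)
#   test_data = CorpusData(params, data_set='predict', batch_size=32)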
def load_model_xlm(reloaded):  # renamed from load_model_xml: it loads an XLM checkpoint
    # extract parameters
    params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(params.lang2id.keys()))
    # build dictionary
    dico = dictionary.Dictionary(reloaded['dico_id2word'],
                                 reloaded['dico_word2id'],
                                 reloaded['dico_counts'])
    params['dico'] = dico
    # update parameters
    params.n_words = len(dico)
    params.bos_index = dico.index(dictionary.BOS_WORD)
    params.eos_index = dico.index(dictionary.EOS_WORD)
    params.pad_index = dico.index(dictionary.PAD_WORD)
    params.unk_index = dico.index(dictionary.UNK_WORD)
    params.mask_index = dico.index(dictionary.MASK_WORD)
    # build model / reload weights (frozen=True: the XLM encoder is not fine-tuned)
    model = TransformerModel(params, params['dico'], True, True)
    model = framework.check_gpu(model)
    framework.load_model_params(model=model,
                                model_params_from_file=reloaded['model'],
                                frozen=True)
    return model, params
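# Usage sketch, assuming the checkpoint was produced by torch.save() with
# the keys read above ('params', 'dico_id2word', 'dico_word2id',
# 'dico_counts', 'model'); the path name is illustrative:
#
#   reloaded = torch.load(xlm_ckpt_path, map_location='cpu')
#   model, params = load_model_xlm(reloaded)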
def load_model_params(model, model_params_from_file, frozen=None):
    # Copy matching parameters from a loaded state dict into `model`,
    # optionally (un)freezing them.
    state, loaded_names = {}, set()
    for para_name, _ in model.named_parameters():
        if para_name in model_params_from_file:
            state[para_name] = model_params_from_file[para_name]
            loaded_names.add(para_name)
    model.load_state_dict(state, strict=False)
    # Resolves the old "FROZEN tag is a mistake" TODO: load_state_dict only
    # copies values, so requires_grad must be set on the model's own
    # parameters, not on the tensors read from file.
    for para_name, para_value in model.named_parameters():
        if frozen is not None and para_name in loaded_names:
            para_value.requires_grad = not frozen
        logger.info("[{}]{}{}[{}]{}".format(
            'Not Frozen' if para_value.requires_grad else 'Frozen',
            para_name,
            list(para_value.size()),
            str(para_value.dtype).split(".")[-1],
            " **INIT_FROM_FILE**" if para_name in loaded_names else "",
        ))
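# Why the flag is set on the model's own parameters above: state-dict
# loading copies tensor *values* only, so a requires_grad flag on the
# tensors read from file never reaches the model. A self-contained
# demonstration (illustrative helper, not part of the repo):
def _demo_state_dict_keeps_requires_grad():
    import torch
    import torch.nn as nn
    layer = nn.Linear(2, 2)
    src = {'weight': torch.zeros(2, 2), 'bias': torch.zeros(2)}
    src['weight'].requires_grad = False   # flag on the source tensor...
    layer.load_state_dict(src)
    assert layer.weight.requires_grad     # ...is ignored: still trainable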
def eval_acc(output, label):
    # the `def` header was missing in the source; the name here is assumed
    prediction = torch.argmax(output, dim=1)
    correct_num = (prediction == label).sum().float()
    total_num = len(label)
    acc = correct_num / total_num
    return acc


def check_gpu(x):
    # move a tensor/module to GPU when one is available; pass None through
    if x is None:
        return None
    elif torch.cuda.is_available():
        return x.cuda()
    else:
        return x


if torch.cuda.is_available():
    logger.info("USE GPU")
    GPU_OR_CPU = 'cuda'
else:
    logger.info("USE CPU")
    GPU_OR_CPU = 'cpu'

if __name__ == '__main__':
    logger.info("END")
    # b = torch.from_numpy(a)
    # feature_tensor_single.detach().numpy()
    # learnable model parameters are returned by net.parameters()
    # params = list(net.parameters())
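# Quick illustration of the two helpers above:
#
#   logits = check_gpu(torch.tensor([[0.2, 0.8], [0.9, 0.1]]))
#   labels = check_gpu(torch.tensor([1, 0]))
#   eval_acc(logits, labels)   # -> tensor(1.): argmax is correct for both rows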
model_bertflow_path = os.path.join(
    data_root_path, 'BERT-flow', 'exp',
    'exp_t_STS-B_ep_1.00_lr_5.00e-05_e_avg-last-2_f_11_1.00e-03_allsplits')
model_bertflow_file = os.path.join(model_bertflow_path, 'model.ckpt-269')
corpus_embd_file_path = os.path.join(data_path, "corpus_embd.npy")
corpus_embd_file_path_format = os.path.join(data_path, "corpus_embd_{}.npy")
corpus_token_file_path = os.path.join(data_path, "corpus_token.txt")
corpus_csv_path = os.path.join(data_path, 'SemEval2021_Task2_corpus.csv')

if __name__ == "__main__":
    import pickle
    from util_tools import logger
    logger.info("BEGIN")
    corpus_list = []
    # the embedding file is a stream of pickled batches; read until EOF
    with open(corpus_embd_file_path, "rb") as f:
        i = 0
        while True:
            try:
                corpus_list.append(pickle.load(f))
                i += 1
            except EOFError:
                break
    logger.info("END")
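# To materialize the streamed batches as one matrix (illustrative; assumes
# each pickled batch is a 2-D numpy array and numpy is imported as np):
#
#   corpus_embd = np.vstack(corpus_list)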
        for x, y in zip(df_batch['sent1_token_keyword_idx'],
                        df_batch['sent2_token_keyword_idx'])
    ]
    label = np.array(df_batch['label']).astype(np.int64)  # np.int is removed in NumPy >= 1.24
    label = framework.check_gpu(torch.tensor(label, dtype=torch.long))
    yield (token_embd_batch, keyword_idx), label


if __name__ == '__main__':
    batch_size = 32
    params = get_params()
    # params = get_params(model_init_path=os.path.join(data_path, "polysemy_bertflow_en-en_10"))
    model = load_model(params)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()), lr=0.01)
    loss_func = nn.CrossEntropyLoss()
    data_stream = generate_data(params=params, batch_size=batch_size)
    polysemy_bertflow_model = framework.framework(params=params,
                                                  model=model,
                                                  data_stream=data_stream,
                                                  optimizer=optimizer,
                                                  loss_func=loss_func)
    # polysemy_bertflow_model.run_eval()
    polysemy_bertflow_model.run_train(epochs=30)
    torch.cuda.empty_cache()
    logger.info('END')
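# A prediction pass after training could reuse the same pieces (sketch;
# the exact inference flow is an assumption):
#
#   polysemy_bertflow_model.model.eval()
#   with torch.no_grad():
#       for inputs, label in data_stream(data_set='test'):
#           preds = torch.argmax(polysemy_bertflow_model.model(*inputs), dim=1)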