def eval_one_epoch(dataloader, model, eval_loss, eval_steps, data_process_func):
    """Run one evaluation pass over ``dataloader`` without gradient tracking.

    ``eval_loss`` / ``eval_steps`` are running totals carried across epochs by
    the caller; they are updated here and returned so perplexity reflects
    everything seen so far.

    Returns:
        (per-batch losses, running perplexities, updated eval_loss,
        updated eval_steps)
    """
    batch_losses = []
    running_perplexities = []
    cuda_logger = loggers.cuda_logger
    eval_logger = loggers.validation_logger
    for step, raw in enumerate(dataloader):
        tic = time.time()
        batch = data_process_func(raw)
        shapes = {k: v.shape for k, v in batch.items()}
        log_info(cuda_logger,
                 'Allocated batches {}, {}'.format(cuda_mem_in_mb(), shapes))
        # no gradients needed — evaluation only
        with torch.no_grad():
            mean_loss = get_model_output(model, batch)[0].mean().item()
        eval_loss += mean_loss
        eval_steps += 1
        # perplexity of the running mean loss (includes prior epochs)
        perp = torch.exp(torch.tensor(eval_loss / eval_steps)).item()
        batch_losses.append(mean_loss)
        running_perplexities.append(perp)
        log_info(
            eval_logger, '{} Iter Loss {} Perplexity {} Time {}'.format(
                step, mean_loss, perp, time.time() - tic))
    return batch_losses, running_perplexities, eval_loss, eval_steps
def gpt2_eval(gpt2, model, dataset, batch_size=32, data_func=lambda x: x):
    """Score ``dataset`` with both the fine-tuned ``model`` and plain ``gpt2``.

    Delegates the per-batch work to ``gpt2_eval_one_epoch`` and packages its
    four score arrays into a dict, which is also logged.

    Returns:
        dict with keys 're_prod', 're_avg', 'gpt2_prod', 'gpt2_avg'.
    """
    loader = DataLoader(dataset,
                        shuffle=False,
                        batch_size=batch_size,
                        collate_fn=lambda x: x)
    scores = gpt2_eval_one_epoch(loader, gpt2, model, data_func)
    keys = ('re_prod', 're_avg', 'gpt2_prod', 'gpt2_avg')
    result = dict(zip(keys, scores))
    log_info(loggers.sample_logger, 'Total ratio {}'.format(result))
    return result
def gpt2_eval_one_epoch(dataloader, gpt2, model, data_func):
    """Score every batch with the fine-tuned ``model`` and with plain ``gpt2``.

    For each item two scores are produced from its token probabilities ``ep``
    (``probs[i][1]`` from ``get_seq_prob``):
      * avg  — log of the mean token probability,
      * prod — log of the product of token probabilities (sum of logs).
    Each row is ``[idx..., score]``.

    Bug fix: the model branch previously used ``np.mean(np.log(ep))`` for its
    "prod" score while the gpt2 branch used ``np.sum(np.log(ep))``; the two
    arrays are compared as a ratio, so both now use the sum of logs.

    Returns:
        (ratio_prod, ratio_avg, gpt2_prod, gpt2_avg) as numpy arrays.
    """
    sample_logger = loggers.sample_logger
    ratio_prod, ratio_avg = [], []
    gpt2_prod, gpt2_avg = [], []

    def _score(scored_model, data, prod_out, avg_out):
        # Append [idx..., score] rows for one model; returns the batch size.
        probs = get_seq_prob(scored_model, data, data_func=process_re_data)
        for i in range(len(probs)):
            idx = np.array(data['idx'][i])
            ep = probs[i][1]
            prob_avg = np.log(np.mean(ep)).item()
            prob_prod = np.sum(np.log(ep)).item()
            avg_out.append(np.append(idx, prob_avg))
            prod_out.append(np.append(idx, prob_prod))
        return len(probs)

    for step, raw in enumerate(dataloader):
        data = data_func(raw)
        _score(model, data, ratio_prod, ratio_avg)
        dl = _score(gpt2, data, gpt2_prod, gpt2_avg)
        log_info(
            sample_logger,
            'RE Sample {} ratio prod {}, {}, ratio mean {}, {}'.format(
                dl, [x[-1] for x in ratio_prod[-dl:]],
                [x[-1] for x in gpt2_prod[-dl:]],
                [x[-1] for x in ratio_avg[-dl:]],
                [x[-1] for x in gpt2_avg[-dl:]]))
    return np.array(ratio_prod), np.array(ratio_avg), np.array(
        gpt2_prod), np.array(gpt2_avg)
def gpt2_model_eval(config, index):
    """Evaluate the configured model pair and save the ratio scores.

    Moves both models to ``main_device``, runs ``gpt2_eval`` with parameters
    extracted from ``config``, and, when a save path is configured, writes the
    ratios to ``<parent>/log/<index>/gpt2_ratios.pt``.

    Returns:
        (the model, -1) — the -1 is a placeholder loss for the caller.
    """
    from global_constants import ConfigEnums, main_device
    ce = ConfigEnums
    save_path = config[ce.save_path]
    config[ce.model] = config[ce.model].to(main_device)
    config[ce.gpt2] = config[ce.gpt2].to(main_device)
    final_logger = loggers.final_logger
    eval_params = get_params(config, gpt2_eval)
    ratios = gpt2_eval(**eval_params)
    if save_path is not None:
        log_path = list(os.path.split(save_path)[:-1])
        log_path.append('log')
        log_path.append(str(index) + '/')
        log_path = '/'.join(log_path)
        # Bug fix: os.mkdir raised FileNotFoundError when the intermediate
        # 'log' directory did not exist yet; makedirs creates the whole chain.
        os.makedirs(log_path, exist_ok=True)
        log_info(final_logger, 'saving ratios')
        torch.save(ratios, log_path + 'gpt2_ratios.pt')
        log_info(final_logger, 'All saved')
    return config[ce.model], -1
def evaluate(model, dataset, batch_size, epochs, data_func=lambda x: x):
    """Evaluate ``model`` on ``dataset`` for ``epochs`` shuffled passes.

    Accumulates loss/step counts across epochs via ``eval_one_epoch`` and logs
    per-epoch statistics.

    Returns:
        (final perplexity tensor, per-step perplexities, per-step losses)
    """
    validation_logger = loggers.validation_logger
    eval_loss = 0
    eval_steps = 0
    all_losses = []
    all_perplexities = []
    model.eval()
    for epoch in range(epochs):
        loader = DataLoader(dataset,
                            shuffle=True,
                            batch_size=batch_size,
                            collate_fn=lambda x: x)
        epoch_iter = len(loader)
        epoch_losses, epoch_perps, eval_loss, eval_steps = eval_one_epoch(
            loader, model, eval_loss, eval_steps, data_func)
        all_losses.extend(epoch_losses)
        all_perplexities.extend(epoch_perps)
        # slice out only this epoch's losses for the summary line
        segment = all_losses[epoch * epoch_iter:]
        log_info(validation_logger,
                 '----------------------------------------------------')
        log_info(
            validation_logger,
            'Epoch {}, Mean Loss {}, Min Loss {}, Accum Loss {}'.format(
                epoch, np.mean(segment), np.min(segment),
                eval_loss / eval_steps))
    eval_loss /= eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    log_info(validation_logger, 'Final perplexity {}'.format(perplexity))
    return perplexity, torch.tensor(all_perplexities), torch.tensor(all_losses)
def train_one_epoch(dataloader, model, optimizer, scheduler,
                    data_process_func, tok):
    """Run one training pass: forward, backward, clip, step, per batch.

    Batches that ``data_process_func`` maps to ``None`` are skipped.

    Returns:
        (list of per-iteration loss values, last loss tensor — ``None`` when
        every batch was skipped or the loader was empty)
    """
    loss_values = []
    cuda_logger = loggers.cuda_logger
    train_logger = loggers.train_logger
    last_loss = None
    for step, raw in enumerate(dataloader):
        tic = time.time()
        batch = data_process_func(raw)
        # guard: preprocessing may reject a batch entirely
        if batch is None:
            log_info(cuda_logger, 'Empty data {} Iter'.format(step))
            continue
        shapes = {k: v.shape for k, v in batch.items()}
        log_info(cuda_logger,
                 'Allocated batches {}, {}'.format(cuda_mem_in_mb(), shapes))
        last_loss = get_model_output(model, batch)[0].mean()
        current = last_loss.item()
        last_loss.backward()
        # clip gradients before the optimizer applies them
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        loss_values.append(current)
        log_info(
            train_logger,
            '{} Iter Loss {} Time {}'.format(step, current,
                                             time.time() - tic))
    return loss_values, last_loss
def main(config_file='model_config.json'):
    """Load ``config_file`` and launch ``start_func`` once per configuration.

    List-valued config fields are interpreted as one value per run; all lists
    must share the same length, which determines how many runs are started.

    Raises:
        ValueError: if list-valued fields disagree in length.
    """
    import libs
    # run relative to this script's own directory
    os.chdir('/'.join(os.path.abspath(__file__).split('/')[:-1]))
    libs.log_info(libs.loggers.prepare_logger,
                  'Using config {}'.format(config_file))
    # Bug fix: the existence check must come BEFORE open() — the original
    # opened the file unconditionally, so a missing config raised
    # FileNotFoundError instead of falling back to an empty config.
    if os.path.exists(config_file) and os.path.isfile(config_file):
        with open(config_file, 'r') as f:
            config = json.load(f)
    else:
        config = {}
    models = None
    for k, v in config.items():
        if isinstance(v, list):
            if models is None:
                models = len(v)
            elif models != len(v):
                raise ValueError('Config field {} has wrong length'.format(k))
    models = models if models is not None else 1
    for i in range(models):
        # take the i-th element of every list field; scalars are shared
        new_config = {}
        for k, v in config.items():
            if isinstance(v, list):
                new_config[k] = v[i]
            else:
                new_config[k] = v
        start_func(new_config)
def eval_prob_one_epoch(dataloader, gpt2, model, length, num_samples,
                        data_process_func, tokenizer=None):
    """Sample sentences for each (e1, e2) entity pair and score them with gpt2.

    For every item the fine-tuned ``model`` generates up to ``num_samples``
    candidate sequences (``sample_sequence_entity``); candidates whose decoded
    text contains both entity surface forms are kept and scored via
    ``get_seq_prob`` on ``gpt2``.

    Returns:
        DataFrame with columns e1, e2, sent, log_prod_prob, loss, sample_sent.
    """
    result = pd.DataFrame(
        columns=['e1', 'e2', 'sent', 'log_prod_prob', 'loss', 'sample_sent'])
    sample_logger = loggers.sample_logger
    # generate/score in chunks of at most 32 samples to bound memory use
    max_sample = 32
    divs = num_samples // max_sample
    saps = [max_sample] * divs
    if sum(saps) < num_samples:
        saps.append(num_samples - divs * max_sample)
    for step, raw in enumerate(dataloader):
        data = data_process_func(raw)
        if data is None:
            continue
        for i in range(len(data['e1'])):
            step_time = time.time()
            e1, e2 = data['e1'][i], data['e2'][i]
            # decoded surface forms, used by the containment filter below
            e1l, e2l = tokenizer.decode(e1.tolist()), tokenizer.decode(
                e2.tolist())
            sents = []
            sent = []
            gen_time = time.time()
            print('sampling {}, {}'.format(e1l, e2l))
            for ns in saps:
                sent_temp = sample_sequence_entity(model,
                                                   length,
                                                   e1,
                                                   e2,
                                                   num_samples=ns,
                                                   top_k=5)
                if sent_temp is None:
                    continue
                sent_temp = sent_temp.cpu()
                sent.append(sent_temp)
            print('gen_time: {}'.format(time.time() - gen_time))
            eval_time = time.time()
            # keep only sampled sequences whose text mentions both entities
            for s in sent:
                for l in range(s.shape[0]):
                    sl = tokenizer.decode(s[l].tolist())
                    if e1l in sl and e2l in sl:
                        sents.append(s[l])
            # NOTE: `sl` is reused — a decoded string above, a count below
            sl = len(sents)
            idx = data['idx'][i]
            res_data = {
                'e1': [idx[0]] * sl,
                'e2': [idx[1]] * sl,
                'sent': sents,
                'log_prod_prob': [],
                'loss': [],
                'sample_sent': [idx[2]] * sl
            }
            if sl > 0:
                # score the kept sentences, again in chunks of max_sample
                divs = sl // max_sample
                paps = [max_sample] * divs
                if sum(paps) < sl:
                    paps.append(sl - divs * max_sample)
                for j, pap in enumerate(paps):
                    temp_data = {
                        'e1': [e1] * pap,
                        'e2': [e2] * pap,
                        'sent': sents[j * max_sample:j * max_sample + pap],
                        'idx': [idx] * pap
                    }
                    probs = get_seq_prob(gpt2,
                                         temp_data,
                                         data_func=process_re_data)
                    # columns 1/2 of get_seq_prob output feed the
                    # log_prod_prob / loss result columns
                    res_data['log_prod_prob'].extend(get_column(probs, 1))
                    res_data['loss'].extend(get_column(probs, 2))
            result = pd.concat([result, pd.DataFrame(res_data)])
            print('eval_time: {}'.format(time.time() - eval_time))
            log_info(
                sample_logger,
                'Sampled {} sents for e1 {}, e2 {}'.format(
                    len(sents), tokenizer.decode(e1.tolist()),
                    tokenizer.decode(e2.tolist())))
            print('tot time: {}, avg: {}'.format(
                time.time() - step_time,
                (time.time() - step_time) / num_samples))
    return result
def start_func(config):
    """Entry point for one configuration: build datasets/model and dispatch.

    ``config`` is a plain-string-keyed dict. Keys are re-keyed onto
    ``ConfigEnums`` members, the run mode's handler (``mode.value.func``) is
    resolved, and the data is split into ``con[ce.loaders]`` chunks that are
    trained/evaluated in turn, feeding each run's model and loss into the next.
    """
    from global_constants import data_process_func
    from global_constants import ModelEnums, DatasetEnums, TrainModesEnums, ConfigEnums
    me, de, tme, ce = ModelEnums, DatasetEnums, TrainModesEnums, ConfigEnums
    # keep only recognised keys, re-keyed by enum member
    config = {ce[k]: v for k, v in config.items() if k in ce.__members__}
    mode = tme[get_config(config, ce.mode)]
    fields = mode.value.fields
    # restrict the working config to the fields this mode declares
    con = {k: get_config(config, k) for k in fields}
    model_type = me[con[ce.model]]
    load_path = get_config(con, ce.load_path)
    save_path = get_config(con, ce.save_path)
    if save_path is not None:
        if save_path[-1] != '/':
            save_path += '/'
        # log directory lives next to the save directory: <parent>/log/
        log_path = list(os.path.split(save_path)[:-1])
        log_path.append('log/')
        log_path = '/'.join(log_path)
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        initial_loggers(log_path)
    prepare_logger, cuda_logger, final_logger = loggers.prepare_logger, loggers.cuda_logger, loggers.final_logger
    json_encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    log_info(
        prepare_logger, 'config loaded:\n' +
        json_encoder.encode({k.name: v for k, v in con.items()}))
    log_info(prepare_logger, 'loading models: ' + load_path)
    tok = tfm.GPT2Tokenizer.from_pretrained(load_path)
    log_info(prepare_logger, 'model loaded')
    log_info(cuda_logger,
             'avaliable cudas {}'.format(torch.cuda.device_count()))
    log_info(cuda_logger, 'Start cuda memory {}'.format(cuda_mem_in_mb()))
    log_info(cuda_logger, 'Allocated model {}'.format(cuda_mem_in_mb()))
    model = model_type.value.from_pretrained(load_path)
    dataset_type = de[con[ce.dataset_type]]
    dataset_class = dataset_type.value.class_type
    # pick the data-processing factory for this (mode, model, dataset) combo
    con[ce.data_func] = data_process_func[mode][model_type] \
        [dataset_type](max_len=con[ce.max_len],
                       batch_size=con[ce.batch_size] if ce.batch_size in con else 1)
    con[ce.dataset_type] = dataset_class
    con[ce.tokenizer] = tok
    con[ce.model] = model
    if ce.gpt2 in con:
        con[ce.gpt2] = tfm.GPT2LMHeadModel.from_pretrained(con[ce.gpt2])
    method = mode.value.func
    # NOTE(review): these handles are opened but never explicitly closed
    # here; presumably the dataset objects consume them — confirm ownership
    con[ce.idx_file] = open(con[ce.idx_path], 'r')
    if ce.ent_file in dataset_type.value.fields:
        con[ce.ent_file] = open(con[ce.ent_path], 'r')
    if ce.sent_file in dataset_type.value.fields:
        con[ce.sent_file] = open(con[ce.sent_path], 'r')
    dataset_parameters = {k.name: con[k] for k in dataset_type.value.fields}
    ids = con[ce.ids]
    if ids == '':
        ids = None
    if ids is not None:
        # ids is a path to a JSON list; split it into one chunk per loader
        with open(ids, 'r') as f:
            ids = json.load(f)
        ids = np.array_split(ids, con[ce.loaders])
        ids = [x.tolist() for x in ids]
    loaders = []
    for i in range(con[ce.loaders]):
        # NOTE(review): other dataset_parameters keys are enum *names*
        # (strings) while this one is the enum member itself — verify that
        # dataset_type(**dataset_parameters) accepts it, and that ids is not
        # None here
        dataset_parameters[ce.ids] = ids[i]
        loaders.append(dataset_type(**dataset_parameters))
    first_len = loaders[0].get_loaded_length()[0]
    all_len = sum([x.get_loaded_length()[0] for x in loaders])
    # evaluation ids follow directly after all training ids
    dataset_parameters[ce.ids] = list(
        range(all_len, all_len + con[ce.eval_len] * first_len))
    con[ce.eval_set] = dataset_type(**dataset_parameters)
    for i in range(con[ce.loaders]):
        new_con = dict(con)
        new_con[ce.dataset] = loaders[i]
        if new_con[ce.dataset] is None:
            break
        new_con[ce.epoch_iter] = len(new_con[ce.dataset]) // (
            new_con[ce.batch_size] if ce.batch_size in new_con else 1)
        # run the mode's handler; the resulting model and loss carry over
        # into the next loader's run
        new_model, loss = method(new_con, i)
        con[ce.model] = new_model
        con[ce.prev_eval_loss] = loss
def single_train(config, index):
    """Train the configured model once, evaluate it, and save artifacts.

    Runs ``train`` with parameters extracted from ``config``, evaluates the
    result on the held-out set, and, when a save path is configured, writes
    model/tokenizer plus loss/perplexity tensors under
    ``<parent>/log/<index>/``.

    Returns:
        (trained model, mean evaluation loss)
    """
    from global_constants import ConfigEnums, main_device
    ce = ConfigEnums
    save_path = config[ce.save_path]
    save_model = config[ce.save_model]
    # train() only checkpoints when a save path is present
    config[ce.save_path] = save_path if save_model else None
    config[ce.model] = config[ce.model].to(main_device)
    final_logger = loggers.final_logger
    train_params = get_params(config, train)
    new_model, train_losses = train(**train_params)
    new_model = get_module_from_parallel(new_model)
    # Bug fix: the held-out set is stored under ce.eval_set (see start_func);
    # ce.evalset is not a ConfigEnums member and failed at attribute lookup.
    config[ce.dataset] = config[ce.eval_set]
    eval_params = get_params(config, evaluate)
    perplexity, perplexities, eval_losses = evaluate(**eval_params)
    # loss-based model refusal is currently disabled; flag kept for the guard
    refuse = False
    loss = torch.mean(eval_losses)
    log_info(final_logger, 'final mean loss {}'.format(loss))
    if save_path is not None:
        if save_model and not refuse:
            new_model = get_module_from_parallel(new_model)
            tokenizer = get_module_from_parallel(config[ce.tokenizer])
            log_info(final_logger, 'saving trained models: ' + save_path)
            new_model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
        log_path = list(os.path.split(save_path)[:-1])
        log_path.append('log')
        log_path.append(str(index) + '/')
        log_path = '/'.join(log_path)
        # makedirs: the intermediate 'log' dir may not exist yet; plain
        # mkdir would raise FileNotFoundError
        os.makedirs(log_path, exist_ok=True)
        log_info(final_logger, 'saving training losses')
        torch.save(train_losses, log_path + 'train_losses.pt')
        log_info(final_logger, 'saving evaluation losses')
        torch.save(eval_losses, log_path + 'eval_losses.pt')
        torch.save(perplexity, log_path + 'perplexity.pt')
        torch.save(perplexities, log_path + 'perplexities.pt')
        log_info(final_logger,
                 'mean eval losses {}'.format(torch.mean(eval_losses)))
        log_info(final_logger, 'All saved')
    return new_model, loss
def train(model, dataset, batch_size, epochs, epoch_iter, learning_rate=1e-2,
          weight_decay=1e-4, save_path=None, from_checkpoint=False,
          continue_train=False, tokenizer=None, data_func=lambda x: x):
    """Train ``model`` on ``dataset`` with AdamW and a linear warmup schedule.

    Optionally resumes model/optimizer/scheduler state from
    ``save_path + 'checkpoint.pt'`` and, when ``save_path`` is given, saves
    model, tokenizer, and a resumable checkpoint after every epoch.

    Returns:
        (DataParallel-wrapped model, 1-D tensor of per-iteration losses)
    """
    loss_logger, train_logger = loggers.loss_logger, loggers.train_logger
    # bias and LayerNorm weights are excluded from weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_params = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = tfm.AdamW(optimizer_params, lr=learning_rate)
    scheduler = tfm.get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=100,
                                                    num_training_steps=epochs *
                                                    epoch_iter)
    losses = []
    if from_checkpoint:
        # NOTE(review): assumes load_checkpoint returns exactly this 6-tuple
        epoch, mini_epoch, model_state, optimizer_state, scheduler_state, loss = load_checkpoint(
            save_path + 'checkpoint.pt')
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        scheduler.load_state_dict(scheduler_state)
        if continue_train:
            # run only the remaining epoch budget — TODO confirm the +1
            epochs = epochs - epoch + 1
    model = nn.DataParallel(model)
    model.train()
    for e in range(epochs):
        data_loader = DataLoader(dataset,
                                 shuffle=False,
                                 batch_size=batch_size,
                                 collate_fn=lambda x: x)
        epoch_start = time.time()
        loss_value, loss = train_one_epoch(data_loader,
                                           model,
                                           optimizer,
                                           scheduler,
                                           data_process_func=data_func,
                                           tok=tokenizer)
        losses.extend(loss_value)
        if save_path is not None:
            # persist weights, tokenizer, and a resumable checkpoint per epoch
            get_module_from_parallel(model).save_pretrained(save_path)
            if tokenizer is not None:
                tokenizer.save_pretrained(save_path)
            check_point = {
                'model': model,
                'epoch': e,
                'optimizer': optimizer,
                'scheduler': scheduler,
                'loss': loss
            }
            save_checkpoint(save_path + 'checkpoint.pt', check_point)
            log_info(loss_logger, 'saved models for in epoch {}'.format(e))
        # slice out just this epoch's losses for the summary statistics
        loss_seg = losses[e * epoch_iter:]
        log_info(train_logger, '-' * 50)
        log_info(
            train_logger, 'Epoch {}, Mean Loss {}, Min Loss {}'.format(
                e, np.mean(loss_seg), np.min(loss_seg)))
        time_diff = time.time() - epoch_start
        log_info(
            train_logger, 'Time {}, Epoch Time {}, Avg Iter Time {}'.format(
                datetime.now().strftime("%d/%m/%Y %H:%M:%S"), time_diff,
                time_diff / epoch_iter))
    return model, torch.tensor(losses)