def test(self):
    for test_iterator, test_output_path in \
            zip(self.test_iterators, self.config['Test']['output_path']):
        create_path(get_path_prefix(test_output_path))
        self.model.eval()
        with torch.no_grad():
            hypotheses = []
            with tqdm(test_iterator) as bar:
                bar.set_description("inference")
                for batch in bar:
                    # [batch size, max len]
                    new_batch = SrcTestBatch(batch.src,
                                             self.vocab['src'].stoi['<pad>'])
                    result = self.model.classify_forward(
                        new_batch.src, new_batch.src_mask, None, train=False)
                    logits = result['emb_classify_logits']
                    # class index with the highest logit
                    prediction = torch.max(logits, dim=-1)[1]
                    # map each predicted index back to its domain name
                    for i in range(prediction.size(0)):
                        predict = prediction[i].item()
                        for domain in self.domain_dict:
                            if self.domain_dict[domain] == predict:
                                hypotheses.append(domain)
        with open(test_output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(hypotheses))
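
# A minimal sketch (not part of the original class): the inner linear scan
# over self.domain_dict can be replaced by inverting the mapping once per
# call. `domain_dict` is assumed to map domain name -> class index, as the
# loop above suggests; `invert_domain_dict` is a hypothetical helper name.
def invert_domain_dict(domain_dict):
    """Build an index -> domain-name lookup table."""
    return {index: domain for domain, index in domain_dict.items()}

# usage sketch:
#   id2domain = invert_domain_dict(self.domain_dict)
#   hypotheses.append(id2domain[prediction[i].item()])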
def main():
    torch.manual_seed(3333)
    np.random.seed(3333)

    config_file_path = sys.argv[1]
    print('read config')
    with open(config_file_path, 'r') as config_file:
        config = yaml.safe_load(config_file)
    # ================================================================================== #
    # set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # set the data fields
    mt_data_loader = MTDataLoader(config)
    mt_data_loader.build_vocab()
    vocab = mt_data_loader.vocab

    model_builder = ModelBuilder()
    model = model_builder.build_model(
        model_name=config['copy_adapter']['model_name'],
        model_config=config['Model'],
        vocab=vocab,
        device=device,
        load_pretrained=False,
        pretrain_path=None)
    model_dict = model.state_dict()

    # load from trained model
    load_model_dict = torch.load(config['copy_adapter']['load_model_path'])
    model_dict.update(load_model_dict)

    # copy adapter parameters according to the copy dict: every parameter of
    # the target adapter is overwritten with the matching source parameter
    for copy_item in config['copy_adapter']['copy_dict']:
        src_adapter_domain = copy_item['src']
        trg_adapter_domain = copy_item['trg']
        for parameter_name in model_dict.keys():
            if trg_adapter_domain in parameter_name:
                src_adapter_parameter_name = parameter_name.replace(
                    trg_adapter_domain, src_adapter_domain)
                # copy value
                model_dict[parameter_name] = model_dict[
                    src_adapter_parameter_name]
                print(parameter_name, src_adapter_parameter_name)

    model.load_state_dict(model_dict)
    create_path(get_path_prefix(config['copy_adapter']['save_path']))
    torch.save(model.state_dict(), config['copy_adapter']['save_path'])
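
# Hedged example of the `copy_adapter` config section this script reads.
# Key names are taken from the code above; all values (paths, domain names)
# are illustrative only:
#
# copy_adapter:
#   model_name: transformer_with_adapter
#   load_model_path: checkpoints/base.pt
#   save_path: checkpoints/base_copied.pt
#   copy_dict:
#     - {src: news, trg: medical}   # initialize the `medical` adapter from `news`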
def main():
    torch.manual_seed(3333)
    np.random.seed(3333)

    config_file_path = sys.argv[1]
    print('read config')
    with open(config_file_path, 'r') as config_file:
        config = yaml.safe_load(config_file)
    # ================================================================================== #
    # set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # set the data fields
    mt_data_loader = MTDataLoader(config)
    mt_data_loader.build_vocab()
    vocab = mt_data_loader.vocab

    model_builder = ModelBuilder()
    model = model_builder.build_model(
        model_name=config['load_multiple_model']['model_name'],
        model_config=config['Model'],
        vocab=vocab,
        device=device,
        load_pretrained=False,
        pretrain_path=None)
    model_dict = model.state_dict()

    # merge the parameters of several partial checkpoints into one model
    load_model_dicts = [
        torch.load(model_path)
        for model_path in config['load_multiple_model']['load_path']
    ]
    check_inconsistent(load_model_dicts)
    for load_model_dict in load_model_dicts:
        model_dict.update(load_model_dict)

    model.load_state_dict(model_dict)
    create_path(get_path_prefix(config['load_multiple_model']['save_path']))
    torch.save(model.state_dict(), config['load_multiple_model']['save_path'])
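
# `check_inconsistent` is defined elsewhere in the repo; a minimal sketch of
# the kind of check it presumably performs (an assumption, not the original
# implementation): flag parameter names that occur in more than one of the
# checkpoints being merged, since later dicts silently win in update().
def check_inconsistent_sketch(state_dicts):
    """Warn when the same parameter name appears in several checkpoints."""
    seen = {}
    for i, state_dict in enumerate(state_dicts):
        for name in state_dict:
            if name in seen:
                print('parameter %s appears in checkpoints %d and %d'
                      % (name, seen[name], i))
            seen[name] = i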
def main():
    print('cuda is available: ', torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    config_file_path = sys.argv[1]
    print('read config')
    with open(config_file_path, 'r') as config_file:
        config = yaml.safe_load(config_file)

    score_ranges = config['score_ranges']
    bert_model_path = config['bert_model_path']
    tokenizer = BertTokenizer.from_pretrained(bert_model_path)

    print('read dataset')
    test_dataset_file = config['test_dataset_file']
    test_dataset = pd.read_csv(test_dataset_file,
                               delimiter='\t',
                               usecols=['essay_set', 'essay_id', 'essay'])

    essay_set = set(test_dataset['essay_set'])
    for set_id, essay_set_id in enumerate(essay_set):
        if config['need_test'][set_id] is False:
            continue
        print('begin set ', essay_set_id, 'processing')

        # encode the prompt of this essay set
        # (here the prompt is always split, with a fixed 300-token segment cap)
        with open(config['essay_prompt'][set_id]) as f:
            prompt = [f.read()]
        prompt_process = process_data(prompt, tokenizer, True, 300)
        prompt_inputs = prompt_process['inputs']
        prompt_sent_count = prompt_process['sent_count']
        prompt_sent_length = prompt_process['sent_length']
        prompt_mask = prompt_process['attention_mask']

        test_dataset_in_set = test_dataset[test_dataset.essay_set ==
                                           essay_set_id]
        test_essays = test_dataset_in_set.essay.values
        test_dataset_process = process_data(test_essays, tokenizer,
                                            config['split_segment'],
                                            config['segment_max_len'])
        ids = test_dataset_in_set.essay_id.values
        test_features = get_feature_from_test_ids(ids, config['test_feature'])

        test_inputs = test_dataset_process['inputs']
        test_sent_count = test_dataset_process['sent_count']
        test_sent_length = test_dataset_process['sent_length']
        test_masks = test_dataset_process['attention_mask']

        test_inputs = torch.tensor(test_inputs).to(device)
        # test_labels = torch.tensor(test_labels).to(device)
        test_masks = torch.tensor(test_masks).to(device)
        test_sent_count = torch.tensor(test_sent_count).to(device)
        test_sent_length = torch.tensor(test_sent_length).to(device)
        test_features = torch.tensor(test_features).to(device)

        prompt_inputs = torch.tensor(prompt_inputs).to(device)
        prompt_mask = torch.tensor(prompt_mask).to(device)
        prompt_sent_count = torch.tensor(prompt_sent_count).to(device)
        prompt_sent_length = torch.tensor(prompt_sent_length).to(device)

        test_data = TensorDataset(test_inputs, test_masks, test_sent_count,
                                  test_sent_length, test_features)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=config['batch_size'][set_id])

        print('begin set ', essay_set_id, 'setup model')
        model = make_model(config, device, set_id)
        model.load_state_dict(torch.load(config['model_save_path'][set_id]))

        print('begin set ', essay_set_id, 'begin test')
        # evaluation
        model.eval()
        with torch.no_grad():
            dev_predict = []
            for batch in test_dataloader:
                batch_inputs, batch_masks, batch_sent_count, \
                    batch_sent_length, batch_feature = batch
                if 'classifier' in config['model']:
                    result = model(batch_inputs, batch_masks,
                                   batch_sent_count, batch_sent_length,
                                   prompt_inputs, prompt_mask,
                                   prompt_sent_count, prompt_sent_length,
                                   batch_feature, None)
                else:
                    result = model(batch_inputs, batch_masks,
                                   batch_sent_count, batch_sent_length,
                                   prompt_inputs, prompt_mask,
                                   prompt_sent_count, prompt_sent_length,
                                   score_ranges[set_id][0],
                                   score_ranges[set_id][1], batch_feature,
                                   None)
                prediction = result['prediction']
                dev_predict.append(prediction)
            dev_predict = torch.cat(dev_predict, dim=0)

        samples = []
        for i in range(len(ids)):
            samples.append({})
            samples[i]['domain1_score'] = np.around(dev_predict[i].item())
            samples[i]['essay_id'] = ids[i]
            samples[i]['essay_set'] = essay_set_id

        create_path(get_path_prefix(config['test_output_path'][set_id]))
        save_to_tsv(samples, config['test_output_path'][set_id])

        del model
        del test_inputs
        del test_masks
        torch.cuda.empty_cache()
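
# Optional post-processing sketch (not in the original script): clamp the
# rounded prediction into the valid range for the set, reusing the same
# `score_ranges` structure the script already reads from the config.
def clamp_score(score, score_range):
    """Clamp a score into [low, high] for one essay set."""
    low, high = score_range
    return max(low, min(high, score))

# usage sketch:
#   samples[i]['domain1_score'] = clamp_score(
#       np.around(dev_predict[i].item()), score_ranges[set_id])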
def main():
    torch.manual_seed(3333)
    np.random.seed(3333)

    config_file_path = sys.argv[1]
    print('read config')
    with open(config_file_path, 'r') as config_file:
        config = yaml.safe_load(config_file)
    # ================================================================================== #
    # set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # set the data fields
    mt_data_loader = MTDataLoader(config)
    mt_data_loader.build_vocab()
    vocab = mt_data_loader.vocab

    model_builder = ModelBuilder()
    model = model_builder.build_model(
        model_name=config['generate_adapter']['model_name'],
        model_config=config['Model'],
        vocab=vocab,
        device=device,
        load_pretrained=False,
        pretrain_path=None)
    model_dict = model.state_dict()

    # load from trained model
    load_model_dict = torch.load(config['generate_adapter']['load_model_path'])
    model_dict.update(load_model_dict)
    model.load_state_dict(model_dict)

    # from xx-generate to xx: materialize the generated adapter weights
    for generate_item in config['generate_adapter']['generate_dict']:
        src_adapter_name = generate_item['src']
        trg_adapter_name = generate_item['trg']
        for i in range(len(model.encoder.layers)):
            print(i)
            w1_weight, w1_bias, w2_weight, w2_bias = model.encoder.layers[
                i].adapters.adapter_layers[src_adapter_name].generate_param(
                    model.encoder.layers[i].adapters.adapter_layers)
            target_adapter = model.encoder.layers[i].adapters.adapter_layers[
                trg_adapter_name]
            target_adapter.w_1.weight.data = w1_weight.transpose(0, 1)
            target_adapter.w_1.bias.data = w1_bias
            target_adapter.w_2.weight.data = w2_weight.transpose(0, 1)
            target_adapter.w_2.bias.data = w2_bias
        # same procedure for the decoder stack (the original iterated over
        # len(model.encoder.layers) here, which only works when both stacks
        # have the same depth)
        for i in range(len(model.decoder.layers)):
            print(i)
            w1_weight, w1_bias, w2_weight, w2_bias = model.decoder.layers[
                i].adapters.adapter_layers[src_adapter_name].generate_param(
                    model.decoder.layers[i].adapters.adapter_layers)
            target_adapter = model.decoder.layers[i].adapters.adapter_layers[
                trg_adapter_name]
            target_adapter.w_1.weight.data = w1_weight.transpose(0, 1)
            target_adapter.w_1.bias.data = w1_bias
            target_adapter.w_2.weight.data = w2_weight.transpose(0, 1)
            target_adapter.w_2.bias.data = w2_bias

    create_path(get_path_prefix(config['generate_adapter']['save_path']))
    model_dict = model.state_dict()
    # copy the sublayer-connection (layer norm) parameters as well
    for generate_item in config['generate_adapter']['generate_dict']:
        src_adapter_name = generate_item['src']
        trg_adapter_name = generate_item['trg']
        for parameter_name in model_dict.keys():
            if trg_adapter_name in parameter_name \
                    and 'sublayer_connection' in parameter_name \
                    and 'generate' not in parameter_name:
                src_adapter_parameter_name = parameter_name.replace(
                    trg_adapter_name, src_adapter_name)
                # copy value
                print(parameter_name, src_adapter_parameter_name)
                model_dict[parameter_name] = model_dict[
                    src_adapter_parameter_name]
    # drop all generator ('generate') parameters before saving
    model_dict = {k: v for k, v in model_dict.items() if 'generate' not in k}
    torch.save(model_dict, config['generate_adapter']['save_path'])
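
# Hedged example of the `generate_adapter` config section this script reads.
# Key names come from the code above; paths and adapter names are
# illustrative only:
#
# generate_adapter:
#   model_name: transformer_with_adapter
#   load_model_path: checkpoints/with_generator.pt
#   save_path: checkpoints/generated.pt
#   generate_dict:
#     - {src: it-generate, trg: it}   # materialize `it` weights from its generator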
def decoding(self):
    for test_iterator, test_output_path, test_ref_file_path in \
            zip(self.test_iterators, self.config['Test']['output_path'],
                self.test_ref_file_paths):
        create_path(get_path_prefix(test_output_path))
        self.model.eval()
        with torch.no_grad():
            hypotheses = []
            with open(test_ref_file_path, 'r', encoding='utf-8') as f:
                references = f.read().splitlines()
            with tqdm(test_iterator) as bar:
                bar.set_description("inference")
                for batch in bar:
                    # [batch size, max len]
                    if self.config['Test']['target_domain'] is None:
                        # no target domain given: predict it with the
                        # embedding-level domain classifier
                        new_batch = SrcTestBatch(
                            batch.src, self.vocab['src'].stoi['<pad>'])
                        result = self.model.classify_forward(
                            new_batch.src, new_batch.src_mask)
                        logits = result['emb_classify_logits']
                        logits = torch.softmax(logits, dim=-1)
                        target_domain_prob, target_domain = torch.max(
                            logits, -1)
                        # fall back to domain 1 for low-confidence predictions
                        for i in range(target_domain_prob.size(0)):
                            if target_domain_prob[i] < 0.90 \
                                    and target_domain[i].item() != 1:
                                print('change')
                                target_domain[i] = 1
                    else:
                        target_domain = self.config['Test']['target_domain']
                    search_results = self.decoding_step(batch, target_domain)
                    prediction = search_results['prediction']
                    for i in range(prediction.size(0)):
                        hypotheses.append(
                            tensor2str(prediction[i], self.vocab['trg']))

        if self.config['Vocab']['use_bpe']:
            hypotheses = [de_bpe(sent) for sent in hypotheses]

        # detokenize with the external script, then score with sacrebleu
        test_initial_output_path = test_output_path + '.initial'
        with open(test_initial_output_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(hypotheses))
        os.system(self.detokenize_script + ' -l ' + self.target_language +
                  ' < ' + test_initial_output_path + ' > ' + test_output_path)
        with open(test_output_path, 'r', encoding='utf-8') as f:
            hypotheses = f.read().splitlines()

        bleu_score = sacrebleu.corpus_bleu(
            hypotheses, [references],
            tokenize=self.config['Test']['tokenize'])
        print('some examples')
        for i in range(3):
            print("hyp: ", hypotheses[i])
            print("ref: ", references[i])
            print()
        print('bleu scores: ', bleu_score)
        print()
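
# A safer alternative to the os.system shell pipeline above, sketched with
# subprocess (the original keeps os.system; this assumes, like the original,
# a Moses-style detokenizer script that reads stdin and writes stdout):
import subprocess

def detokenize(script, language, in_path, out_path):
    """Run `script -l language < in_path > out_path` without a shell."""
    with open(in_path, 'rb') as fin, open(out_path, 'wb') as fout:
        subprocess.run([script, '-l', language], stdin=fin, stdout=fout,
                       check=True)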
def main():
    torch.manual_seed(0)
    print('cuda is available: ', torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    config_file_path = sys.argv[1]
    print('read config')
    with open(config_file_path, 'r') as config_file:
        config = yaml.safe_load(config_file)

    score_ranges = config['score_ranges']
    bert_model_path = config['bert_model_path']
    tokenizer = BertTokenizer.from_pretrained(bert_model_path)

    print('read dataset')
    train_dataset_file = config['train_dataset_file']
    train_dataset = pd.read_csv(
        train_dataset_file,
        delimiter='\t',
        usecols=['essay_set', 'essay_id', 'essay', 'domain1_score'])
    dev_dataset_file = config['dev_dataset_file']
    dev_dataset = pd.read_csv(
        dev_dataset_file,
        delimiter='\t',
        usecols=['essay_set', 'essay_id', 'essay', 'domain1_score'])

    essay_set = set(train_dataset['essay_set'])

    # used to save tensorboard records
    create_path(config['record_path'])
    writer = SummaryWriter(config['record_path'])

    for set_id, essay_set_id in enumerate(essay_set):
        if config['need_training'][set_id] is False:
            continue
        print('begin set ', essay_set_id, 'processing')

        # gather training/validation data from every essay set that is
        # allowed to contribute to this target set
        train_essays = []
        validation_essays = []
        train_ids = []
        validation_ids = []
        train_features = []
        validation_features = []
        train_labels = []
        validation_labels = []
        train_prompt_essays = []
        validation_prompt_essays = []
        train_max_scores = []
        train_min_scores = []
        validation_max_scores = []
        validation_min_scores = []
        train_domain_label = []
        validation_domain_label = []

        current_domain = 0
        for current_set_id, current_essay_set_id in enumerate(essay_set):
            if current_essay_set_id not in config['used_set'][set_id]:
                continue
            print('used set', current_essay_set_id)

            current_train_dataset_in_set = train_dataset[
                train_dataset.essay_set == current_essay_set_id]
            current_validation_dataset_in_set = dev_dataset[
                dev_dataset.essay_set == current_essay_set_id]

            current_train_essays = current_train_dataset_in_set.essay.values
            train_essays.extend(current_train_essays)
            current_validation_essays = \
                current_validation_dataset_in_set.essay.values
            validation_essays.extend(current_validation_essays)

            current_train_ids = current_train_dataset_in_set.essay_id.values
            train_ids.extend(current_train_ids)
            current_validation_ids = \
                current_validation_dataset_in_set.essay_id.values
            validation_ids.extend(current_validation_ids)

            current_train_features = get_feature_from_ids(
                current_train_ids, config['train_feature'])
            train_features.extend(current_train_features)
            current_validation_features = get_feature_from_ids(
                current_validation_ids, config['validation_feature'])
            validation_features.extend(current_validation_features)

            current_train_labels = \
                current_train_dataset_in_set.domain1_score.values
            train_labels.extend(current_train_labels)
            current_validation_labels = \
                current_validation_dataset_in_set.domain1_score.values
            validation_labels.extend(current_validation_labels)

            # each sample carries the score range of the set it came from
            current_train_max_scores = [
                config['score_ranges'][current_set_id][1]
            ] * len(current_train_ids)
            current_train_min_scores = [
                config['score_ranges'][current_set_id][0]
            ] * len(current_train_ids)
            current_validation_max_scores = [
                config['score_ranges'][current_set_id][1]
            ] * len(current_validation_ids)
            current_validation_min_scores = [
                config['score_ranges'][current_set_id][0]
            ] * len(current_validation_ids)
            train_max_scores.extend(current_train_max_scores)
            train_min_scores.extend(current_train_min_scores)
            validation_max_scores.extend(current_validation_max_scores)
            validation_min_scores.extend(current_validation_min_scores)

            # each contributing essay set gets its own domain label
            current_train_domain_label = \
                [current_domain] * len(current_train_ids)
            current_validation_domain_label = \
                [current_domain] * len(current_validation_ids)
            train_domain_label.extend(current_train_domain_label)
            validation_domain_label.extend(current_validation_domain_label)
            current_domain += 1

            with open(config['essay_prompt'][set_id]) as f:
                current_prompt_essays = [f.read()]
            current_train_prompt_essays = \
                current_prompt_essays * len(current_train_ids)
            train_prompt_essays.extend(current_train_prompt_essays)
            current_validation_prompt_essays = \
                current_prompt_essays * len(current_validation_ids)
            validation_prompt_essays.extend(current_validation_prompt_essays)

        train_prompt_process = process_data(train_prompt_essays, tokenizer,
                                            config['split_segment'],
                                            config['segment_max_len'])
        validation_prompt_process = process_data(validation_prompt_essays,
                                                 tokenizer,
                                                 config['split_segment'],
                                                 config['segment_max_len'])
        train_prompt_inputs = train_prompt_process['inputs']
        train_prompt_sent_count = train_prompt_process['sent_count']
        train_prompt_sent_length = train_prompt_process['sent_length']
        train_prompt_mask = train_prompt_process['attention_mask']
        validation_prompt_inputs = validation_prompt_process['inputs']
        validation_prompt_sent_count = validation_prompt_process['sent_count']
        validation_prompt_sent_length = \
            validation_prompt_process['sent_length']
        validation_prompt_mask = validation_prompt_process['attention_mask']

        train_dataset_process = process_data(train_essays, tokenizer,
                                             config['split_segment'],
                                             config['segment_max_len'])
        dev_dataset_process = process_data(validation_essays, tokenizer,
                                           config['split_segment'],
                                           config['segment_max_len'])
        train_inputs = train_dataset_process['inputs']
        train_sent_count = train_dataset_process['sent_count']
        train_sent_length = train_dataset_process['sent_length']
        train_masks = train_dataset_process['attention_mask']
        validation_inputs = dev_dataset_process['inputs']
        validation_sent_count = dev_dataset_process['sent_count']
        validation_sent_length = dev_dataset_process['sent_length']
        validation_masks = dev_dataset_process['attention_mask']

        # move everything to the target device
        train_inputs = torch.tensor(train_inputs).to(device)
        validation_inputs = torch.tensor(validation_inputs).to(device)
        train_labels = torch.tensor(train_labels).to(device)
        validation_labels = torch.tensor(validation_labels).to(device)
        train_masks = torch.tensor(train_masks).to(device)
        validation_masks = torch.tensor(validation_masks).to(device)
        train_sent_counts = torch.tensor(train_sent_count).to(device)
        validation_sent_counts = torch.tensor(validation_sent_count).to(
            device)
        train_sent_length = torch.tensor(train_sent_length).to(device)
        validation_sent_length = torch.tensor(validation_sent_length).to(
            device)
        train_features = torch.tensor(train_features).to(device)
        validation_features = torch.tensor(validation_features).to(device)
        train_max_scores = torch.tensor(train_max_scores).to(device)
        train_min_scores = torch.tensor(train_min_scores).to(device)
        validation_max_scores = torch.tensor(validation_max_scores).to(device)
        validation_min_scores = torch.tensor(validation_min_scores).to(device)
        train_domain_label = torch.tensor(train_domain_label).to(device)
        validation_domain_label = torch.tensor(validation_domain_label).to(
            device)
        train_prompt_inputs = torch.tensor(train_prompt_inputs).to(device)
        train_prompt_mask = torch.tensor(train_prompt_mask).to(device)
        train_prompt_sent_count = torch.tensor(train_prompt_sent_count).to(
            device)
        train_prompt_sent_length = torch.tensor(train_prompt_sent_length).to(
            device)
        validation_prompt_inputs = torch.tensor(validation_prompt_inputs).to(
            device)
        validation_prompt_mask = torch.tensor(validation_prompt_mask).to(
            device)
        validation_prompt_sent_count = torch.tensor(
            validation_prompt_sent_count).to(device)
        validation_prompt_sent_length = torch.tensor(
            validation_prompt_sent_length).to(device)

        train_data = TensorDataset(
            train_inputs, train_masks, train_labels, train_sent_counts,
            train_sent_length, train_features, train_prompt_inputs,
            train_prompt_mask, train_prompt_sent_count,
            train_prompt_sent_length, train_max_scores, train_min_scores,
            train_domain_label)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=config['batch_size'][set_id])

        validation_data = TensorDataset(
            validation_inputs, validation_masks, validation_labels,
            validation_sent_counts, validation_sent_length,
            validation_features, validation_prompt_inputs,
            validation_prompt_mask, validation_prompt_sent_count,
            validation_prompt_sent_length, validation_max_scores,
            validation_min_scores, validation_domain_label)
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(
            validation_data,
            sampler=validation_sampler,
            batch_size=config['batch_size'][set_id])

        print('begin set ', essay_set_id, 'setup model')
        print('model: ', config['model'])
        model = make_model(config, device, set_id)

        # freeze every BERT layer except the pooler and the last (11th) layer,
        # so the optimizer only updates the new parameters
        for name, param in model.named_parameters():
            if 'bert' in name \
                    and 'pooler' not in name \
                    and '11' not in name:
                param.requires_grad = False
        # keep a list (not an exhausted iterator) so it can be reused for
        # gradient clipping after the optimizer consumes it
        parameters = [p for p in model.parameters() if p.requires_grad]
        optimizer = optim.Adam(parameters, lr=0.00005)

        # begin training
        print('begin set ', essay_set_id, 'begin training')
        create_path(get_path_prefix(config['model_save_path'][set_id]))
        epoch = config['epoch_num'][set_id]
        global_step = 0
        best_validation_loss = 100000
        for current_epoch in trange(epoch, desc='Epoch'):
            train_loss = []
            for step, batch in enumerate(tqdm(train_dataloader)):
                model.train()
                batch_inputs, batch_masks, batch_labels, batch_sent_count, \
                    batch_sent_length, batch_features, prompt_inputs, \
                    prompt_mask, prompt_sent_count, prompt_sent_length, \
                    max_scores, min_scores, train_domain_label = batch
                optimizer.zero_grad()
                # DANN-style schedule for the gradient-reversal weight alpha:
                # p is the fraction of training completed so far
                total_batch_count = int(train_inputs.shape[0] /
                                        batch_inputs.shape[0])
                p = (step + current_epoch * total_batch_count) \
                    / epoch / total_batch_count
                alpha = 2. / (1. + np.exp(-10 * p)) - 1
                result = model(batch_inputs, batch_masks, batch_sent_count,
                               batch_sent_length, prompt_inputs, prompt_mask,
                               prompt_sent_count, prompt_sent_length,
                               min_scores, max_scores, batch_features,
                               batch_labels,
                               domain_label=train_domain_label,
                               alpha=alpha)
                result['loss'].backward()
                nn.utils.clip_grad_norm_(parameters, 1.0)
                train_loss.append(result['loss'].item())
                optimizer.step()
                global_step += 1

            # evaluation after each epoch
            model.eval()
            dev_loss = []
            with torch.no_grad():
                for batch in validation_dataloader:
                    batch_inputs, batch_masks, batch_labels, \
                        batch_sent_count, batch_sent_length, \
                        batch_features, prompt_inputs, prompt_mask, \
                        prompt_sent_count, prompt_sent_length, max_scores, \
                        min_scores, domain_label = batch
                    result = model(batch_inputs, batch_masks,
                                   batch_sent_count, batch_sent_length,
                                   prompt_inputs, prompt_mask,
                                   prompt_sent_count, prompt_sent_length,
                                   min_scores, max_scores, batch_features,
                                   batch_labels,
                                   domain_label=None,
                                   alpha=None)
                    dev_loss.append(result['loss'].item())
            dev_loss = np.sum(dev_loss) / len(validation_ids)
            writer.add_scalar(tag='set' + str(essay_set_id) +
                              '_epoch_dev_loss',
                              scalar_value=dev_loss,
                              global_step=current_epoch)
            if dev_loss < best_validation_loss:
                print('got a better result, saving')
                best_validation_loss = dev_loss
                torch.save(model.state_dict(),
                           config['model_save_path'][set_id])
            print('dev loss ', dev_loss)
            writer.add_scalar('set' + str(essay_set_id) +
                              '_epoch_avg_train_loss',
                              scalar_value=np.sum(train_loss) / len(train_ids),
                              global_step=current_epoch)
            print('average train loss: ', np.sum(train_loss) / len(train_ids))
            print()

        del model
        del train_inputs
        del validation_inputs
        del train_masks
        del validation_masks
        del train_prompt_inputs
        del train_prompt_mask
        del validation_prompt_inputs
        del validation_prompt_mask
        torch.cuda.empty_cache()
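
# The alpha computation in the training loop above follows the standard
# gradient-reversal warm-up of DANN (Ganin & Lempitsky, 2015). As a
# standalone sketch, with p the fraction of training completed
# (np is already imported in this script):
def dann_alpha(p, gamma=10.0):
    """Ramp the gradient-reversal weight smoothly from 0 to 1."""
    return 2.0 / (1.0 + np.exp(-gamma * p)) - 1.0

# e.g. dann_alpha(0.0) == 0.0, and dann_alpha(1.0) is close to 1.0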
def main():
    torch.manual_seed(0)
    print('cuda is available: ', torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    config_file_path = sys.argv[1]
    print('read config')
    with open(config_file_path, 'r') as config_file:
        config = yaml.safe_load(config_file)

    score_ranges = config['score_ranges']
    bert_model_path = config['bert_model_path']
    tokenizer = BertTokenizer.from_pretrained(bert_model_path)

    print('read dataset')
    train_dataset_file = config['train_dataset_file']
    train_dataset = pd.read_csv(
        train_dataset_file,
        delimiter='\t',
        usecols=['essay_set', 'essay_id', 'essay', 'domain1_score'])
    dev_dataset_file = config['dev_dataset_file']
    dev_dataset = pd.read_csv(
        dev_dataset_file,
        delimiter='\t',
        usecols=['essay_set', 'essay_id', 'essay', 'domain1_score'])

    essay_set = set(train_dataset['essay_set'])

    # used to save tensorboard records
    create_path(config['record_path'])
    writer = SummaryWriter(config['record_path'])

    for set_id, essay_set_id in enumerate(essay_set):
        if config['need_training'][set_id] is False:
            continue
        print('begin set ', essay_set_id, 'processing')

        # encode the prompt of this essay set
        with open(config['essay_prompt'][set_id]) as f:
            prompt = [f.read()]
        prompt_process = process_data(prompt, tokenizer,
                                      config['split_segment'],
                                      config['segment_max_len'])
        prompt_inputs = prompt_process['inputs']
        prompt_sent_count = prompt_process['sent_count']
        prompt_sent_length = prompt_process['sent_length']
        prompt_mask = prompt_process['attention_mask']

        train_dataset_in_set = train_dataset[train_dataset.essay_set ==
                                             essay_set_id]
        dev_dataset_in_set = dev_dataset[dev_dataset.essay_set ==
                                         essay_set_id]

        # essays
        train_essays = train_dataset_in_set.essay.values
        dev_essays = dev_dataset_in_set.essay.values
        # ids
        train_ids = train_dataset_in_set.essay_id.values
        validation_ids = dev_dataset_in_set.essay_id.values

        train_features = get_feature_from_ids(train_ids,
                                              config['train_feature'])
        validation_features = get_feature_from_ids(
            validation_ids, config['validation_feature'])

        train_labels = train_dataset_in_set.domain1_score.values
        validation_labels = dev_dataset_in_set.domain1_score.values

        train_dataset_process = process_data(train_essays, tokenizer,
                                             config['split_segment'],
                                             config['segment_max_len'])
        dev_dataset_process = process_data(dev_essays, tokenizer,
                                           config['split_segment'],
                                           config['segment_max_len'])
        train_inputs = train_dataset_process['inputs']
        train_sent_count = train_dataset_process['sent_count']
        train_sent_length = train_dataset_process['sent_length']
        train_masks = train_dataset_process['attention_mask']
        validation_inputs = dev_dataset_process['inputs']
        validation_sent_count = dev_dataset_process['sent_count']
        validation_sent_length = dev_dataset_process['sent_length']
        validation_masks = dev_dataset_process['attention_mask']

        # (an earlier variant split train/dev with train_test_split; this
        # version reads separate train and dev files instead)
        train_inputs = torch.tensor(train_inputs).to(device)
        validation_inputs = torch.tensor(validation_inputs).to(device)
        train_labels = torch.tensor(train_labels).to(device)
        validation_labels = torch.tensor(validation_labels).to(device)
        train_masks = torch.tensor(train_masks).to(device)
        validation_masks = torch.tensor(validation_masks).to(device)
        train_sent_counts = torch.tensor(train_sent_count).to(device)
        validation_sent_counts = torch.tensor(validation_sent_count).to(
            device)
        train_sent_length = torch.tensor(train_sent_length).to(device)
        validation_sent_length = torch.tensor(validation_sent_length).to(
            device)
        train_features = torch.tensor(train_features).to(device)
        validation_features = torch.tensor(validation_features).to(device)
        prompt_inputs = torch.tensor(prompt_inputs).to(device)
        prompt_mask = torch.tensor(prompt_mask).to(device)
        prompt_sent_count = torch.tensor(prompt_sent_count).to(device)
        prompt_sent_length = torch.tensor(prompt_sent_length).to(device)

        train_data = TensorDataset(train_inputs, train_masks, train_labels,
                                   train_sent_counts, train_sent_length,
                                   train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=config['batch_size'][set_id])
        validation_data = TensorDataset(validation_inputs, validation_masks,
                                        validation_labels,
                                        validation_sent_counts,
                                        validation_sent_length,
                                        validation_features)
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(
            validation_data,
            sampler=validation_sampler,
            batch_size=config['batch_size'][set_id])

        print('begin set ', essay_set_id, 'setup model')
        print('model: ', config['model'])
        model = make_model(config, device, set_id)

        # freeze every BERT layer except the pooler and the last (11th) layer,
        # so the optimizer only updates the new parameters
        for name, param in model.named_parameters():
            if 'bert' in name \
                    and 'pooler' not in name \
                    and '11' not in name:
                param.requires_grad = False
        # keep a list (not an exhausted iterator) so it can be reused for
        # gradient clipping after the optimizer consumes it
        parameters = [p for p in model.parameters() if p.requires_grad]
        optimizer = optim.Adam(parameters, lr=0.00005)

        # begin training
        print('begin set ', essay_set_id, 'begin training')
        create_path(get_path_prefix(config['model_save_path'][set_id]))
        epoch = config['epoch_num'][set_id]
        global_step = 0
        best_validation_kappa = 0
        for current_epoch in trange(epoch, desc='Epoch'):
            train_loss = []
            for step, batch in enumerate(tqdm(train_dataloader)):
                model.train()
                batch_inputs, batch_masks, batch_labels, batch_sent_count, \
                    batch_sent_length, batch_features = batch
                optimizer.zero_grad()
                if 'classifier' in config['model']:
                    result = model(batch_inputs, batch_masks,
                                   batch_sent_count, batch_sent_length,
                                   prompt_inputs, prompt_mask,
                                   prompt_sent_count, prompt_sent_length,
                                   batch_features, batch_labels)
                else:
                    result = model(batch_inputs, batch_masks,
                                   batch_sent_count, batch_sent_length,
                                   prompt_inputs, prompt_mask,
                                   prompt_sent_count, prompt_sent_length,
                                   score_ranges[set_id][0],
                                   score_ranges[set_id][1], batch_features,
                                   batch_labels)
                result['loss'].backward()
                nn.utils.clip_grad_norm_(parameters, 1.0)
                train_loss.append(result['loss'].item() /
                                  batch_inputs.shape[0])
                optimizer.step()
                global_step += 1

            # evaluation after each epoch
            dev_true = []
            dev_predict = []
            model.eval()
            dev_loss = []
            with torch.no_grad():
                for batch in validation_dataloader:
                    batch_inputs, batch_masks, batch_labels, \
                        batch_sent_count, batch_sent_length, \
                        batch_features = batch
                    if 'classifier' in config['model']:
                        result = model(batch_inputs, batch_masks,
                                       batch_sent_count, batch_sent_length,
                                       prompt_inputs, prompt_mask,
                                       prompt_sent_count, prompt_sent_length,
                                       batch_features, batch_labels)
                    else:
                        result = model(batch_inputs, batch_masks,
                                       batch_sent_count, batch_sent_length,
                                       prompt_inputs, prompt_mask,
                                       prompt_sent_count, prompt_sent_length,
                                       score_ranges[set_id][0],
                                       score_ranges[set_id][1],
                                       batch_features, batch_labels)
                    prediction = result['prediction']
                    dev_loss.append(result['loss'].item())
                    dev_true.append(batch_labels)
                    dev_predict.append(prediction)
            dev_true = torch.cat(dev_true, dim=0)
            dev_predict = torch.cat(dev_predict, dim=0)
            dev_kappa = kappa(y_true=dev_true,
                              y_pred=dev_predict,
                              weights='quadratic')
            writer.add_scalar(tag='set' + str(essay_set_id) +
                              '_epoch_dev_kappa',
                              scalar_value=dev_kappa,
                              global_step=current_epoch)
            dev_loss = np.sum(dev_loss) / validation_ids.shape[0]
            writer.add_scalar(tag='set' + str(essay_set_id) +
                              '_epoch_dev_loss',
                              scalar_value=dev_loss,
                              global_step=current_epoch)
            # save on the best quadratic weighted kappa (an earlier variant
            # saved on the best dev loss instead)
            if dev_kappa > best_validation_kappa:
                print('got a better kappa result, saving')
                best_validation_kappa = dev_kappa
                torch.save(model.state_dict(),
                           config['model_save_path'][set_id])
            print('dev_kappa is', dev_kappa)
            print('dev loss ', dev_loss)
            writer.add_scalar('set' + str(essay_set_id) +
                              '_epoch_avg_train_loss',
                              scalar_value=np.average(train_loss),
                              global_step=current_epoch)
            print('average train loss: ', np.average(train_loss))
            print()

        del model
        del train_inputs
        del validation_inputs
        del train_masks
        del validation_masks
        torch.cuda.empty_cache()
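
# `kappa` is imported from elsewhere in the repo; quadratic weighted kappa
# can equivalently be computed with scikit-learn (a sketch, assuming the
# labels and predictions are integer scores; tensors must be moved to the
# CPU first):
def quadratic_weighted_kappa(dev_true, dev_predict):
    from sklearn.metrics import cohen_kappa_score
    return cohen_kappa_score(dev_true.cpu().numpy(),
                             dev_predict.cpu().numpy(),
                             weights='quadratic')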
def main():
    print('cuda is available: ', torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    config_file_path = sys.argv[1]
    print('read config')
    with open(config_file_path, 'r') as config_file:
        config = yaml.safe_load(config_file)

    score_ranges = config['score_ranges']
    bert_model_path = config['bert_model_path']
    tokenizer = BertTokenizer.from_pretrained(bert_model_path)

    print('read dataset')
    test_dataset_file = config['test_dataset_file']
    test_dataset = pd.read_csv(test_dataset_file,
                               delimiter='\t',
                               usecols=['essay_set', 'essay_id', 'essay'])

    essay_set = set(test_dataset['essay_set'])
    for set_id, essay_set_id in enumerate(essay_set):
        if config['need_test'][set_id] is False:
            continue
        print('begin set ', essay_set_id, 'processing')

        test_dataset_in_set = test_dataset[test_dataset.essay_set ==
                                           essay_set_id]
        test_essays = test_dataset_in_set.essay.values
        test_dataset_process = process_data(test_essays, tokenizer,
                                            config['split_segment'],
                                            config['segment_max_len'])
        ids = test_dataset_in_set.essay_id.values
        test_features = get_feature_from_test_ids(ids, config['test_feature'])

        test_inputs = test_dataset_process['inputs']
        test_sent_count = test_dataset_process['sent_count']
        test_sent_length = test_dataset_process['sent_length']
        test_masks = test_dataset_process['attention_mask']

        test_inputs = torch.tensor(test_inputs).to(device)
        # test_labels = torch.tensor(test_labels).to(device)
        test_masks = torch.tensor(test_masks).to(device)
        test_sent_count = torch.tensor(test_sent_count).to(device)
        test_sent_length = torch.tensor(test_sent_length).to(device)
        test_features = torch.tensor(test_features).to(device)

        # repeat the prompt once per essay so it batches with the essays
        with open(config['essay_prompt'][set_id]) as f:
            test_prompt = [f.read()] * len(ids)
        test_prompt_process = process_data(test_prompt, tokenizer,
                                           config['split_segment'],
                                           config['segment_max_len'])
        test_prompt_inputs = test_prompt_process['inputs']
        test_prompt_sent_count = test_prompt_process['sent_count']
        test_prompt_sent_length = test_prompt_process['sent_length']
        test_prompt_mask = test_prompt_process['attention_mask']

        test_prompt_inputs = torch.tensor(test_prompt_inputs).to(device)
        test_prompt_mask = torch.tensor(test_prompt_mask).to(device)
        test_prompt_sent_count = torch.tensor(test_prompt_sent_count).to(
            device)
        test_prompt_sent_length = torch.tensor(test_prompt_sent_length).to(
            device)

        test_max_scores = [config['score_ranges'][set_id][1]] * len(ids)
        test_min_scores = [config['score_ranges'][set_id][0]] * len(ids)
        test_max_scores = torch.tensor(test_max_scores).to(device)
        test_min_scores = torch.tensor(test_min_scores).to(device)

        test_data = TensorDataset(test_inputs, test_masks, test_sent_count,
                                  test_sent_length, test_features,
                                  test_prompt_inputs, test_prompt_mask,
                                  test_prompt_sent_count,
                                  test_prompt_sent_length, test_max_scores,
                                  test_min_scores)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=config['batch_size'][set_id])

        print('begin set ', essay_set_id, 'setup model')
        model = make_model(config, device, set_id)
        model.load_state_dict(torch.load(config['model_save_path'][set_id]))

        print('begin set ', essay_set_id, 'begin test')
        # evaluation
        model.eval()
        with torch.no_grad():
            dev_predict = []
            for batch in test_dataloader:
                batch_inputs, batch_masks, batch_sent_count, \
                    batch_sent_length, batch_feature, prompt_inputs, \
                    prompt_mask, prompt_sent_count, prompt_sent_length, \
                    batch_max_scores, batch_min_scores = batch
                result = model(batch_inputs, batch_masks, batch_sent_count,
                               batch_sent_length, prompt_inputs, prompt_mask,
                               prompt_sent_count, prompt_sent_length,
                               batch_min_scores, batch_max_scores,
                               batch_feature, None, None, None)
                prediction = result['prediction']
                prediction = prediction[:, 0]
                dev_predict.append(prediction)
            dev_predict = torch.cat(dev_predict, dim=0)
            dev_predict = dev_predict.tolist()

        # gap between the expected mean score of the set and the predicted
        # mean; the post-processing that used it (a damped mean shift,
        # smoothing, and clamping to the score range) is currently disabled
        predict_average = np.average(dev_predict)
        gap = config['mean_score'][set_id] - predict_average

        samples = []
        for i in range(len(ids)):
            samples.append({})
            samples[i]['domain1_score'] = dev_predict[i]
            samples[i]['essay_id'] = ids[i]
            samples[i]['essay_set'] = essay_set_id

        create_path(get_path_prefix(config['test_output_path'][set_id]))
        save_to_tsv(samples, config['test_output_path'][set_id])

        del model
        del test_inputs
        del test_masks
        torch.cuda.empty_cache()
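
# A minimal sketch of the disabled mean-shift post-processing referenced
# above: shift every prediction by a damped version of `gap` (the original
# commented-out code used |gap|^0.666 with the sign of gap preserved).
import math

def shift_toward_mean(predictions, gap, exponent=0.666):
    """Shift predictions by |gap|^exponent, preserving the sign of gap."""
    damped = math.copysign(math.pow(abs(gap), exponent), gap)
    return [p + damped for p in predictions]

# usage sketch:
#   dev_predict = shift_toward_mean(dev_predict, gap)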