def load_tag_data(config, data_type):
    train_data, dev_data, test_data = build_dataset(config)
    if data_type == 'train':
        tag_data = build_iterator(train_data, config)
    elif data_type == 'dev':
        tag_data = build_iterator(dev_data, config)
    elif data_type == 'test':
        tag_data = build_iterator(test_data, config)
    else:
        # Without this branch an unknown data_type raised NameError below.
        raise ValueError("data_type must be 'train', 'dev' or 'test'")
    return tag_data
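A minimal call sketch for load_tag_data; the model module and the 'THUCNews' dataset name mirror the scripts later in this section and are assumptions here, not part of the function itself:

x = import_module('models.bert')        # assumed model module, as in the scripts below
config = x.Config('THUCNews')           # assumed Config with the usual data paths
dev_iter = load_tag_data(config, 'dev')  # iterator over dev batches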
def Pred(model, config):
    print("Testing...")
    with open("../user_data/tmp_data/B_nli_256.txt") as f:
        test_nli = json.load(f)
    with open("../user_data/tmp_data/B_tnews_256.txt") as f:
        test_news = json.load(f)
    with open("../user_data/tmp_data/B_emotion_256.txt") as f:
        test_emotion = json.load(f)
    print(len(test_nli), len(test_news), len(test_emotion))
    test_iter1 = build_iterator(test_nli, config)
    test_iter2 = build_iterator(test_news, config)
    test_iter3 = build_iterator(test_emotion, config)
    test(config, model, test_iter1, test_iter2, test_iter3)
    print("End..")
def wunaijiade(self):
    dataset = 'THUCNews'  # dataset directory
    model_name = args.model  # e.g. 'bert'
    x = import_module('models.' + model_name)
    config = x.Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make every run reproducible
    start_time = time.time()
    print("Loading data...")
    test_data = build_dataset(config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
    # train
    model = x.Model(config)  # .to(config.device)
    aa = train(config, model, test_iter)
    return aa
def main():
    dataset = 'C:/Users/USER/Documents/Capstone_Project/datalogs'
    x = import_module('models.{}'.format('RNN'))
    config = x.Config(dataset)
    train_data, dev_data, test_data = build_dataset(config, True, False)
    dev_iter = build_iterator(dev_data, config, do_dev=True)
    vocab = pickle.load(open(config.vocab_path, 'rb'))
    re_vocab = {token_id: token for token, token_id in vocab.items()}
    queries = []  # renamed from `x`, which shadowed the module imported above
    real_y = []
    for i, (Queries, Responses) in enumerate(dev_iter):
        queries += [sentence(q, re_vocab) for q in Queries[0].cpu().tolist()]
        real_y += [sentence(r, re_vocab) for r in Responses[0].cpu().tolist()]
    data = []
    for i in range(len(queries)):
        data.append('Pair {}'.format(i + 1))
        data.append('Query: {}'.format(queries[i]))
        data.append('Original Response: {}'.format(real_y[i]))
        data.append(' ')
    data = pd.DataFrame(data)
    data.to_csv('results_token.txt', sep='\t', encoding='utf8',
                header=False, index=False)
def Test(model_name):
    dataset = 'C:/Users/USER/Documents/Capstone_Project/datalogs'  # dataset directory
    p = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if p not in sys.path:
        sys.path.append(p)
    do_train = False
    do_test = True
    x = import_module('models.{}'.format(model_name))
    config = x.Config(dataset)
    np.random.seed(156)
    torch.cuda.manual_seed_all(1024)
    torch.backends.cudnn.deterministic = True
    start_time = time.time()
    print('Loading data...')
    train_data, dev_data, test_data = build_dataset(config, do_train, do_test)
    if do_test:
        test_iter = build_iterator(test_data, config, do_dev=True)
    time_dif = get_time_dif(start_time)
    model = x.Seq2SeqModel(config).to(config.device)
    for param in model.parameters():  # renamed from `p`, which shadowed the path above
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    if do_test:
        test(config, model, test_iter)
def build(hidden_size, batch_size, max_len, cuda):
    bidirectional = False
    model_name = 'bert'
    x = import_module('models.' + model_name)
    config = x.Config(batch_size)
    train_data = build_dataset(config)
    train_dataloader = build_iterator(train_data, config)
    val_data, test_data = build_dataset_eval(config)
    val_dataloader = build_iterator_eval(val_data, config)
    test_dataloader = build_iterator_eval(test_data, config)
    encoder = x.Model(config).to(config.device)
    decoder = DecoderRNN(len(config.tokenizer.vocab),
                         max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2,
                         use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=config.tokenizer.convert_tokens_to_ids([SEP])[0],
                         sos_id=config.tokenizer.convert_tokens_to_ids([CLS])[0])
    seq2seq = Seq2seq(encoder, decoder)
    if cuda:
        seq2seq.cuda()
    optimizer = torch.optim.Adam(params=seq2seq.parameters(), lr=1e-3)
    Tensor = torch.cuda.LongTensor if cuda else torch.LongTensor
    # `reduce=False` is deprecated; `reduction='none'` keeps per-element losses.
    loss_fun = torch.nn.NLLLoss(reduction='none')
    return (seq2seq, optimizer, Tensor, train_dataloader, val_dataloader,
            test_dataloader, loss_fun, config)
def build_dataset(self, path):
    # Load the prediction set: [(tokens, int(id), seq_len, mask)]
    config = self.config
    print('\nloading predict set ...')
    predict_data = self.load_dataset(path, config.pad_size)
    print('Done!')
    self.predict_iter = build_iterator(predict_data, config)
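A short usage sketch for the method above; `Predictor` and the input path are hypothetical stand-ins for the owning class and a real file:

predictor = Predictor()                      # hypothetical owner of self.config
predictor.build_dataset('./data/predict.txt')  # hypothetical input file
for texts, labels in predictor.predict_iter:
    ...  # run inference batch by batch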
def predict_text(self, input_text):
    label_dict = {0: "other", 1: "weather"}
    model_in = self.load_dataset(input_text, self.vocab)
    test_iter = build_iterator([model_in], self.config)
    with torch.no_grad():
        for texts, labels in test_iter:
            outputs = self.model(texts)
            label = torch.max(outputs.data, 1)[1].cpu().numpy()[0]
            return label_dict[label]
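A quick usage sketch; `WeatherClassifier` is a hypothetical name for whatever class owns predict_text together with its model, vocab, and config:

clf = WeatherClassifier()  # hypothetical wrapper class
print(clf.predict_text("will it rain tomorrow"))  # -> 'weather' or 'other'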
def build(hidden_size, batch_size, max_len, cuda):
    bidirectional = False
    model_name = 'bert'
    x = import_module('models.' + model_name)
    config = x.Config(batch_size)
    train_data = build_dataset(config)
    train_dataloader = build_iterator(train_data, config)
    val_data, test_data = build_dataset_eval(config)
    val_dataloader = build_iterator_eval(val_data, config)
    test_dataloader = build_iterator_eval(test_data, config)
    encoder = x.Model(config).to(config.device)
    decoder = DecoderRNN(len(config.tokenizer.vocab),
                         max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2,
                         use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=config.tokenizer.convert_tokens_to_ids([SEP])[0],
                         sos_id=config.tokenizer.convert_tokens_to_ids([CLS])[0])
    decoder = decoder.to(config.device)
    seq2seq = Seq2seq(encoder, decoder)
    if cuda:
        seq2seq.cuda()
    # Apply weight decay to everything except biases and LayerNorm parameters.
    param_optimizer = list(seq2seq.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    print(len(train_data))
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.03,
                         t_total=len(train_data) * config.num_epochs)
    Tensor = torch.cuda.LongTensor if cuda else torch.LongTensor
    # `reduce=False` is deprecated; `reduction='none'` keeps per-element losses.
    loss_fun = torch.nn.NLLLoss(reduction='none')
    return (seq2seq, optimizer, Tensor, train_dataloader, val_dataloader,
            test_dataloader, loss_fun, config)
async def delete_all(self, ctx, mode_pattern):
    """Deletes all database entries

    This should be used with a lot of caution. There is no way to
    retract the deleted entries.

    Example: !delete_all active"""
    iterator = build_iterator(modes=mode_pattern)
    for mode, in iterator:
        delete_all(mode)
        await ctx.send(f'Emptied the database for {mode}.')
async def delete_me(self, ctx, mode_pattern):
    """Deletes the calling user from the database

    This should be used with a lot of caution. There is no way to
    retract your deleted entry.

    Example: !delete_me active"""
    iterator = build_iterator(modes=mode_pattern)
    for mode, in iterator:
        delete_user(ctx.author.id, mode)
    await ctx.send(
        f'Deleted the user {ctx.author.display_name} from {mode_pattern}.')
async def empty_me(self, ctx, mode_pattern, day_pattern):
    """Empties the time intervals of the calling user

    This can be used to reset your time intervals on certain days, but be
    careful. There is no way to retract the deleted information.

    Example: !empty_me active weekdays"""
    iterator = build_iterator(modes=mode_pattern, days=day_pattern)
    for mode, day in iterator:
        empty_user(ctx.author.id, mode, day)
    await ctx.send(
        f'Emptied time intervals of {ctx.author.display_name} on {long_name(day_pattern)} in {mode_pattern}.'
    )
async def show_all(self, ctx, mode_pattern, day_pattern):
    """Prints currently registered time intervals.

    Reads the database for the given mode(s) and returns a formatted
    version of the time intervals of all the server's users on the
    given day(s).

    Example: !show_all active weekdays"""
    iterator = build_iterator(modes=mode_pattern, days=day_pattern)
    output = '**All currently registered time intervals:**\n'
    for mode, day in iterator:
        output += all_intervals_format(mode, day)
    await ctx.send(output)
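The bot commands above and below all rely on build_iterator expanding mode/day patterns into tuples. A minimal sketch of that contract; the expansion tables here are hypothetical placeholders, not the bot's real mapping:

from itertools import product

def build_iterator(modes=None, days=None):
    # Hypothetical pattern tables; the real expansion lives in the bot's utils.
    MODES = {'all': ['active', 'profile']}
    DAYS = {'weekdays': ['mon', 'tue', 'wed', 'thu', 'fri'],
            'weekend': ['sat', 'sun'],
            'all': ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']}
    axes = []
    if modes is not None:
        axes.append(MODES.get(modes, [modes]))
    if days is not None:
        axes.append(DAYS.get(days, [days]))
    # Yields 1-tuples or 2-tuples, matching the `for mode, in ...` and
    # `for mode, day in ...` unpacking used by the commands.
    yield from product(*axes)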
def evaluate(config, model, data_set, test=False):
    model.eval()
    data_iter = build_iterator(data_set, config)
    embeddings = []
    with torch.no_grad():
        for batch_data, label in data_iter:
            outputs = model(batch_data)
            embeddings.append(outputs.cpu().detach().numpy())
    return np.concatenate(embeddings, 0)
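Note that evaluate above returns the model's stacked outputs for the whole dataset rather than a loss or accuracy, despite its name. A hedged usage sketch (the shape is illustrative only):

dev_embeddings = evaluate(config, model, dev_data)
print(dev_embeddings.shape)  # (num_dev_examples, hidden_dim), e.g. (1000, 768)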
async def empty_all(self, ctx, mode_pattern, day_pattern):
    """Empties time intervals of all users

    This can be used to reset everything, but be careful. There is no way
    to retract the deleted information.

    Example: !empty_all active weekdays"""
    iterator = build_iterator(modes=mode_pattern, days=day_pattern)
    for mode, day in iterator:
        empty_all(mode, day)
    await ctx.send(
        f'Emptied the time intervals for everyone on {long_name(day_pattern)} in {mode_pattern}.'
    )
def predict(textList):
    # Split each document into chunks, remembering how many chunks each
    # original document produced.
    listmap = {}
    newTextList = []
    for i, text in enumerate(textList):
        tmpList = get_split_text(text)
        newTextList.extend(tmpList)
        listmap[i] = len(tmpList)
    print("listmap:", listmap)
    print("new_predict_all len:", len(newTextList))
    test_data = load_dataset(newTextList, config.pad_size)
    test_iter = build_iterator(test_data, config)
    predict_all_int = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in test_iter:
            outputs = model(texts)
            predict = torch.max(outputs.data, 1)[1].cpu().numpy()
            predict_all_int = np.append(predict_all_int, predict)
    predict_all_s = [class1.get(p) for p in predict_all_int]
    # Majority vote over each document's chunk-level predictions.
    new_predict_all = []
    index = 0
    for i in range(len(textList)):
        num = listmap[i]
        tmpPredict = predict_all_s[index:index + num]
        index += num
        print("tmpPredict:", tmpPredict)
        new_predict_all.extend(most_common(tmpPredict))
    return new_predict_all
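most_common above is referenced but not defined in this snippet. A minimal sketch, under the assumption that it returns the majority chunk label as a one-element list (which is what the extend call implies):

from collections import Counter

def most_common(labels):
    # Majority label of one document's chunk predictions, as a 1-element list.
    return [Counter(labels).most_common(1)[0][0]]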
async def when(self, ctx, day_pattern):
    """Calculates the common time intervals of all members.

    This computes the intersection of the time intervals in active mode
    for all members of the server on the given day(s).

    Example: !when weekend"""
    iterator = build_iterator(days=day_pattern)
    output = '**Common time intervals for all members:**\n'
    for day, in iterator:
        common_interval = get_common_interval(day)
        if is_empty(common_interval):
            output += f'\t**{long_name(day)}**: No common time interval.\n'
        else:
            output += f'\t**{long_name(day)}**: {time_intervals_to_str_readable(common_interval)}\n'
    await ctx.send(output)
async def to_profile(self, ctx, day_pattern):
    """Sets the time intervals of the calling user to his/her profile

    After this call, the time intervals in active, which are used to
    compute common time intervals, are set to equal the ones in profile.
    Be careful, the time intervals in active cannot be restored.

    Example: !to_profile all"""
    iterator = build_iterator(days=day_pattern)
    user_id = ctx.author.id
    if not in_database(user_id, 'profile'):
        await ctx.send('You have no registered times in profile.')
        return
    for day, in iterator:
        interval = get_time_interval(user_id, day, 'profile')
        set_time_interval(user_id, day, 'active', interval)
    await ctx.send(
        f'Set the time intervals for {ctx.author.display_name} on {long_name(day_pattern)} to his/her profile.'
    )
async def show_me(self, ctx, mode_pattern, day_pattern):
    """Prints currently registered time intervals for the calling user.

    Reads the database for the given mode(s) and returns a formatted
    version of the time intervals of the calling user on the given day(s).

    Example: !show_me profile fri"""
    iterator = build_iterator(modes=mode_pattern, days=day_pattern)
    user_id = ctx.author.id
    output = ''  # initialized here so single-day patterns like 'fri' still work
    for mode, day in iterator:
        if day == 'mon':
            # A new mode block starts on Monday: check the user and reset
            # the header for this mode.
            if not in_database(user_id, mode):
                await ctx.send(
                    f'You have no registered time intervals in {mode}.')
                return
            output = f'**Time intervals for {ctx.author.display_name} in {mode}:**\n'
        interval = get_time_interval(user_id, day, mode)
        output += f'\t**{long_name(day)}:** {time_intervals_to_str_readable(interval)}\n'
    await ctx.send(output)
# parser = argparse.ArgumentParser(description='Chinese Text Classification')
# parser.add_argument('--model', type=str, required=True, help='choose a model: Bert, ERNIE')
# args = parser.parse_args()

if __name__ == '__main__':
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    dataset = 'THUCNews'  # dataset directory
    # model_name = args.model  # bert
    model_name = 'bert'
    x = import_module('models.' + model_name)
    config = x.Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make every run reproducible
    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)
x = import_module('models.' + model_name)
# Configuration parameters
config = x.Config(dataset)

# The seeds below are fixed so that every run produces the same results
np.random.seed(1)              # seed the CPU RNG used by NumPy
torch.manual_seed(1)           # seed PyTorch's CPU RNG
torch.cuda.manual_seed_all(1)  # seed the RNGs of all GPUs
# With this flag True, cuDNN always returns the same (default, deterministic)
# convolution algorithm.
torch.backends.cudnn.deterministic = True

start_time = time.time()
print("Loading data...")
OCNLI_train, OCNLI_dev, OCEMOTION_train, OCEMOTION_dev, TNEWS_train, TNEWS_dev = \
    build_dataset(config, mode='train')
OCNLI_train_iter = build_iterator(OCNLI_train, config)
OCEMOTION_train_iter = build_iterator(OCEMOTION_train, config)
TNEWS_train_iter = build_iterator(TNEWS_train, config)
OCNLI_dev_iter = build_iterator(OCNLI_dev, config)
OCEMOTION_dev_iter = build_iterator(OCEMOTION_dev, config)
TNEWS_dev_iter = build_iterator(TNEWS_dev, config)
time_dif = get_time_dif(start_time)

# train
model = x.Model(config).to(config.device)
train(config, model, OCNLI_train_iter, OCNLI_dev_iter, OCEMOTION_train_iter,
      OCEMOTION_dev_iter, TNEWS_train_iter, TNEWS_dev_iter)
    final_result.append(dic)

    # Write the results out as a JSON-lines file
    import json
    with open(output_path, 'w') as f:
        for each in final_result:
            json_str = json.dumps(each)
            f.write(json_str)
            f.write('\n')


if __name__ == '__main__':
    dataset = '.'  # dataset directory
    model_name = 'bert'
    # Dynamically import the model module
    x = import_module('models.' + model_name)
    # Configuration parameters
    config = x.Config(dataset)
    model = x.Model(config).to(config.device)
    OCNLI_test, OCEMOTION_test, TNEWS_test = build_dataset(config, mode='test')
    OCNLI_test_iter = build_iterator(OCNLI_test, config)
    OCEMOTION_test_iter = build_iterator(OCEMOTION_test, config)
    TNEWS_test_iter = build_iterator(TNEWS_test, config)
    # Submissions for the three tasks
    submit_test(config, model, OCNLI_test_iter, config.OCLI_submit_output_path, 0)
    submit_test(config, model, OCEMOTION_test_iter, config.OCEMOTION_submit_output_path, 1)
    submit_test(config, model, TNEWS_test_iter, config.TNEWS_submit_output_path, 2)
# start_time = time.time()
# print("Loading data...")
# train_data, dev_data, test_data = build_dataset(config)
# train_iter = build_iterator(train_data, config)
# dev_iter = build_iterator(dev_data, config)
# test_iter = build_iterator(test_data, config)
# time_dif = get_time_dif(start_time)
# print("Time usage:", time_dif)

# Run inference over folds 3 and 4 of the 5-fold split
for i in range(3, 5):
    config.train_path = dataset + '/data/fold5/cvfold' + str(i) + '_train.txt'
    config.dev_path = dataset + '/data/fold5/cvfold' + str(i) + '_dev.txt'
    config.test_path = dataset + '/data/fold5/cv_valid.txt'
    config.save_path = (dataset + '/saved_dict/' + config.model_name +
                        '512-5fold-' + str(i) + '.bin')
    submit_data = build_dataset(config)
    submit_iter = build_iterator(submit_data, config)
    model = x.Model(config).to(config.device)
    test(config, model, submit_iter, 'bertdrop_submitb_' + str(i) + '.npy')
    # Alternative runs kept for reference:
    # test(config, model, test_iter, 'bertRNN_valid_' + str(i) + '.npy')
    # test(config, model, dev_iter, 'bertRNN_train_' + str(i) + '.npy')
    # model.load_state_dict(torch.load(config.save_path))
    # train(config, model, train_iter, dev_iter, test_iter)
do_test = args.do_test
if not (do_train or do_test):
    raise ValueError('At least one of `do_train` or `do_test` must be True.')
x = import_module('models.{}'.format(model_name))
config = x.Config(dataset)
np.random.seed(156)
torch.cuda.manual_seed_all(1024)
torch.backends.cudnn.deterministic = True
start_time = time.time()
print('Loading data...')
train_data, dev_data, test_data = build_dataset(config, do_train, do_test)
if do_train:
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config, do_dev=True)
if do_test:
    test_iter = build_iterator(test_data, config, do_dev=True)
time_dif = get_time_dif(start_time)
model = x.Seq2SeqModel(config).to(config.device)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
if do_train:
    train(config, model, train_iter, dev_iter)
if do_test:
    test(config, model, test_iter)
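For reference, a minimal argparse setup consistent with the args.* attributes used above; the flag names are inferred from those attributes, not taken from the original script:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True)  # inferred flag
parser.add_argument('--do_train', action='store_true')   # inferred flag
parser.add_argument('--do_test', action='store_true')    # inferred flag
args = parser.parse_args()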
# dataset = 'data/Intention2_V2'  # dataset directory
# dataset = 'data/Intention135'   # dataset directory
dataset = args.data_dir  # dataset directory
task_name = args.task_name
# model_dir = 'data/Intention2/saved_dict/bert.ckpt'
model_name = args.model  # e.g. 'bert'
x = import_module('models.' + model_name)
config = x.Config(dataset, task_name)
# config.save_path = model_dir
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # make every run reproducible
start_time = time.time()
print("Loading data...")
train_data, dev_data, test_data = build_dataset(config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# predict
model = x.Model(config).to(config.device)
# train(config, model, train_iter, dev_iter, test_iter)
out_result_file = dataset + '/result/model_name.result.txt'
predict(config, model, test_iter, out_file=out_result_file)
from utils import build_dataset, build_iterator, get_time_dif, load_vocabulary, build_vocab

torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True
start_time = time.time()
print('Loading data...')
build_vocab(config.input_file, os.path.join(config.vocab_path, 'in_vocab'))
build_vocab(config.slot_file, os.path.join(config.vocab_path, 'slot_vocab'))
build_vocab(config.intent_file, os.path.join(config.vocab_path, 'intent_vocab'),
            pad=False, unk=False)
in_vocab = load_vocabulary(os.path.join(config.vocab_path, 'in_vocab'))
slot_vocab = load_vocabulary(os.path.join(config.vocab_path, 'slot_vocab'))
intent_vocab = load_vocabulary(os.path.join(config.vocab_path, 'intent_vocab'))
train_data, dev_data, test_data = build_dataset(in_vocab['vocab'],
                                                slot_vocab['vocab'],
                                                intent_vocab['vocab'])
train_iter = build_iterator(train_data)
dev_iter = build_iterator(dev_data)
test_iter = build_iterator(test_data)
time_dif = get_time_dif(start_time)
print('time usage:', time_dif)
config.n_vocab = len(in_vocab['vocab'])
x = import_module(model_name)
model = x.Model(config).to(torch.device('cuda'))
init_network(model)
print(model.parameters)
train(config, model, train_iter, dev_iter, test_iter)
# test(config, model, test_iter)
# Toggle the config's STLR flag
if not hasattr(config, 'STLR'):
    setattr(config, 'STLR', True)
else:
    setattr(config, 'STLR', False)

# random initialize
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # make every run reproducible

start_time = time.time()
print("Loading data...")
if not os.path.exists('./datasets'):
    os.makedirs('./datasets')
build_dataset(config)
train_iter = build_iterator('train', config)
dev_iter = build_iterator('dev', config)
test_iter = build_iterator('test', config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# train
model = model.Model(config).to(config.device)
train(config, model, train_iter, dev_iter, test_iter, save_loss=save_loss_flag)
def main():
    config = DCMN_Config()
    eval_seq_dataset, eval_dcmn_dataset = build_dataset_eval(config)
    eval_dataloader = build_iterator(eval_seq_dataset, eval_dcmn_dataset, config)
    seq2seq, seq_optimizer, seq_scheduler, seq_loss_fun = build_seq2seq(
        config, 768, config.no_cuda)
    dcmn = BertForMultipleChoiceWithMatch.from_pretrained(
        config.bert_model, num_choices=config.num_choices)
    dcmn.to(config.dcmn_device)
    # Restore the best checkpoints for both models.
    save_file_best = torch.load('./backup/bert/best_save.data',
                                map_location=torch.device('cuda:2'))
    seq2seq.load_state_dict(save_file_best['para'])
    save_file_best = torch.load('./backup/dcmn/best_save.data',
                                map_location=torch.device('cuda:2'))
    dcmn.load_state_dict(save_file_best['dcmn_para'])
    dcmn.eval()
    seq2seq.eval()

    results = []
    seq_srcs_all = []
    for step, (seq_batches, dcmn_batches) in enumerate(
            tqdm(eval_dataloader, desc="Evaluating")):
        seq_srcs, seq_tars, cudics, k_cs = [[_[__] for _ in seq_batches]
                                            for __ in range(4)]
        outs = []
        if len(dcmn_batches) > 0:
            # Score the DCMN candidates in batch_size chunks.
            for p in range(0, len(dcmn_batches), config.batch_size):
                dcmn_batches_smaller = dcmn_batches[p:p + config.batch_size]
                input_ids, input_mask, segment_ids, doc_len, ques_len, option_len, labels = [
                    torch.LongTensor([_[__] for _ in dcmn_batches_smaller
                                      ]).to(config.dcmn_device)
                    for __ in range(7)
                ]
                with torch.no_grad():
                    logits = dcmn(input_ids, segment_ids, input_mask,
                                  doc_len, ques_len, option_len)
                outs_smaller = np.argmax(logits.detach().cpu().numpy(), axis=1)
                outs.extend(outs_smaller)
        seq_srcs = remove_unk(seq_srcs, outs, k_cs)
        seq_srcs_all.extend(seq_srcs)
        src_ids, src_masks = seq_tokenize(seq_srcs, config)
        decoder_outputs, decoder_hidden, ret_dict = seq2seq(
            [src_ids, src_masks], src_ids, 0.0, False)
        symbols = ret_dict['sequence']
        symbols = torch.cat(symbols, 1).data.cpu().numpy()
        results.extend(decode_sentence(symbols, config))

    with open('./outs/outs-new.pkl', 'wb') as f:
        pickle.dump(results, f)
    # Detokenize: strip mask tokens and rejoin punctuation.
    sentences = []
    for words in results:
        words = words.replace('[MASK] ', '')
        words = words.replace(' - ', '-').replace(' . ', '.').replace(' / ', '/')
        sentences.append(words.strip())
    with open('./result/tmp.out.txt', 'w', encoding='utf-8') as f:
        f.writelines([x.lower() + '\n' for x in sentences])
    bleu, hit, com, ascore = get_score()
    print('bleu:{}, hit:{}, com:{}, ascore:{}'.format(bleu, hit, com, ascore))
test_inputs = load_dataset(df_test, test_categories, config.pad_size)
gkf = GroupKFold(n_splits=5).split(X=df_train.q2, groups=df_train.id)
valid_preds = []
test_preds = []
print("A total of " + str(df_train.shape[0]) + " training sentences")
oof = np.zeros((len(df_train), 1))
for fold, (train_idx, valid_idx) in enumerate(gkf):
    model = x.Model(config).to(config.device)
    print("Loading " + str(fold + 1) + " fold data...")
    train_idx = shuffle(train_idx)
    train_inputs = [datas[i] for i in train_idx]
    valid_inputs = [datas[i] for i in valid_idx]
    train_iter = build_iterator(train_inputs, config.batch_size, config)
    dev_iter = build_iterator(valid_inputs, config.test_batch, config)
    test_iter = build_iterator(test_inputs, config.test_batch, config)
    # Collect the ground-truth labels of this fold's validation split.
    valid_outputs = np.array([], dtype=int)
    for d, (text, labels) in enumerate(dev_iter):
        valid_outputs = np.append(valid_outputs, labels.data.cpu().numpy())
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
    train(config, model, train_iter, dev_iter, fold)
    oof_p = predict(config, model, dev_iter, fold, activation='softmax')
    oof[valid_idx] = oof_p
    valid_preds.append(oof_p)
    f1, t = search_f1(valid_outputs, valid_preds[-1])
    print('validation score = ', f1)
    each_fold_predict = predict(config, model, test_iter, fold, activation='softmax')
def build_dcmn():
    config = DCMN_Config()
    output_eval_file = os.path.join(config.output_dir, config.output_file)
    if os.path.exists(output_eval_file) and config.output_file != 'output_test.txt':
        raise ValueError("Output file ({}) already exists and is not empty.".format(
            output_eval_file))
    with open(output_eval_file, "w") as writer:
        writer.write("***** Eval results Epoch %s *****\t\n" %
                     (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
        dic = str([(name, value) for name, value in vars(config).items()])
        writer.write("%s\t\n" % dic)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if not config.no_cuda:
        torch.cuda.manual_seed_all(config.seed)

    # The datasets were serialized once via build_dataset/build_dataset_eval
    # (see the commented-out calls in the original) and are re-loaded here.
    with open('./data/train_seq_dataset.pkl', 'rb') as f:
        train_seq_dataset = pickle.load(f)
    with open('./data/train_dcmn_dataset.pkl', 'rb') as f:
        train_dcmn_dataset = pickle.load(f)
    train_dataloader = build_iterator(train_seq_dataset, train_dcmn_dataset, config)
    with open('./data/eval_seq_dataset.pkl', 'rb') as f:
        eval_seq_dataset = pickle.load(f)
    with open('./data/eval_dcmn_dataset.pkl', 'rb') as f:
        eval_dcmn_dataset = pickle.load(f)
    eval_dataloader = build_iterator(eval_seq_dataset, eval_dcmn_dataset, config)

    num_train_steps = int(
        len(train_seq_dataset) / config.batch_size /
        config.gradient_accumulation_steps * config.num_train_epochs)
    config.t_total = num_train_steps

    # Count the DCMN optimizer steps over all epochs.
    dcmn_t_total = 0
    for step, (seq_batches, dcmn_batches) in enumerate(train_dataloader):
        if len(dcmn_batches) > 0:
            dcmn_t_total += len(dcmn_batches) // config.batch_size
            if len(dcmn_batches) % config.batch_size > 0:
                dcmn_t_total += 1
    dcmn_t_total *= config.num_train_epochs

    model = BertForMultipleChoiceWithMatch.from_pretrained(
        config.bert_model, num_choices=config.num_choices)
    model.to(config.dcmn_device)
    param_optimizer = list(model.named_parameters())
    # Hack to remove the pooler, which is not used and would otherwise
    # produce None grads that break apex.
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}]
    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=config.dcmn_learning_rate,
                      correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(config.dcmn_warmup_proportion * dcmn_t_total),
        num_training_steps=dcmn_t_total)  # PyTorch scheduler
    loss_fun = torch.nn.CrossEntropyLoss()
    return model, config, train_dataloader, eval_dataloader, optimizer, scheduler, loss_fun