def test_item_file(end_test_file, embedding_file_path, vocab_file_path, use_gpu):
    """Evaluate a saved CNN_RNN checkpoint on a JSON-lines test file.

    Args:
        end_test_file: path to a JSON-lines file, one example per line.
        embedding_file_path: .npz file whose 'arr_0' entry is the embedding matrix.
        vocab_file_path: JSON file mapping word -> id.
        use_gpu: run the model on CUDA when True.

    Side effects:
        Writes extracted summaries ("###"-joined selected sentences, one doc
        per line) to args.output/TR-<test file name> and prints aggregate
        accuracy / precision / recall / F1 over all documents.
    """
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)

    # Skip blank lines so a trailing newline does not crash json.loads.
    with open(end_test_file) as f:
        examples = [json.loads(line) for line in f if line and not line.isspace()]

    test_dataset = Dataset(examples)
    test_iter = DataLoader(dataset=test_dataset,
                           batch_size=args.batch_size,
                           shuffle=False)

    load_dir = os.path.join(args.input, 'model_files', 'CNN_RNN.pt')
    if use_gpu:
        checkpoint = torch.load(load_dir)
    else:
        # Remap CUDA-saved tensors onto the CPU when no GPU is requested,
        # and clear the stored device so the model builds for CPU.
        checkpoint = torch.load(load_dir, map_location=lambda storage, loc: storage)
        checkpoint['args'].device = None

    net = getattr(models, checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()

    all_targets = []
    all_results = []
    all_probs = []
    all_acc, all_p, all_r, all_f1 = [], [], [], []
    all_sum = []

    for batch in tqdm(test_iter):
        features, targets, summaries, doc_lens = vocab.make_features(batch)
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)

        # probs is flat over every sentence in the batch; slice it per document.
        start = 0
        for doc_id, doc_len in enumerate(doc_lens):
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            stop = start + doc_len
            prob = probs[start:stop]
            # A sentence joins the summary when its probability exceeds 0.5.
            hyp = [sent for p, sent in zip(prob, doc) if p > 0.5]
            all_sum.append("###".join(hyp) if hyp else '')
            all_targets.append(targets[start:stop])
            all_probs.append(prob)
            start = stop

    # Output file keeps the test file's name with a 'TR-' prefix.
    file_name = 'TR-' + os.path.basename(end_test_file)
    with open(os.path.join(args.output, file_name), mode='w', encoding='utf-8') as f:
        for text in all_sum:
            f.write(text.strip() + '\n')

    # Binarize probabilities with the same 0.5 threshold used above.
    for item in all_probs:
        all_results.append([1 if tmp > 0.5 else 0 for tmp in item.tolist()])

    print(len(all_results))
    print(len(all_targets))
    print(len(all_probs))

    for preds, target, prob in zip(all_results, all_targets, all_probs):
        target = target.tolist()
        # FIX: sklearn metrics take (y_true, y_pred). The original passed
        # (predictions, targets), which silently swapped precision and recall.
        all_acc.append(accuracy_score(target, preds))
        all_p.append(precision_score(target, preds))
        all_r.append(recall_score(target, preds))
        all_f1.append(f1_score(target, preds))

    print('all dataset acc : ', np.mean(all_acc))
    print('all dataset p : ', np.mean(all_p))
    print('all dataset r : ', np.mean(all_r))
    print('all dataset f1 : ', np.mean(all_f1))
    print('all results length : ', len(all_results))
def train():
    """Train the summarization model named by args.model.

    Loads embeddings/vocab from args.project, merges every .txt file under
    args.input/train_files into one JSON-lines training file, then runs
    args.max_epoch epochs of BCE training with gradient clipping, saving
    the model after every optimizer step via net.save().
    """
    print("*" * 100)
    print("train begin")

    # A CUDA device index in args.device enables GPU training.
    use_gpu = args.device is not None
    if torch.cuda.is_available() and not use_gpu:
        print("WARNING: You have a CUDA device, should run with -device 0")
    if use_gpu:
        # Bind the chosen CUDA device and seed its RNG.
        torch.cuda.set_device(args.device)
        torch.cuda.manual_seed(args.seed)
    # Seed every RNG so runs are reproducible.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    # Prepare file paths.
    embedding_file_path = os.path.join(args.project, "embedding.npz")
    vocab_file_path = os.path.join(args.project, "word2id.json")
    end_train_file = os.path.join(args.input, "train_files", "train.txt")
    train_files_dir = os.path.join(args.input, "train_files")

    # Merge all .txt files in the train dir into a single training file.
    merge_same_suf_text_file(train_files_dir, end_train_file, '.txt')

    print('Loading vocab,train and val dataset.Wait a second,please')
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)

    # Skip blank lines so a trailing newline does not crash json.loads.
    with open(end_train_file) as f:
        examples = [json.loads(line) for line in tqdm(f)
                    if line and not line.isspace()]
    train_dataset = Dataset(examples)

    # Embedding dimensions come from the loaded matrix itself.
    args.embed_num = embed.size(0)
    args.embed_dim = embed.size(1)

    net = getattr(models, args.model)(args, embed)
    if use_gpu:
        net.cuda()

    train_iter = DataLoader(dataset=train_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    criterion = nn.BCELoss()
    params = sum(p.numel() for p in net.parameters()) / 1e6
    print('#Params: %.1fM' % (params))

    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    net.train()
    t1 = time()
    for epoch in range(1, args.max_epoch + 1):
        print("*" * 10, 'epoch ', str(epoch), '*' * 50)
        for i, batch in enumerate(train_iter):
            print("*" * 10, 'batch', i, '*' * 10)
            features, targets, _, doc_lens = vocab.make_features(batch, args.seq_trunc)
            features, targets = Variable(features), Variable(targets.float())
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features, doc_lens)
            loss = criterion(probs, targets)
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients to avoid exploding-gradient instability.
            clip_grad_norm(net.parameters(), args.max_norm)
            optimizer.step()
            net.save()
        # Report once per epoch (the last batch's loss).
        print('Epoch: %2d Loss: %f' % (epoch, loss))
    t2 = time()
    print('Total Cost:%f h' % ((t2 - t1) / 3600))
    print("模型配置文件保存至输出文件夹")