from tqdm import trange

# build_dataset, TFIDFClustring, Vocab, SummaryTxt and Keyword come from the
# surrounding project.


def train(cfg):
    """Run the clustering training loop.

    :param cfg: config object; cfg.SOLVER.BATCH_SIZE sets the stride size
    :return: None
    """
    datasets = build_dataset(cfg)
    algo = TFIDFClustring(cfg)
    vocab = Vocab(cfg)
    summary = SummaryTxt(cfg)
    keyword = Keyword(cfg, summary)

    processed_news_num = 0
    batch_size = cfg.SOLVER.BATCH_SIZE
    print('start training:')
    # Walk the dataset files in strides of batch_size; each stride gathers
    # batch_size segments and hands them to one clustering step.
    for seg_id in trange(0, datasets.file_num, batch_size):
        seg = []
        for batch_idx in range(batch_size):
            batch, seg_size = datasets.getitem(seg_id + batch_idx)
            seg.extend(batch)
            processed_news_num += seg_size
        algo.run(segments=seg, vocab=vocab, seg_id=seg_id,
                 keyword=keyword, summary=summary)
        # keyword.update_per_seg(new_updated_topic=new_updated_topic)
        print("seg idx: {}. processed news: {}".format(seg_id, processed_news_num))
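# Hedged sketch (an assumption, not part of the original): the inner loop in
# train() can index past datasets.file_num on the final stride. A clamped
# stride helper, self-contained with plain integers:
def iter_strides(file_num, batch_size):
    # Yield (stride_start, index_range), trimming the last stride so no
    # index reaches file_num.
    for start in range(0, file_num, batch_size):
        yield start, range(start, min(start + batch_size, file_num))

# e.g. list(iter_strides(10, 4)) yields strides at 0, 4 and 8; the last one
# covers only indices 8..9.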
def load_corpus_data(data_path, language_name, start_token, end_token,
                     mask_token, vocab_path, rebuild_vocab, unk="UNK",
                     threshold=0):
    """Read a line-per-sentence corpus, wrap each line in start/end tokens,
    and map every token to its vocabulary index.

    When rebuild_vocab is True a fresh Vocab is built from the corpus and
    saved to vocab_path; otherwise an existing one is loaded from there.
    """
    if rebuild_vocab:
        v = Vocab(language_name, start_token, end_token, mask_token,
                  threshold=threshold)

    corpus = []
    with open(data_path) as f:
        data = f.read().strip().split("\n")
    for line in data:
        line = " ".join([start_token, line.strip(), end_token])
        if rebuild_vocab:
            v.add_sentence(line)
        corpus.append(line)

    if rebuild_vocab:
        v.add_unk(unk)
        v.save(vocab_path)
    else:
        v = Vocab.load(vocab_path)

    data2index = []
    for line in corpus:
        data2index.append([v.get_index(token) for token in line.split()])
    return data2index, v
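# Hedged usage sketch: assumes a plain-text corpus with one sentence per
# line; the paths and special-token strings below are illustrative
# placeholders, not values taken from the project.
data2index, vocab = load_corpus_data(
    data_path="data/train.txt", language_name="en",
    start_token="<s>", end_token="</s>", mask_token="<mask>",
    vocab_path="data/vocab.pkl", rebuild_vocab=True)
# data2index[0] is the first sentence as a list of indices, bracketed by the
# indices of <s> and </s>; threshold is presumably a minimum-count cutoff
# handled inside Vocab.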
import datetime
import os

import numpy as np
import torch

# args, Vocab and HistoryDataset are defined elsewhere in this script.

print(args)

# prepare datasets and obtain the arguments
t = datetime.datetime.now()
timestamp = str(t.date()) + '-' + str(t.hour) + '-' + str(
    t.minute) + '-' + str(t.second)

# seed everything and force deterministic cuDNN kernels for reproducibility
seed = args.seed
torch.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

print("Loading the vocab...")
vocab = Vocab(os.path.join(args.data_path, args.vocab_file), 3)

trainset = HistoryDataset(data_dir=args.data_path,
                          segment_file='train_' + args.segment_file,
                          vectors_file=args.vectors_file,
                          chain_file='train_' + args.chains_file,
                          split=args.split)

testset = HistoryDataset(data_dir=args.data_path,
                         segment_file='test_' + args.segment_file,
                         vectors_file=args.vectors_file,
                         chain_file='test_' + args.chains_file,
                         split='test')

# the tail of this call was cut off in the source; the remaining arguments
# are reconstructed by analogy with trainset/testset above
valset = HistoryDataset(data_dir=args.data_path,
                        segment_file='val_' + args.segment_file,
                        vectors_file=args.vectors_file,
                        chain_file='val_' + args.chains_file,
                        split='val')
import json
import os

import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm

# `args`, `models`, `Dataset` and `Vocab` are module-level names defined
# elsewhere in this project.


def test_item_file(end_test_file, embedding_file_path, vocab_file_path, use_gpu):
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)

    # read one JSON example per non-blank line
    with open(end_test_file) as f:
        examples = []
        for line in f:
            if line and not line.isspace():
                examples.append(json.loads(line))

    test_dataset = Dataset(examples)
    test_iter = DataLoader(dataset=test_dataset,
                           batch_size=args.batch_size,
                           shuffle=False)

    # restore the trained model, mapping tensors to CPU when no GPU is used
    load_dir = os.path.join(args.input, 'model_files', 'CNN_RNN.pt')
    if use_gpu:
        checkpoint = torch.load(load_dir)
    else:
        checkpoint = torch.load(load_dir,
                                map_location=lambda storage, loc: storage)
        checkpoint['args'].device = None
    net = getattr(models, checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()

    doc_num = len(test_dataset)
    all_targets, all_results, all_probs = [], [], []
    all_acc, all_p, all_r, all_f1 = [], [], [], []
    all_sum = []

    for batch in tqdm(test_iter):
        features, targets, summaries, doc_lens = vocab.make_features(batch)
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
        # split the flat probability vector back into per-document slices
        start = 0
        for doc_id, doc_len in enumerate(doc_lens):
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            stop = start + doc_len
            prob = probs[start:stop]
            # keep every sentence whose probability clears 0.5
            hyp = [_d for _p, _d in zip(prob, doc) if _p > 0.5]
            all_sum.append("###".join(hyp) if hyp else '')
            all_targets.append(targets[start:stop])
            all_probs.append(prob)
            start = stop

    # write the predicted summaries, one document per line
    file_name = 'TR-' + end_test_file.split('/')[-1]
    with open(os.path.join(args.output, file_name), mode='w',
              encoding='utf-8') as f:
        for text in all_sum:
            f.write(text.strip() + '\n')

    for item in all_probs:
        all_results.append([1 if tmp > 0.5 else 0 for tmp in item.tolist()])
    print(len(all_results))
    print(len(all_targets))
    print(len(all_probs))

    # score each document; sklearn metrics expect (y_true, y_pred), so the
    # targets must come first (the original call had the order reversed)
    for preds, target, prob in zip(all_results, all_targets, all_probs):
        target = target.tolist()
        all_acc.append(accuracy_score(target, preds))
        all_p.append(precision_score(target, preds))
        all_r.append(recall_score(target, preds))
        all_f1.append(f1_score(target, preds))

    print('all dataset acc : ', np.mean(all_acc))
    print('all dataset p : ', np.mean(all_p))
    print('all dataset r : ', np.mean(all_r))
    print('all dataset f1 : ', np.mean(all_f1))
    print('all results length : ', len(all_results))
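# Self-contained sketch of the scoring convention used above: probabilities
# are thresholded at 0.5 to get binary picks, and sklearn metrics take
# (y_true, y_pred). Toy numbers only, not project data.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

probs = [0.9, 0.2, 0.7, 0.4]
preds = [1 if p > 0.5 else 0 for p in probs]   # -> [1, 0, 1, 0]
truth = [1, 0, 0, 1]
print(accuracy_score(truth, preds))   # 0.5: two of four labels match
print(precision_score(truth, preds))  # 0.5: one of the two picks is correct
print(recall_score(truth, preds))     # 0.5: one of two true sentences found
print(f1_score(truth, preds))         # 0.5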
import json
from collections import defaultdict

from utils.Vocab import Vocab

with open('data/test_segments.json', 'r') as file:
    test_sg = json.load(file)

with open('data/test_chains.json', 'r') as file:
    test = json.load(file)

vocab = Vocab('data/vocab.csv', 3)

# given an img, provides the chains for which it was the target
target2chains = defaultdict(list)
for ch in test:
    target_id = ch['target']
    segment_list = ch['segments']
    target2chains[target_id].append(segment_list)

# segment ids, in the order in which they were encountered in the chains
# across the whole dataset
id_list = []
for c in test:
    segments = c['segments']
    for s in segments:
        if s not in id_list:
            id_list.append(s)  # assumed completion: the source cut off here
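# Hedged usage sketch: list every chain that targeted a given image. The
# lookup key is pulled from the mapping itself rather than hard-coding an id.
if target2chains:
    some_target = next(iter(target2chains))
    for chain_segments in target2chains[some_target]:
        print(some_target, chain_segments)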
import json
import os
import random
from time import time

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm
from torch.utils.data import DataLoader
from tqdm import tqdm

# `args`, `models`, `Dataset`, `Vocab` and `merge_same_suf_text_file` are
# defined elsewhere in this project.


def train():
    print("*" * 100)
    print("train begin")

    # use gpu
    use_gpu = args.device is not None
    if torch.cuda.is_available() and not use_gpu:
        print("WARNING: You have a CUDA device, should run with -device 0")
    if use_gpu:
        # set cuda device and seed
        torch.cuda.set_device(args.device)
        torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    # prepare paths
    embedding_file_path = os.path.join(args.project, "embedding.npz")
    vocab_file_path = os.path.join(args.project, "word2id.json")
    end_train_file = os.path.join(args.input, "train_files", "train.txt")
    train_files_dir = os.path.join(args.input, "train_files")

    # merge text files sharing the same suffix into one training file
    merge_same_suf_text_file(train_files_dir, end_train_file, '.txt')

    print('Loading vocab, train and val dataset. Wait a second, please')
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)

    with open(end_train_file) as f:
        examples = []
        for line in tqdm(f):
            if line and not line.isspace():
                examples.append(json.loads(line))
    train_dataset = Dataset(examples)
    print(train_dataset[:1])  # sanity-check one example

    # read the vocabulary size and dimension from the embedding matrix
    args.embed_num = embed.size(0)
    args.embed_dim = embed.size(1)
    # args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]

    net = getattr(models, args.model)(args, embed)
    if use_gpu:
        net.cuda()

    train_iter = DataLoader(dataset=train_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    criterion = nn.BCELoss()
    params = sum(p.numel() for p in list(net.parameters())) / 1e6
    print('#Params: %.1fM' % params)

    min_loss = float('inf')  # currently unused; net.save() runs every batch
    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    net.train()

    t1 = time()
    for epoch in range(1, args.max_epoch + 1):
        print("*" * 10, 'epoch ', str(epoch), '*' * 50)
        for i, batch in enumerate(train_iter):
            print("*" * 10, 'batch', i, '*' * 10)
            features, targets, _, doc_lens = vocab.make_features(batch, args.seq_trunc)
            features, targets = Variable(features), Variable(targets.float())
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features, doc_lens)
            loss = criterion(probs, targets)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(net.parameters(), args.max_norm)
            optimizer.step()
            net.save()
        print('Epoch: %2d Loss: %f' % (epoch, loss))
    t2 = time()
    print('Total Cost:%f h' % ((t2 - t1) / 3600))
    print("Model config files saved to the output folder")
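# Hedged sketch (not the project's code): min_loss above is initialised but
# never consulted, so net.save() checkpoints on every batch. A common
# alternative gates saving on improvement; shown self-contained with a stub.
def save_if_improved(loss_value, best_so_far, save_fn):
    # call save_fn and return the new best only when the loss improved
    if loss_value < best_so_far:
        save_fn()
        return loss_value
    return best_so_far

best = float('inf')
for epoch_loss in [0.9, 0.7, 0.8, 0.5]:
    best = save_if_improved(epoch_loss, best, lambda: print('checkpoint'))
# prints 'checkpoint' three times: at 0.9, 0.7 and 0.5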