def on_exception(self, exception):
    if isinstance(exception, KeyboardInterrupt):
        logger.error("[Error] Caught keyboard interrupt on worker. Stopping supervisor...")
        state = {
            'iter': self.step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': self.running_avg_loss
        }
        model_save_path = os.path.join(self.config.model_path, 'earlystop_step_%d.pkl' % self.step)
        # torch.save(state, model_save_path)
        # self.model.cpu()
        torch.save(self.model, model_save_path)
        # if self.config.use_gpu:
        #     self.model.cuda()
        logger.info('[INFO] Saving early stop model to %s', model_save_path)
        if self.quit_all is True:
            sys.exit(0)  # exit the program immediately
        else:
            pass
    else:
        raise exception  # re-raise any unrecognized exception
def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval):
    logger.info(' | end of valid {:3d} | time: {:5.2f}s | '.format(
        self.epoch, (time.time() - self.valid_start_time)))
    # early stop
    if not is_better_eval:
        if self.wait == self.patience:
            state = {
                'iter': self.step,
                'encoder_state_dict': self.model.encoder.state_dict(),
                'decoder_state_dict': self.model.decoder.state_dict(),
                'reduce_state_dict': self.model.reduce_state.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'current_loss': self.running_avg_loss
            }
            model_save_path = os.path.join(self.config.model_path, 'earlystop_step_%d.pkl' % self.step)
            # torch.save(state, model_save_path)
            # self.model.cpu()
            torch.save(self.model, model_save_path)
            # if self.config.use_gpu:
            #     self.model.cuda()
            logger.info('[INFO] Saving early stop model to %s', model_save_path)
            raise EarlyStopError("Early stopping raised.")
        else:
            self.wait += 1
    else:
        self.wait = 0
def set_up_data(mode, config):
    datainfo = prepare_dataInfo(mode=mode, test_data_path=config.decode_data_path,
                                train_data_path=config.train_data_path,
                                vocab_size=config.vocab_size, config=config)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo
def set_up_data(data_config):
    paths = {
        "train": os.path.join(data_config.train_path, args.train_file),
        "test": os.path.join(data_config.train_path, args.test_file)
    }
    datainfo, vocab = ScisummGraphLoader(setting=args.setting).process(
        paths, data_config, args.load_vocab)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo, vocab
def set_up_data():
    paths = {
        "train": os.path.join(config.train_path, "train.jsonl"),
        "dev": os.path.join(config.train_path, "val.jsonl")
    }
    datainfo, vocabs = ScisummGraphLoader(setting=args.setting).process(
        paths, config, args.load_vocab)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo, vocabs
def get_metric(self, reset=True):
    logger.info("[INFO] Hyps and Refer number is %d, %d",
                len(self.prediction), len(self.referece))
    if len(self.prediction) == 0 or len(self.referece) == 0:
        logger.error("During testing, no hyps or refers were selected!")
        return
    rouge = Rouge()
    scores_all = rouge.get_scores(self.prediction, self.referece, avg=True)
    if reset:
        self.prediction = []
        self.referece = []
    logger.info(scores_all)
    scores_all = remend_score(scores_all)
    return scores_all
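# `remend_score` is defined elsewhere in the project; below is a minimal sketch of what it is
# assumed to do, given that the Trainer later addresses the result with metric_key="rouge-l-f":
# flatten the nested dict returned by Rouge.get_scores(..., avg=True)
# (e.g. {'rouge-1': {'f': .., 'p': .., 'r': ..}, ...}) into scalar entries such as "rouge-1-f".
# The function name and exact key format here are assumptions, not the project's real helper.
def remend_score_sketch(scores_all):
    flat = {}
    for rouge_name, prf in scores_all.items():         # e.g. "rouge-l"
        for stat_name, value in prf.items():           # "f", "p", "r"
            flat[f"{rouge_name}-{stat_name}"] = value   # e.g. "rouge-l-f"
    return flat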
def run_train(config):
    train_dir, model_dir = initial_dir('train', config)
    config.train_path = train_dir
    config.model_path = model_dir
    print_config(config, train_dir)

    datainfo = set_up_data('train', config)
    train_sampler = BucketSampler(batch_size=config.batch_size, seq_len_field_name='enc_len')
    criterion = MyLoss(config=config, padding_idx=datainfo.vocabs["train"].to_index(PAD_TOKEN))
    model = Model(vocab=datainfo.vocabs["train"], config=config)
    params = list(model.encoder.parameters()) + list(model.decoder.parameters()) + \
        list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)

    train_loader = datainfo.datasets["train"]
    valid_loader = datainfo.datasets["dev"]
    summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    trainer = Trainer(model=model, train_data=train_loader, optimizer=optimizer, loss=criterion,
                      batch_size=config.batch_size, check_code_level=-1,
                      n_epochs=config.n_epochs, print_every=100, dev_data=valid_loader,
                      metrics=FastRougeMetric(pred='prediction', art_oovs='article_oovs',
                                              abstract_sentences='abstract_sentences',
                                              config=config, vocab=datainfo.vocabs["train"]),
                      metric_key="rouge-l-f", validate_every=-1, save_path=model_dir,
                      callbacks=[TrainCallback(config, summary_writer, patience=10)],
                      use_tqdm=False, device=config.visible_gpu)

    logger.info("-" * 5 + "start training" + "-" * 5)
    traininfo = trainer.train(load_best_model=True)

    logger.info(' | end of Train | time: {:5.2f}s | '.format(traininfo["seconds"]))
    logger.info('[INFO] best eval model in epoch %d and iter %d',
                traininfo["best_epoch"], traininfo["best_step"])
    logger.info(traininfo["best_eval"])

    bestmodel_save_path = os.path.join(config.model_path, 'bestmodel.pkl')  # this is where checkpoints of best models are saved
    state = {
        'encoder_state_dict': model.encoder.state_dict(),
        'decoder_state_dict': model.decoder.state_dict(),
        'reduce_state_dict': model.reduce_state.state_dict()
    }
    # the model is passed to the Trainer by reference, so the parameter updates made during
    # training (and the reload triggered by load_best_model=True) are visible on this `model`
    torch.save(state, bestmodel_save_path)
    logger.info('[INFO] Saving eval best model to %s', bestmodel_save_path)
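# A minimal sketch (not part of the original file) of how the bestmodel.pkl saved above could be
# restored for evaluation. It assumes `Model`, `datainfo` and `config` are built exactly as in
# run_train; the function name is hypothetical.
def load_best_model_sketch(config, datainfo):
    model = Model(vocab=datainfo.vocabs["train"], config=config)
    state = torch.load(os.path.join(config.model_path, 'bestmodel.pkl'), map_location='cpu')
    model.encoder.load_state_dict(state['encoder_state_dict'])
    model.decoder.load_state_dict(state['decoder_state_dict'])
    model.reduce_state.load_state_dict(state['reduce_state_dict'])
    model.eval()
    return model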
def run_train():
    datainfo, vocabs = set_up_data()
    train_sampler = RandomSampler()
    criterion = SummLoss(config=config, padding_idx=vocabs.to_index(PAD_TOKEN))

    model = CGSum(config, vocab=vocabs)
    model.to(device)

    initial_lr = config.lr
    logger.info(f"learning rate = {initial_lr}")
    optimizer = Adagrad(filter(lambda p: p.requires_grad, model.parameters()),
                        lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)

    train_loader = datainfo.datasets["train"]
    valid_loader = datainfo.datasets["dev"]

    callbacks = [
        TrainCallback(config, patience=10),
        FitlogCallback(),
        LRDecayCallback(optimizer.param_groups, steps=args.weight_decay_step)
    ]
    trainer = Trainer(model=model, train_data=train_loader, optimizer=optimizer, loss=criterion,
                      batch_size=config.batch_size, check_code_level=-1, sampler=train_sampler,
                      n_epochs=config.n_epochs, print_every=100, dev_data=valid_loader,
                      update_every=args.update_every,
                      metrics=FastRougeMetric(pred='prediction', art_oovs='article_oovs',
                                              abstract_sentences='abstract_sentences',
                                              config=config, vocab=datainfo.vocabs["vocab"]),
                      metric_key="rouge-l-f",
                      validate_every=args.validate_every * args.update_every, save_path=None,
                      callbacks=callbacks, use_tqdm=True)

    logger.info("-" * 5 + "start training" + "-" * 5)
    traininfo = trainer.train(load_best_model=True)

    logger.info(' | end of Train | time: {:5.2f}s | '.format(traininfo["seconds"]))
    logger.info('[INFO] best eval model in epoch %d and iter %d',
                traininfo["best_epoch"], traininfo["best_step"])
def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval):
    logger.info(' | end of valid {:3d} | time: {:5.2f}s | '.format(
        self.epoch, (time.time() - self.valid_start_time)))
    # save the better checkpoint
    if is_better_eval:
        logger.info("got better results on dev, save checkpoint.. ")
        model_save_path = os.path.join(
            self.config.model_path,
            f'CGSum_{self.config.setting}_{self.config.n_hop}hopNbrs.pt')
        checkpoint = {
            "state_dict": self.model.state_dict(),
            "config": self.model.config.__dict__
        }
        torch.save(checkpoint, model_save_path)

    # early stop
    if not is_better_eval:
        if self.wait == self.patience:
            raise EarlyStopError("Early stopping raised.")
        else:
            self.wait += 1
    else:
        self.wait = 0
def on_backward_begin(self, loss):
    self.loss_update_every.append(loss.item())
    if isinstance(loss, tuple) and not np.isfinite(loss[0].item()):
        logger.error("train Loss is not finite. Stopping.")
        logger.info(loss[0].item())
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.info(name)
                logger.info(param.grad.data.sum())
        raise Exception("train Loss is not finite. Stopping.")

    if self.step % self.update_every == 0:
        assert len(self.loss_update_every) == self.update_every
        loss_batch = sum(self.loss_update_every)
        self.loss_update_every = []
        # report the loss
        if self.step < 10 or self.step % 1000 == 0:
            logger.info("|epoch: %d step: %d log_loss: %.4f |" %
                        (self.epoch, self.step / self.update_every, loss_batch))
        self.running_avg_loss = calc_running_avg_loss(loss_batch, self.running_avg_loss,
                                                      self.step / self.update_every)
def get_metric(self, reset=True):
    logger.info("[INFO] Hyps and Refer number is %d, %d",
                len(self.prediction), len(self.referece))
    if len(self.prediction) == 0 or len(self.referece) == 0:
        logger.error("During testing, no hyps or refers were selected!")
        return
    if isinstance(self.referece[0], list):
        logger.info("Multi Reference summaries!")
        scores_all = pyrouge_score_all_multi(self.prediction, self.referece, self.config)
    else:
        scores_all = pyrouge_score_all(self.prediction, self.referece, self.config)
    if reset:
        self.prediction = []
        self.referece = []
    logger.info(scores_all)
    return scores_all
def on_backward_begin(self, loss):
    """
    :param loss: scalar loss tensor for the current training step
    :return:
    """
    print("|epoch: %d step: %d loss: %.4f|" % (self.epoch, self.step, loss.item()))
    if not np.isfinite(loss.item()):
        logger.error("train Loss is not finite. Stopping.")
        logger.info(loss.item())
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.info(name)
                logger.info(param.grad.data.sum())
        raise Exception("train Loss is not finite. Stopping.")
    self.running_avg_loss = calc_running_avg_loss(loss.item(), self.running_avg_loss,
                                                  self.summary_writer, self.step)
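# `calc_running_avg_loss` is defined elsewhere; the sketch below shows the pointer-generator-style
# implementation it is assumed to follow: an exponential moving average of the loss, optionally
# logged to the TensorBoard summary writer used above. The decay constant, the clipping value and
# the function name are assumptions, not the project's actual code.
def calc_running_avg_loss_sketch(loss, running_avg_loss, summary_writer=None, step=0, decay=0.99):
    if running_avg_loss == 0:   # first step: initialise with the raw loss
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip so the plot stays readable
    if summary_writer is not None:
        summary = tf.compat.v1.Summary()
        summary.value.add(tag='running_avg_loss/decay=%f' % decay, simple_value=running_avg_loss)
        summary_writer.add_summary(summary, step)
    return running_avg_loss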
def set_up_data(mode):
    datainfo = prepare_dataInfo(mode, config.train_data_path, config.eval_data_path,
                                config.decode_data_path, config.vocab_path,
                                config.vocab_size, config)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo
def on_epoch_end(self):
    logger.info(' | end of epoch {:3d} | time: {:5.2f}s | '.format(
        self.epoch, (time.time() - self.epoch_start_time)))
parser.add_argument('-lr_coverage', default=0.15, type=float)
parser.add_argument('-test_data_name', required=True, type=str)
parser.add_argument('-test_model', default='', type=str)

args = parser.parse_args()
args.train_data_path = os.path.join(args.dataset_path, args.train_data_path)
# args.eval_data_path = os.path.join(args.dataset_path, args.eval_data_path)
args.decode_data_path = os.path.join(args.dataset_path, args.decode_data_path)
# args.vocab_path = os.path.join(args.dataset_path, args.vocab_path)
args.log_root = os.path.join(args.root, args.log_root)

if args.visible_gpu != -1:
    args.use_gpu = True
    torch.cuda.set_device(args.visible_gpu)
    print("using gpu: ", args.visible_gpu)
else:
    args.use_gpu = False

logger.info("------start mode test-------")
if args.test_model == '':
    k_model_path_list = getting_k_model_path(args.model_file_path, args.top_k)
    for tmp_path in k_model_path_list:
        run_test(tmp_path, args)
else:
    run_test(args.test_model, args)
def prepare_dataInfo(mode, vocab_size, config, train_data_path=None, dev_data_path=None, test_data_path=None):
    def sent_to_words(sents):
        result = []
        for sent in sents:
            result.extend([word.strip() for word in sent.split(" ") if len(word.strip()) != 0])
        return result

    # dataloader = Cnn_dailymailLodaer()
    # for JSON input files: every record must contain the fields `text` and `summary`, both already tokenized
    dataloader = JsonLoader(fields={"text": "words", "summary": "abstract_sentences"})
    if mode == 'train':
        if train_data_path is None or dev_data_path is None:
            print("training with no train data path or dev data path!")
        paths = {"train": train_data_path, "dev": dev_data_path}
    else:
        if test_data_path is None:
            print("testing with no test data path!")
        paths = {"train": train_data_path, "test": test_data_path}

    # dataInfo = dataloader.process(paths, vocab_path, vocab_size)
    print("=" * 10)
    print(paths)
    dataInfo = dataloader.load(paths)
    for key, _dataset in dataInfo.datasets.items():
        _dataset.apply(lambda ins: " ".join(ins['words']), new_field_name='article')
        _dataset.apply(lambda ins: sent_to_words(ins['words']), new_field_name='words')
        _dataset.apply(lambda ins: sent_tokenize(" ".join(ins['abstract_sentences'])),
                       new_field_name='abstract_sentences')

    vocab = Vocabulary(max_size=vocab_size - 2, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
    vocab.from_dataset(dataInfo.datasets['train'], field_name='words')
    vocab.add(START_DECODING)
    vocab.add(STOP_DECODING)
    print(vocab.to_word(0))
    print(len(vocab))
    assert vocab_size == len(vocab), "vocab_size error!!!"
    dataInfo.set_vocab(vocab, "train")

    for key, dataset in dataInfo.datasets.items():
        data_dict = {
            "enc_len": [],
            "enc_input": [],
            "dec_input": [],
            "target": [],
            "dec_len": [],
            "article_oovs": [],
            "enc_input_extend_vocab": []
        }
        for instance in dataset:
            article = instance["article"]
            abstract_sentences = instance["abstract_sentences"]
            enc_len, enc_input, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                getting_full_info(article, abstract_sentences, dataInfo.vocabs['train'], config)
            data_dict["enc_len"].append(enc_len)
            data_dict["enc_input"].append(enc_input)
            data_dict["dec_input"].append(dec_input)
            data_dict["target"].append(target)
            data_dict["dec_len"].append(dec_len)
            data_dict["article_oovs"].append(article_oovs)
            data_dict["enc_input_extend_vocab"].append(enc_input_extend_vocab)

        logger.info("-----prepare_dataInfo for dataset " + key + "-----")
        logger.info(str(len(data_dict["enc_len"])) + " " + str(len(data_dict["enc_input"])) + " " +
                    str(len(data_dict["dec_input"])) + " " + str(len(data_dict["target"])) + " " +
                    str(len(data_dict["dec_len"])) + " " + str(len(data_dict["article_oovs"])) + " " +
                    str(len(data_dict["enc_input_extend_vocab"])))

        dataset.add_field("enc_len", data_dict["enc_len"])
        dataset.add_field("enc_input", data_dict["enc_input"])
        dataset.add_field("dec_input", data_dict["dec_input"])
        dataset.add_field("target", data_dict["target"])
        dataset.add_field("dec_len", data_dict["dec_len"])
        dataset.add_field("article_oovs", data_dict["article_oovs"])
        dataset.add_field("enc_input_extend_vocab", data_dict["enc_input_extend_vocab"])

        dataset.set_input("enc_len", "enc_input", "dec_input", "dec_len", "article_oovs",
                          "enc_input_extend_vocab")
        dataset.set_target("target", "article_oovs", "abstract_sentences")

    '''
    for name, dataset in dataInfo.datasets.items():
        for field_name in dataset.get_field_names():
            dataset.apply_field(convert_list_to_ndarray, field_name=field_name,
                                new_field_name=field_name)
    '''
    return dataInfo
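# `getting_full_info` is defined elsewhere; the sketch below illustrates the standard
# pointer-generator convention its `article_oovs` / `enc_input_extend_vocab` outputs are assumed
# to follow: in-vocabulary words map to their vocab id, while each out-of-vocabulary word gets a
# temporary id of len(vocab) + its position in the per-article OOV list. Names are hypothetical.
def article_to_extended_ids_sketch(article_words, vocab):
    ids, oovs = [], []
    unk_id = vocab.to_index(UNKNOWN_TOKEN)
    for w in article_words:
        i = vocab.to_index(w)
        if i == unk_id:                                  # OOV word
            if w not in oovs:
                oovs.append(w)
            ids.append(len(vocab) + oovs.index(w))       # temporary extended-vocab id
        else:
            ids.append(i)
    return ids, oovs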
config.max_graph_enc_steps = args.max_graph_enc_steps
config.min_dec_steps = args.min_dec_steps

# mode
config.mode = args.mode
config.setting = args.setting

# save model
if not os.path.exists(config.model_path):
    if config.model_path.__contains__("/"):
        os.makedirs(config.model_path, 0o777)
    else:
        os.mkdir(config.model_path)

# fitlog dir
logger.info(f"set fitlog dir to {args.fitlog_dir}")
if not os.path.exists(args.fitlog_dir):
    os.mkdir(args.fitlog_dir)
fitlog.set_log_dir(args.fitlog_dir)
fitlog.add_hyper(args)

if not os.path.exists(config.model_path):
    os.mkdir(config.model_path)

if args.visible_gpu != -1:
    config.use_gpu = True
    torch.cuda.set_device(args.visible_gpu)
    device = torch.device(args.visible_gpu)
else:
    config.use_gpu = False
    device = torch.device("cpu")  # fall back to CPU so `device` is always defined
parser.add_argument('-max_grad_norm', default=2.0, type=float)
parser.add_argument('-is_pointer_gen', dest='pointer_gen', nargs='?', const=True, default=False, type=bool)
parser.add_argument('-is_coverage', nargs='?', const=True, default=False, type=bool)
parser.add_argument('-cov_loss_wt', default=1.0, type=float)
parser.add_argument('-eps', default=1e-12, type=float)
# parser.add_argument('-max_iterations', default=500000, required=True, type=int)
parser.add_argument("-n_epochs", default=33, type=int, required=True)
parser.add_argument('-lr_coverage', default=0.15, type=float)

args = parser.parse_args()
args.train_data_path = os.path.join(args.dataset_path, args.train_data_path)
args.eval_data_path = os.path.join(args.dataset_path, args.eval_data_path)
# args.decode_data_path = os.path.join(args.dataset_path, args.decode_data_path)
# args.vocab_path = os.path.join(args.dataset_path, args.vocab_path)
args.log_root = os.path.join(args.root, args.log_root)

if args.visible_gpu != -1:
    args.use_gpu = True
    torch.cuda.set_device(args.visible_gpu)
    print("using gpu: ", args.visible_gpu)
else:
    args.use_gpu = False

logger.info("------start mode train------")
run_train(args)
def process(self, paths, config, load_vocab_file=True):
    """
    :param paths: dict, path for each dataset
    :param config: Config object holding the preprocessing hyper-parameters
    :param load_vocab_file: bool, build vocab (False) or load vocab (True)
    :return: DataBundle
        datasets: dict, keys correspond to the paths dict
        vocabs: dict, key: vocab (if "train" in paths), domain (if domain=True), tag (if tag=True)
        embeddings: optional
    """
    vocab_size = config.vocab_size

    def _merge_abstracts(abstracts):
        merged = []
        for abstract in abstracts:
            merged.extend(abstract[:self.max_concat_len] + [SEP])
        if len(abstracts) == 0:
            assert merged == []
        return merged[:-1]

    def _pad_graph_inputs(graph_inputs):
        pad_text_wd = []
        max_len = config.max_graph_enc_steps
        for graph_input in graph_inputs:
            if len(graph_input) < max_len:
                pad_num = max_len - len(graph_input)
                graph_input.extend([PAD_TOKEN] * pad_num)
            else:
                graph_input = graph_input[:max_len]
            pad_text_wd.append(graph_input)
        if len(pad_text_wd) == 0:
            pad_text_wd.append([PAD_TOKEN] * max_len)
        return pad_text_wd

    def _get_nbr_input_len(input_wd):
        enc_len = [min(len(text), config.max_graph_enc_steps) for text in input_wd]
        if len(enc_len) == 0:
            enc_len = [0]
        return enc_len

    def _pad_article(text_wd):
        token_num = len(text_wd)
        max_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_len += self.max_concat_len * self.max_concat_num
        if token_num < max_len:
            padding = [PAD_TOKEN] * (max_len - token_num)
            article = text_wd + padding
        else:
            article = text_wd[:max_len]
        return article

    def _split_list(input_list):
        return [text.split() for text in input_list]

    def sent_tokenize(abstract):
        abs_list = abstract.split(".")
        return [(abst + ".") for abst in abs_list[:-1]]

    def _article_token_mask(text_wd):
        max_enc_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_enc_len += self.max_concat_len * self.max_concat_num
        token_num = len(text_wd)
        if token_num < max_enc_len:
            mask = [1] * token_num + [0] * (max_enc_len - token_num)
        else:
            mask = [1] * max_enc_len
        return mask

    def generate_article_input(text, abstracts):
        if config.neighbor_process == "sep":
            text_wd = text.split()[:config.max_enc_steps]
            text_wd.append(SEP)
            abstracts_wd = _merge_abstracts(abstracts)
            return text_wd + abstracts_wd
        else:
            return text.split()

    def generate_graph_inputs(graph_struct):
        graph_inputs_ = [graph_strut_dict[pid][config.graph_input_type] for pid in graph_struct]
        return _split_list(graph_inputs_[1:])

    def generate_graph_structs(paper_id):
        sub_graph_dict = {}
        sub_graph_set = []

        n_hop = config.n_hop
        max_neighbor_num = config.max_neighbor_num
        k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
        for sub_g in k_nbrs:
            sub_graph_set += sub_g

        for node in sub_graph_set:
            sub_graph_dict[node] = []

        for sub_g in k_nbrs:
            for centre_node in sub_g:
                nbrs = graph_strut_dict[centre_node]['references']
                c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                sub_graph_dict[centre_node].extend(c_nbrs)
                for c_nbr in c_nbrs:
                    sub_graph_dict[c_nbr].append(centre_node)
        # in python 3.6, the first in subgraph dict is source paper
        return sub_graph_dict

    def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
        sub_graph = [[] for _ in range(n_hop + 1)]
        level = 0
        visited = set()
        q = deque()
        q.append([paper_id, level])
        curr_node_num = 0
        while len(q) != 0:
            paper_first = q.popleft()
            paper_id_first, level_first = paper_first
            if level_first > n_hop:
                return sub_graph
            sub_graph[level_first].append(paper_id_first)
            curr_node_num += 1
            if curr_node_num > max_neighbor:
                return sub_graph
            visited.add(paper_id_first)
            for pid in graph_strut_dict[paper_id_first]["references"]:
                if pid not in visited and pid in graph_strut_dict:
                    q.append([pid, level_first + 1])
                    visited.add(pid)
        return sub_graph

    def generate_dgl_graph(paper_id, graph_struct, nodes_num):
        g = dgl.DGLGraph()
        assert len(graph_struct) == nodes_num

        g.add_nodes(len(graph_struct))
        pid2idx = {}
        for index, key_node in enumerate(graph_struct):
            pid2idx[key_node] = index
        assert pid2idx[paper_id] == 0

        for index, key_node in enumerate(graph_struct):
            neighbor = [pid2idx[node] for node in graph_struct[key_node]]
            # add self loop
            neighbor.append(index)
            key_nodes = [index] * len(neighbor)
            g.add_edges(key_nodes, neighbor)
        return g

    train_ds = None
    dataInfo = self.load(paths)
    # pop nodes in train graph in inductive setting
    if config.mode == "test" and self.setting == "inductive":
        dataInfo.datasets.pop("train")

    graph_strut_dict = {}
    for key, ds in dataInfo.datasets.items():
        for ins in ds:
            graph_strut_dict[ins["paper_id"]] = ins

    logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

    for key, ds in dataInfo.datasets.items():
        # process summary
        ds.apply(lambda x: x['abstract'].split(), new_field_name='summary_wd')
        ds.apply(lambda x: sent_tokenize(x['abstract']), new_field_name='abstract_sentences')
        # generate graph
        ds.apply(lambda x: generate_graph_structs(x["paper_id"]), new_field_name="graph_struct")
        ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]), new_field_name='graph_inputs_wd')
        ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1, new_field_name="nodes_num")
        # pad input
        ds.apply(lambda x: generate_article_input(x['introduction'], x["graph_inputs_wd"]),
                 new_field_name='input_wd')
        ds.apply(lambda x: _article_token_mask(x["input_wd"]), new_field_name="enc_len_mask")
        ds.apply(lambda x: sum(x["enc_len_mask"]), new_field_name="enc_len")
        ds.apply(lambda x: _pad_article(x["input_wd"]), new_field_name="pad_input_wd")
        ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]), new_field_name="nbr_inputs_len")
        ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]), new_field_name="pad_graph_inputs_wd")
        if key == "train":
            train_ds = ds

    vocab_dict = {}
    if not load_vocab_file:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")

        vocabs = Vocabulary(max_size=config.vocab_size - 2, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.from_dataset(train_ds, field_name=["input_wd", "summary_wd"])
        vocabs.add_word(START_DECODING)
        vocabs.add_word(STOP_DECODING)
        vocab_dict["vocab"] = vocabs
        # save vocab
        with open(os.path.join(config.train_path, "vocab"), "w", encoding="utf8") as f:
            for w, idx in vocabs:
                f.write(str(w) + "\t" + str(idx) + "\n")
        logger.info("build new vocab ends.. please reRun the code with load_vocab = True")
        exit(0)
    else:
        logger.info("[INFO] Load existing vocab from %s!" % config.vocab_path)
        word_list = []
        cnt = 3  # pad and unk
        if config.neighbor_process == "sep":
            cnt += 1
        with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break

        vocabs = Vocabulary(max_size=vocab_size, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.add_word_lst(word_list)
        vocabs.add(START_DECODING)
        vocabs.add(STOP_DECODING)
        if config.neighbor_process == "sep":
            vocabs.add(SEP)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs
        logger.info(f"vocab size = {len(vocabs)}")
        assert len(vocabs) == config.vocab_size

    dataInfo.set_vocab(vocabs, "vocab")

    for key, dataset in dataInfo.datasets.items():
        # do not process the training set in test mode
        if config.mode == "test" and key == "train":
            continue
        data_dict = {
            "enc_input": [],
            "nbr_inputs": [],
            "graph": [],
            "dec_input": [],
            "target": [],
            "dec_len": [],
            "article_oovs": [],
            "enc_input_extend_vocab": [],
        }
        logger.info(f"start construct the input of the model for {key} set, please wait...")
        for instance in dataset:
            graph_inputs = instance["pad_graph_inputs_wd"]
            abstract_sentences = instance["summary_wd"]
            enc_input = instance["pad_input_wd"]
            enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                getting_full_info(enc_input, graph_inputs, abstract_sentences,
                                  dataInfo.vocabs['vocab'], config)
            graph = generate_dgl_graph(instance["paper_id"], instance["graph_struct"],
                                       instance["nodes_num"])
            data_dict["graph"].append(graph)
            data_dict["enc_input"].append(enc_input)
            data_dict["nbr_inputs"].append(nbr_inputs)
            data_dict["dec_input"].append(dec_input)
            data_dict["target"].append(target)
            data_dict["dec_len"].append(dec_len)
            data_dict["article_oovs"].append(article_oovs)
            data_dict["enc_input_extend_vocab"].append(enc_input_extend_vocab)

        dataset.add_field("enc_input", data_dict["enc_input"])
        dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
        dataset.add_field("dec_input", data_dict["dec_input"])
        dataset.add_field("target", data_dict["target"])
        dataset.add_field("dec_len", data_dict["dec_len"])
        dataset.add_field("article_oovs", data_dict["article_oovs"])
        dataset.add_field("enc_input_extend_vocab", data_dict["enc_input_extend_vocab"])

        dataset.add_field("graph", data_dict["graph"])
        dataset.set_ignore_type('graph')  # without this line, there may be some errors
        dataset.set_input("graph")

        dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len", "enc_input", "enc_len_mask",
                          "dec_input", "dec_len", "article_oovs", "nodes_num",
                          "enc_input_extend_vocab")
        dataset.set_target("target", "article_oovs", "abstract_sentences")

        dataset.delete_field('graph_inputs_wd')
        dataset.delete_field('pad_graph_inputs_wd')
        dataset.delete_field('input_wd')
        dataset.delete_field('pad_input_wd')

    logger.info("------load dataset over---------")
    return dataInfo, vocabs
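# A toy illustration (not part of the original code) of the DGLGraph built by generate_dgl_graph
# above: node 0 is the source paper, every node gets a self loop, and edges are added from each
# node to its neighbours. It assumes the older mutable dgl.DGLGraph interface used in this repo;
# the variable names are hypothetical.
def _toy_dgl_graph_example():
    toy_struct = {"src": ["nbr1", "nbr2"], "nbr1": ["src"], "nbr2": ["src"]}
    g = dgl.DGLGraph()
    g.add_nodes(len(toy_struct))
    pid2idx = {pid: i for i, pid in enumerate(toy_struct)}      # {"src": 0, "nbr1": 1, "nbr2": 2}
    for i, pid in enumerate(toy_struct):
        nbrs = [pid2idx[n] for n in toy_struct[pid]] + [i]      # neighbours plus self loop
        g.add_edges([i] * len(nbrs), nbrs)
    return g                                                    # 3 nodes, 7 edges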
help="path to generated abstracts") parser.add_argument("--setting", default="inductive", choices=["transductive", "inductive"]) args = parser.parse_args() config = Config() # load checkpoint if args.model_name is None: cpts = glob.glob(os.path.join(args.model_dir, "CGSum*")) cpts.sort(key=os.path.getmtime) # choice the last checkpoint by default cpt_file = cpts[-1] else: cpt_file = os.path.join(args.model_dir, args.model_name) logger.info(f"loading checkpoint from: {cpt_file}") checkpoint = torch.load(cpt_file) # load the config file config.__dict__ = checkpoint["config"] config.min_dec_steps = args.min_dec_steps config.max_dec_steps = args.max_dec_steps config.max_graph_enc_steps = args.max_graph_enc_steps # write args to config # paths config config.train_path = args.dataset_dir if args.dataset_dir is not None else os.path.join( "SSN", args.setting) config.vocab_path = os.path.join(config.train_path, args.vocab_file) config.model_path = args.model_dir config.decode_path = args.decode_dir