def __init__(self, config, emb_dicts, examples_file):
    self.examples = load(examples_file)
    # number of augmentation infos per example, plus a cumulative index
    # so a flat index can address any (example, info) pair
    self.num_auginfos_per_example = [
        len(e["selected_info_processed"]) for e in self.examples
    ]
    self.num_auginfos = np.cumsum(self.num_auginfos_per_example)
    self.num = self.num_auginfos[-1]
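# A hedged sketch of the lookup that the cumulative index above enables:
# mapping a flat augmentation index back to (example_idx, local_idx).
# The helper name locate_auginfo is hypothetical, not part of the class.
import numpy as np

def locate_auginfo(num_auginfos_cumsum, flat_idx):
    """Map a flat index over all augmentation infos to (example, local)."""
    example_idx = int(np.searchsorted(num_auginfos_cumsum, flat_idx,
                                      side="right"))
    prev = num_auginfos_cumsum[example_idx - 1] if example_idx > 0 else 0
    return example_idx, flat_idx - prev

# e.g. counts [3, 2, 4] -> cumsum [3, 5, 9]; flat index 3 is the first
# info of example 1: locate_auginfo(np.array([3, 5, 9]), 3) == (1, 0)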
def eval(self, dataloader, eval_file, output_file):
    eval_dict = load(eval_file)
    result = self._valid(eval_dict, dataloader)
    print("eval: " + self._result2string(result, self.result_keys))
    if output_file is not None:
        with open(output_file, 'w', encoding='utf8') as outfile:
            json.dump(result, outfile)
    return result
def main(args):
    # prepro files
    CURRENT_PATH = os.getcwd().split("/")
    DATA_PATH = "/".join(CURRENT_PATH[:-4]) + "/Datasets/"
    DATA_ACS_INFO_FILE_PATH = DATA_PATH + \
        "processed/SQuAD1.1-Zhou/squad_ans_clue_style_info.pkl"
    SAMPLE_PROBS_FILE_PATH = DATA_PATH + \
        "processed/SQuAD1.1-Zhou/squad_sample_probs.pkl"
    SQUAD_FILE = DATA_PATH + "original/SQuAD1.1-Zhou/train.txt"

    # NOTICE: remember to clear these files when needed;
    # otherwise they will not be recalculated.
    if not os.path.isfile(SAMPLE_PROBS_FILE_PATH) or \
            args.not_processed_sample_probs_file:
        print(SAMPLE_PROBS_FILE_PATH +
              " does not exist.\nGenerating these files now.\n")
        # if it does not exist, generate the mapping dict and save it to file
        get_sample_probs(
            filename=SQUAD_FILE,
            filetype="squad",
            save_dataset_info_file=DATA_ACS_INFO_FILE_PATH,
            save_sample_probs_file=SAMPLE_PROBS_FILE_PATH,
            sent_limit=100, ques_limit=50, answer_limit=30,
            is_clue_topN=20,
            debug=args.debug, debug_length=20,
            answer_length_bin_width=3,
            answer_length_min_val=0, answer_length_max_val=30,
            clue_dep_dist_bin_width=2,
            clue_dep_dist_min_val=0, clue_dep_dist_max_val=20)

    SAMPLE_PROBS = load(SAMPLE_PROBS_FILE_PATH)
    print(SAMPLE_PROBS_FILE_PATH + " loaded.\n")

    # execute tasks
    if args.debug:
        args.da_start_index = 0
        args.da_end_index = 10
    if args.da_task == "file2sentences":
        file2sentences(args.da_input_file, args.da_input_type,
                       args.da_sentences_file, args.da_paragraphs_file,
                       max_plength=args.para_limit,
                       max_length=args.sent_limit)
    if args.da_task == "sentences2augmented_sentences":
        sentences2augmented_sentences(
            args.da_sentences_file, args.da_augmented_sentences_file,
            args.da_start_index, args.da_end_index, SAMPLE_PROBS,
            args.num_sample_answer, args.num_sample_clue,
            args.num_sample_style, args.max_sample_times)
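# Hedged usage sketch. The flag names come from the args referenced in
# main(); the file paths are illustrative, and the script name DA_main.py
# is taken from get_augmented_sents_examples' docstring below:
#
#   python DA_main.py --da_task file2sentences \
#       --da_input_file train.txt --da_input_type squad \
#       --da_sentences_file sentences.pkl --da_paragraphs_file paras.pkl
#
#   python DA_main.py --da_task sentences2augmented_sentences \
#       --da_sentences_file sentences.pkl \
#       --da_augmented_sentences_file augmented.pkl \
#       --da_start_index 0 --da_end_index 1000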
def prepro(config, augmented_sentences_pkl_file,
           processed_augmented_sentences_pkl_file):
    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get spacy-processed train examples and counters
    examples = load(augmented_sentences_pkl_file)
    examples = get_spacy_processed_examples(
        config, examples, debug, debug_length, shuffle=False)

    # get emb_mats and emb_dicts
    emb_dicts = load(config.emb_dicts_file)

    # get featured examples
    examples = get_featured_examples(config, examples, emb_dicts)
    save(processed_augmented_sentences_pkl_file, examples,
         message="processed_augmented_sentences_pkl_file")
def get_augmented_sents_examples(augmented_sentences_pkl_file, debug=False,
                                 debug_length=20, sent_limit=100,
                                 ans_limit=30):
    """Load the augmented sentence data generated by DA_main.py."""
    examples = load(augmented_sentences_pkl_file)
    result = []
    para_id = 0
    for example in tqdm(examples):
        ans_sent = example["context"]
        for info in example["selected_infos"]:
            answer_text = info["answer"]["answer_text"]
            answer_start = info["answer"]["char_start"]
            # filter out over-long sentences and answers
            answer_bio_ids = info["answer"]["answer_bio_ids"]
            answer_length = (answer_bio_ids.count("B") +
                             answer_bio_ids.count("I"))
            if (len(example["ans_sent_doc"]) > sent_limit or
                    answer_length > ans_limit):
                continue
            for clue in info["clues"]:
                clue_text = clue["clue_text"]
                clue_start = ans_sent.find(clue_text)
                if clue_start < 0:  # find() returns -1 when not found
                    continue
                for style_text in info["styles"]:
                    output_e = {
                        # our "paragraph" is actually a single sentence
                        "paragraph": ans_sent,
                        "question": "",
                        "ques_type": style_text,  # string type
                        "answer": answer_text,
                        "answer_start": answer_start,
                        "clue": clue_text,
                        "clue_start": clue_start,
                        "para_id": example["sid"]
                    }
                    result.append(output_e)
                    para_id += 1
                    if debug and para_id >= debug_length:
                        break
    return result
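# For reference, the shape of one produced example (values illustrative,
# not from the source data): every (answer, clue, style) combination of a
# sentence yields one such dict.
#
# {
#     "paragraph": "Tesla was born in 1856.",
#     "question": "",            # filled in later by the QG model
#     "ques_type": "When",       # sampled question style
#     "answer": "1856",
#     "answer_start": 18,
#     "clue": "born",
#     "clue_start": 10,
#     "para_id": 0,
# }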
def __init__(self, config, emb_dicts, examples_file):
    self.examples = load(examples_file)
    self.num = len(self.examples)
    # print(self.examples[0])

    # refine examples according to config here
    start = datetime.now()

    # change 1: get is_clue by clue top-N
    for example in self.examples:
        # revise ans_sent_is_clue using the clue augmenter
        # TODO: move this into prepro if the strategy performs well;
        # that would also save time.
        clue_info = FQG_data_augmentor.get_clue_info(
            example["question"], example["ans_sent"], example["answer"],
            None, chunklist=None, y1_in_sent=example["y1_in_sent"],
            doc=example["ans_sent_doc"], ques_doc=example["ques_doc"],
            sent_limit=config.sent_limit)
        example["ans_sent_is_clue"] = clue_info[
            "selected_clue_binary_ids_padded"]
        example["ans_sent_is_clue_ids"] = feature2ids(
            example["ans_sent_is_clue"], emb_dicts["is_clue"],
            len(example["ans_sent_doc"]), config.sent_limit)

        # keep only soft-copy positions that are both non-zero and
        # within top-N (float-mask AND: both conditions sum to 2.0)
        example["switch_soft"] = (
            ((example["switch_soft"] > 0).astype(float) +
             (example["switch_soft"] <= config.soft_copy_topN + 1)
             .astype(float)) == 2.0).astype(float)
        example["copy_position_soft"] = (
            example["copy_position_soft"] * example["switch_soft"])
        # if config.debug:
        #     print("clue_info: ", clue_info)
        #     print("sentence: ", example["ans_sent"])
        #     print("question: ", example["question"])
        #     print("answer: ", example["answer"])

    # change 2: refine vocabulary
    if (config.use_refine_copy_tgt or config.use_refine_copy_src or
            config.use_refine_copy_tgt_src):
        assert config.refined_src_vocab_limit <= config.tgt_vocab_limit
        assert (config.refined_tgt_vocab_limit <=
                config.refined_src_vocab_limit)
        assert (config.refined_copy_vocab_limit <=
                config.refined_tgt_vocab_limit)
        OOV_id = emb_dicts["word"]["<oov>"]

        for i in range(self.num):
            # refine switch and copy_position
            example = self.examples[i]
            switch = np.zeros(config.ques_limit, dtype=np.int32)
            copy_position = np.zeros(config.ques_limit, dtype=np.int32)
            tgt = np.zeros(config.ques_limit, dtype=np.int32)

            # iterate over question tokens
            for idx, tgt_word in enumerate(example["ques_tokens"]):
                # get the question token's word index
                word_idx = None
                for each in (tgt_word, tgt_word.lower(),
                             tgt_word.capitalize(), tgt_word.upper()):
                    if each in emb_dicts["word"]:
                        word_idx = emb_dicts["word"][each]
                        break

                # get refined copy: OOV or low-frequency words become
                # copy targets
                compare_idx = word_idx
                if (compare_idx is None or
                        compare_idx >= config.refined_copy_vocab_limit or
                        compare_idx == OOV_id):
                    if tgt_word.lower() in example["src_tokens"]:
                        switch[idx] = 1
                        # NOTICE: could be revised, as tgt_word can
                        # appear multiple times in the source
                        copy_position[idx] = \
                            example["src_tokens"].index(tgt_word.lower())

                # get refined tgt
                if (config.use_refine_copy_tgt or
                        config.use_refine_copy_tgt_src):
                    if (compare_idx is None or
                            compare_idx >= config.refined_tgt_vocab_limit or
                            compare_idx == OOV_id):
                        tgt[idx] = OOV_id
                    else:
                        tgt[idx] = word_idx

            # assign new values
            self.examples[i]["switch_oov"] = switch
            self.examples[i]["copy_position_oov"] = copy_position

            # refine tgt ids
            if config.use_refine_copy_tgt or config.use_refine_copy_tgt_src:
                self.examples[i]["tgt"] = tgt

            # refine src ids
            if config.use_refine_copy_src or config.use_refine_copy_tgt_src:
                c_mask = (example['ans_sent_word_ids'] >=
                          config.refined_src_vocab_limit)
                self.examples[i]['ans_sent_word_ids'] = \
                    c_mask * OOV_id + \
                    (1 - c_mask) * example['ans_sent_word_ids']
                q_mask = (example['ques_word_ids'] >=
                          config.refined_src_vocab_limit)
                self.examples[i]['ques_word_ids'] = \
                    q_mask * OOV_id + \
                    (1 - q_mask) * example['ques_word_ids']

    print("num_total_examples: ", len(self.examples))
    print("Time of refine data: {}".format(datetime.now() - start))
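# The switch_soft update above implements a boolean AND by summing two
# float masks and testing for 2.0. A minimal standalone sketch of the
# equivalent, more explicit formulation (values illustrative):
import numpy as np

switch_soft = np.array([0., 1., 3., 7.])
soft_copy_topN = 5
mask = np.logical_and(switch_soft > 0,
                      switch_soft <= soft_copy_topN + 1).astype(float)
# mask == [0., 1., 1., 0.]: only non-zero entries within top-N survive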
def prepro(config):
    emb_tags = config.emb_config.keys()
    emb_config = config.emb_config
    emb_mats = {}
    emb_dicts = {}
    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get spacy-processed train examples and counters
    if not config.processed_by_spacy and not config.processed_example_features:
        train_examples = get_raw_examples(config.train_file,
                                          config.data_type, debug,
                                          debug_length)
        train_examples, train_meta, train_eval = get_spacy_processed_examples(
            config, train_examples, debug, debug_length, shuffle=False)

        dev_examples = get_raw_examples(config.dev_file, config.data_type,
                                        debug, debug_length)
        dev_examples, dev_meta, dev_eval = get_spacy_processed_examples(
            config, dev_examples, debug, debug_length, shuffle=False)

        test_examples = get_raw_examples(config.test_file, config.data_type,
                                         debug, debug_length)
        test_examples, test_meta, test_eval = get_spacy_processed_examples(
            config, test_examples, debug, debug_length, shuffle=False)

        # only use train data to build the counters
        counters = get_updated_counters_by_examples(
            config, None, train_examples, increment=1, init=True,
            finish=True)
        final_counters = copy.deepcopy(counters)

        save(config.train_examples_file, train_examples, message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
        save(config.counters_file, final_counters, message="counters")
    else:
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)
        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)
        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)
        final_counters = load(config.counters_file)
        counters = final_counters

    # get emb_mats and emb_dicts
    if not config.processed_emb:
        for tag in emb_tags:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                final_counters[tag], tag,
                emb_file=emb_config[tag]["emb_file"],
                size=emb_config[tag]["emb_size"],
                vec_size=emb_config[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    else:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    for k in emb_dicts:
        print("Embedding dict length: " + k + " " + str(len(emb_dicts[k])))

    # get related_words_dict and related_words_ids_mat
    if not config.processed_related_words:
        related_words_dict = get_related_words_dict(
            list(emb_dicts["word"].keys()), config.max_topN)
        related_words_ids_mat = get_related_words_ids_mat_with_related_words_dict(
            emb_dicts["word"], config.max_topN, related_words_dict)
        save(config.related_words_dict_file, related_words_dict,
             message="related words dict")
        save(config.related_words_ids_mat_file, related_words_ids_mat,
             message="related words ids mat")
    else:
        related_words_dict = load(config.related_words_dict_file)
        related_words_ids_mat = load(config.related_words_ids_mat_file)

    # get featured examples
    # TODO: handle the potential SOS/EOS insertion problem when
    # extracting tag features
    if not config.processed_example_features:
        train_examples, train_meta = get_featured_examples(
            config, train_examples, train_meta, "train", emb_dicts,
            related_words_ids_mat, related_words_dict)
        dev_examples, dev_meta = get_featured_examples(
            config, dev_examples, dev_meta, "dev", emb_dicts,
            related_words_ids_mat, related_words_dict)
        test_examples, test_meta = get_featured_examples(
            config, test_examples, test_meta, "test", emb_dicts,
            related_words_ids_mat, related_words_dict)
        save(config.train_examples_file, train_examples, message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
    else:
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)
        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)
        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)

    # print to txt to debug (block elided in the source)
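# The caching pattern above, summarized: each stage is guarded by a
# config flag and persists its outputs, so later runs can skip finished
# stages. Deleting the corresponding .pkl files forces recomputation
# (compare the NOTICE in DA_main.py's main()):
#
#   processed_by_spacy / processed_example_features -> spacy parse, counters
#   processed_emb                                   -> emb_mats, emb_dicts
#   processed_related_words                         -> related-word tables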
def __init__(self, config, emb_mats, emb_dicts, dropout=0.1):
    super().__init__()
    self.config = config
    self.config.n_best = 1
    self.dicts = emb_dicts
    self.dicts["idx2tgt"] = {v: k for k, v in emb_dicts["word"].items()}
    self.PAD = emb_dicts["word"]["<pad>"]

    # input and output embedders
    self.enc_embedder = Embedder(config, emb_mats, emb_dicts, dropout)
    if config.share_embedder:
        self.dec_embedder = self.enc_embedder
    else:
        self.dec_embedder = Embedder(config, emb_mats, emb_dicts, dropout)
    self.enc_emb_tags = config.emb_tags
    self.dec_emb_tags = ["word"]
    self.src_vocab_limit = config.emb_config["word"]["emb_size"]
    total_emb_size = self.enc_embedder.get_total_emb_dim(self.enc_emb_tags)

    if self.config.use_clue_info:
        self.clue_threshold = 0.5
        clue_embedding_dim = config.emb_config["is_overlap"]["emb_dim"]
        self.clue_embedder = nn.Embedding(
            num_embeddings=3,  # 0: PAD, 1: not overlap, 2: overlap
            embedding_dim=clue_embedding_dim,
            padding_idx=0)

    if self.config.use_style_info:
        # NOTICE: reuses the is_overlap embedding dimension
        style_embedding_dim = config.emb_config["is_overlap"]["emb_dim"]
        self.style_emb_mat = nn.Parameter(
            torch.randn(self.config.num_question_style,
                        style_embedding_dim)).to(DEVICE)
        nn.init.xavier_normal_(self.style_emb_mat, math.sqrt(3))

    # encoder
    enc_input_size = total_emb_size
    if self.config.use_clue_info:
        enc_input_size += clue_embedding_dim
    self.encoder = Encoder(config, enc_input_size, dropout)

    # decoder
    dec_input_size = config.emb_config["word"]["emb_dim"]
    self.decoder = Decoder(config, dec_input_size, dropout)
    self.decIniter = DecIniter(config)

    # generator
    self.predict_size = min(config.tgt_vocab_limit, len(emb_dicts["word"]))
    if config.use_refine_copy_tgt or config.use_refine_copy_tgt_src:
        self.predict_size = min(config.refined_tgt_vocab_limit,
                                len(emb_dicts["word"]))
    self.generator = Generator(
        config.dec_rnn_size // config.maxout_pool_size, self.predict_size)

    if self.config.copy_type in ["soft", "soft-oov"]:
        self.related_words_ids_mat = load(config.related_words_ids_mat_file)
        # mask[i, j] == 1 iff word j is among the top-N related words of
        # word i (-1 entries in the id matrix are padding)
        self.related_words_mask = torch.zeros(
            [self.predict_size, self.predict_size]).to(DEVICE)
        for i in range(self.predict_size):
            related_ids = [
                j for j in self.related_words_ids_mat[i] if j != -1
            ][:self.config.soft_copy_topN]
            self.related_words_mask[i, related_ids] = 1
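# Toy illustration of the related-words mask construction above, runnable
# standalone (the 4-word vocabulary and id matrix are made up):
import torch

predict_size = 4
related_words_ids_mat = [[1, 2, -1, -1], [0, -1, -1, -1],
                         [3, 0, -1, -1], [2, -1, -1, -1]]
soft_copy_topN = 2
mask = torch.zeros(predict_size, predict_size)
for i in range(predict_size):
    related = [j for j in related_words_ids_mat[i] if j != -1][:soft_copy_topN]
    mask[i, related] = 1
# mask[0] == tensor([0., 1., 1., 0.]): word 0 may soft-copy words 1 and 2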
def main(args):
    # import model according to input args
    if args.net == "FQG":
        from model.FQG_model import FQG as Model
    else:
        print("Unknown net type; defaulting to the FQG model.")
        from model.FQG_model import FQG as Model

    # configure according to input args and some experience
    emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.brnn = True
    args.lower = True
    args.share_embedder = True

    # configure for the complete experiment and ablation models;
    # get checkpoint save path
    args_for_checkpoint_folder_name = [
        args.net, args.data_type, args.copy_type, args.copy_loss_type,
        args.soft_copy_topN, args.only_copy_content, args.use_vocab_mask,
        args.use_clue_info, args.use_style_info, args.use_refine_copy_tgt,
        args.use_refine_copy_src, args.use_refine_copy_tgt_src,
        args.beam_size
    ]  # NOTICE: change here. Also note that debug mode will replace the model.
    save_dir = args.checkpoint_dir
    args.checkpoint_dir = get_checkpoint_dir(save_dir,
                                             args_for_checkpoint_folder_name)
    if args.mode != "train":
        # NOTICE: this overrides any --resume value
        args.resume = args.checkpoint_dir + "model_best.pth.tar"
    print(args)

    # set device, random seed, logger
    device, use_cuda, n_gpu = set_device(args.no_cuda)
    set_random_seed(args.seed)
    # logger = set_logger(args.log_file)
    logger = None

    # check whether data preprocessing is needed; if yes, preprocess
    if args.not_processed_data:
        # use --not_processed_data --spacy_not_processed_data for a
        # complete prepro
        prepro(args)

    # data
    emb_mats = load(args.emb_mats_file)
    emb_dicts = load(args.emb_dicts_file)
    train_dataloader = get_loader(args, emb_dicts, args.train_examples_file,
                                  args.batch_size, shuffle=True)
    dev_dataloader = get_loader(args, emb_dicts, args.dev_examples_file,
                                args.batch_size, shuffle=False)
    test_dataloader = get_loader(args, emb_dicts, args.test_examples_file,
                                 args.batch_size, shuffle=False)

    # model
    model = Model(args, emb_mats, emb_dicts)
    summarize_model(model)
    if use_cuda and args.use_multi_gpu and n_gpu > 1:
        if EXP_PLATFORM.lower() == "venus":
            pass
        else:
            model = nn.DataParallel(model)
    model.to(device)
    partial_models = None
    partial_resumes = None
    partial_trainables = None

    # optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    for p in parameters:
        if p.dim() == 1:
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        elif list(p.shape) == [args.tgt_vocab_limit, 300]:
            print("omit embeddings.")
        else:
            nn.init.xavier_normal_(p, math.sqrt(3))
    optimizer = Optim(args.optim, args.learning_rate,
                      max_grad_norm=args.max_grad_norm,
                      max_weight_value=args.max_weight_value,
                      lr_decay=args.learning_rate_decay,
                      start_decay_at=args.start_decay_at,
                      decay_bad_count=args.halve_lr_bad_count)
    optimizer.set_parameters(model.parameters())
    scheduler = None
    loss = {}
    loss["P"] = torch.nn.CrossEntropyLoss()
    loss["D"] = torch.nn.BCEWithLogitsLoss(reduction="sum")

    # trainer
    trainer = Trainer(args, model,
                      train_dataloader=train_dataloader,
                      dev_dataloader=dev_dataloader,
                      loss=loss, optimizer=optimizer, scheduler=scheduler,
                      device=device, emb_dicts=emb_dicts, logger=logger,
                      partial_models=partial_models,
                      partial_resumes=partial_resumes,
                      partial_trainables=partial_trainables)

    # start train/eval/test model
    start = datetime.now()
    if args.mode == "train":
        trainer.train()
    elif args.mode == "eval_train":
        args.use_ema = False
        trainer.eval(train_dataloader, args.train_eval_file,
                     args.train_output_file)
    elif args.mode in ["eval", "evaluation", "valid", "validation"]:
        args.use_ema = False
        trainer.eval(dev_dataloader, args.dev_eval_file,
                     args.eval_output_file)
    elif args.mode == "test":
        args.use_ema = False
        trainer.eval(test_dataloader, args.test_eval_file,
                     args.test_output_file)
    else:
        print("Error: set mode to train, eval, or test.")
    print("Time of {} model: {}".format(args.mode, datetime.now() - start))
def evaluate_and_filter(input_file, input_augmented_pkl_file, output_file):
    augmented_examples = load(input_augmented_pkl_file)
    pid_sid_ans2ans_labels = {}
    for e in augmented_examples:
        pid = e["pid"]
        sid = e["sid"]
        for info in e["selected_info_processed"]:
            ans = info["answer_text"]
            compound_id = str(pid) + "_" + str(sid) + "_" + str(ans)
            ans_chunk_label = info["answer_chunk_tag"]
            ans_ner_label = get_answer_ner_tag(
                e["ans_sent_doc"], ans, processed_by_spacy=True)
            if compound_id not in pid_sid_ans2ans_labels:
                pid_sid_ans2ans_labels[compound_id] = {
                    "ans_chunk_label": ans_chunk_label,
                    "ans_ner_label": ans_ner_label
                }

    outfile = open(output_file, 'w', encoding='utf8')
    with codecs.open(input_file, encoding='utf8') as infile:
        lines = infile.readlines()
        i = 0
        for line in lines:
            line_split = str(line).rstrip().split("\t")
            # column layout matches get_qa_input_file in QG_augment_main.py
            example_pid = line_split[0]
            example_sid = line_split[1]
            q = line_split[2]
            example_ans_sent = line_split[3]
            example_answer_text = line_split[4]
            example_paragraph = line_split[7]

            paragraph_readibility = get_readibility(example_paragraph)
            paragraph_perplexity = get_perplexity(example_paragraph)
            paragraph_length = len(example_paragraph.split())

            ans_sent_readibility = get_readibility(example_ans_sent)
            ans_sent_perplexity = get_perplexity(example_ans_sent)
            ans_sent_length = len(example_ans_sent.split())

            question_readibility = get_readibility(q)
            question_perplexity = get_perplexity(q)
            question_length = len(q.split())
            question_type_text, question_type_id = get_question_type(q)

            answer_readibility = get_readibility(example_answer_text)
            answer_perplexity = get_perplexity(example_answer_text)
            compound_id = (str(example_pid) + "_" + str(example_sid) +
                           "_" + str(example_answer_text))
            answer_chunk_tag = pid_sid_ans2ans_labels[compound_id][
                "ans_chunk_label"]
            answer_ner_tag = pid_sid_ans2ans_labels[compound_id][
                "ans_ner_label"]
            answer_length = len(example_answer_text.split())

            # TODO: filter here !!!
            if i == 0:
                head = "\t".join([
                    "pid", "sid", "question", "ans_sent", "answer",
                    "s_char_start", "s_char_end", "paragraph",
                    "p_char_start", "p_char_end", "entailment_score",
                    "p_readibility", "p_perplexity", "p_length",
                    "s_readibility", "s_perplexity", "s_length",
                    "q_readibility", "q_perplexity", "q_length",
                    "q_type", "q_type_id",
                    "a_readibility", "a_perplexity", "a_length",
                    "a_chunk_tag", "a_ner_tag"
                ])
                outfile.write(head + "\n")
            line_split += [
                paragraph_readibility, paragraph_perplexity, paragraph_length,
                ans_sent_readibility, ans_sent_perplexity, ans_sent_length,
                question_readibility, question_perplexity, question_length,
                question_type_text, question_type_id,
                answer_readibility, answer_perplexity, answer_length,
                answer_chunk_tag, answer_ner_tag
            ]
            output_list = [str(item) for item in line_split]
            outfile.write(
                "\t".join(output_list).rstrip().replace("\n", "\\n") + "\n")
            i = i + 1
    outfile.close()
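# A hedged sketch of what the "TODO: filter here" could look like: a
# predicate over the metrics computed above. The threshold values are
# illustrative assumptions, not values from the source.
def keep_example(q_perplexity, q_length, a_length,
                 max_perplexity=500.0, min_q_len=4, max_a_len=30):
    """Drop questions that are too unlikely under the language model,
    too short, or whose answers are too long."""
    return (q_perplexity <= max_perplexity and
            q_length >= min_q_len and
            a_length <= max_a_len)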
def main(args):
    # import model according to input args
    if args.net == "FQG":
        from model.FQG_model import FQG as Model
    else:
        print("Unknown net type; defaulting to the FQG model.")
        from model.FQG_model import FQG as Model

    # configure according to input args and some experience
    emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.brnn = True
    args.lower = True
    args.share_embedder = True

    # configure for the complete experiment and ablation models;
    # get checkpoint save path
    args_for_checkpoint_folder_name = [
        args.net, args.data_type, args.copy_type, args.copy_loss_type,
        args.soft_copy_topN, args.only_copy_content, args.use_vocab_mask,
        args.use_clue_info, args.use_style_info, args.use_refine_copy_tgt,
        args.use_refine_copy_src, args.use_refine_copy_tgt_src,
        args.beam_size
    ]  # NOTICE: keep this identical to QG_main.py, otherwise the
    # checkpoint folder will not be found.
    save_dir = args.checkpoint_dir
    args.checkpoint_dir = get_checkpoint_dir(save_dir,
                                             args_for_checkpoint_folder_name)
    # args.mode = "test"
    # if args.mode != "train":
    # NOTICE: this overrides any --resume value
    args.resume = args.checkpoint_dir + "model_best.pth.tar"
    print(args)

    # set device, random seed, logger
    device, use_cuda, n_gpu = set_device(args.no_cuda)
    set_random_seed(args.seed)
    # logger = set_logger(args.log_file)
    logger = None

    # check whether data preprocessing is needed; if yes, preprocess
    # if args.mode == "prepro":
    prepro(args, args.da_augmented_sentences_file,
           args.qg_augmented_sentences_file)
    # return

    # data
    emb_mats = load(args.emb_mats_file)
    emb_dicts = load(args.emb_dicts_file)
    dataloader = get_loader(args, emb_dicts,
                            args.qg_augmented_sentences_file,
                            args.batch_size, shuffle=False)

    # model
    model = Model(args, emb_mats, emb_dicts)
    summarize_model(model)
    if use_cuda and args.use_multi_gpu and n_gpu > 1:
        model = nn.DataParallel(model)
    model.to(device)
    partial_models = None
    partial_resumes = None
    partial_trainables = None

    # optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    for p in parameters:
        if p.dim() == 1:
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        elif list(p.shape) == [args.tgt_vocab_limit, 300]:
            print("omit embeddings.")
        else:
            nn.init.xavier_normal_(p, math.sqrt(3))
    optimizer = Optim(args.optim, args.learning_rate,
                      max_grad_norm=args.max_grad_norm,
                      max_weight_value=args.max_weight_value,
                      lr_decay=args.learning_rate_decay,
                      start_decay_at=args.start_decay_at,
                      decay_bad_count=args.halve_lr_bad_count)
    optimizer.set_parameters(model.parameters())
    scheduler = None
    loss = {}
    loss["P"] = torch.nn.CrossEntropyLoss()
    loss["D"] = torch.nn.BCEWithLogitsLoss(reduction="sum")

    # trainer
    trainer = Trainer(args, model,
                      train_dataloader=None, dev_dataloader=None,
                      loss=loss, optimizer=optimizer, scheduler=scheduler,
                      device=device, emb_dicts=emb_dicts, logger=logger,
                      partial_models=partial_models,
                      partial_resumes=partial_resumes,
                      partial_trainables=partial_trainables)

    # generate questions for the augmented sentences
    start = datetime.now()
    args.use_ema = False
    trainer.test(dataloader, args.qg_result_file)
    get_qa_input_file(args.qg_result_file, args.da_paragraphs_file,
                      args.qa_data_file)
    # TODO: remove duplicate examples; different clues and styles may
    # generate the same question.
    print("Time of {} model: {}".format(args.mode, datetime.now() - start))
def prepro(config):
    emb_tags = config.emb_tags
    emb_config = config.emb_config
    emb_mats = {}
    emb_dicts = {}
    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get examples and counters
    if not config.processed_example_features:
        examples = get_raw_examples(config, config.train_file, debug,
                                    debug_length)
        examples = get_featured_examples(config, examples)
        counters = get_counters(examples, config.emb_tags,
                                config.emb_not_count_tags)
        # num_relations is 0 until graph features are built below
        save(config.train_examples_file, (examples, 0), message="examples")
        save(config.counters_file, counters, message="counters")
    else:
        examples, num_relations = load(config.train_examples_file)
        counters = load(config.counters_file)

    # get emb_mats and emb_dicts
    if not config.processed_emb:
        for tag in emb_tags:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                counters[tag], tag,
                emb_file=emb_config[tag]["emb_file"],
                size=emb_config[tag]["emb_size"],
                vec_size=emb_config[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    else:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    for k in emb_dicts:
        print("Embedding dict length: " + k + " " + str(len(emb_dicts[k])))

    if not config.processed_example_graph_features:
        # NOTICE: set update_edge_types2ids=True only for the train dataset
        # if config.processed_emb and "edge_types" in emb_dicts:
        #     edge_types2ids = emb_dicts["edge_types"]
        # else:
        edge_types2ids = {}
        examples, num_relations, edge_types2ids = get_graph_examples(
            config, examples, config.edge_types_list, emb_dicts,
            edge_types2ids, update_edge_types2ids=True)
        emb_dicts["edge_types"] = edge_types2ids
        save(config.train_examples_file, (examples, num_relations),
             message="examples")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")

    # print to txt to debug
    for k in emb_dicts:
        write_dict(emb_dicts[k],
                   OUTPUT_PATH + "debug/emb_dicts_" + str(k) + ".txt")
    for k in counters:
        write_counter(counters[k],
                      OUTPUT_PATH + "debug/counters_" + str(k) + ".txt")
    write_example(examples[5], OUTPUT_PATH + "debug/example.txt")
def get_loader(examples_file, batch_size, shuffle=False, debug=False,
               debug_length=20):
    # NOTE: batch_size is currently unused; the function returns a plain
    # list of Data objects rather than a batched loader.
    examples, num_relations = load(examples_file)
    # print("num_relations: ", num_relations)
    data_list = []
    feature_dim = None
    num_e = 0
    for e in examples:
        num_e += 1
        feature_list = [
            e["G_data"].count_query,
            e["G_data"].count_title,
            e["G_data"].is_digit,
            e["G_data"].is_punct,
            e["G_data"].is_stop,
            e["G_data"].is_special,
            # normalize word length
            [word_len / 10.0 for word_len in e["G_data"].word_len],
            # normalize word position
            [id_val / 20.0 for id_val in e["G_data"].id]
        ]
        if feature_dim is None:
            feature_dim = len(feature_list)
        emb_ids_dict = {
            "word_id": torch.LongTensor(e["G_data"].word_id).unsqueeze(0),
            "tag_id": torch.LongTensor(e["G_data"].tag_id).unsqueeze(0),
            "is_digit_id":
                torch.LongTensor(e["G_data"].is_digit_id).unsqueeze(0),
            "is_punct_id":
                torch.LongTensor(e["G_data"].is_punct_id).unsqueeze(0),
            "is_stop_id":
                torch.LongTensor(e["G_data"].is_stop_id).unsqueeze(0),
            "is_special_id":
                torch.LongTensor(e["G_data"].is_special_id).unsqueeze(0)
        }
        # 1 * num_nodes * num_features
        x = torch.FloatTensor(feature_list).t().unsqueeze(0).contiguous()
        edge_index = e["G_data"].edge_index
        edge_type = e["G_data"].edge_type_id
        # print("DEBUG node_idx: ", e["G_data"].node_index)
        # print("DEBUG edge_idx: ", edge_index)
        # print("DEBUG edge_type: ", edge_type)
        y = torch.LongTensor(e["G_data"].y_phrase)
        y_node_type = torch.LongTensor(e["G_data"].y_node_type)
        words = e["G_data"].word
        data_list.append(
            Data(x=x, edge_type=edge_type, edge_index=edge_index, y=y,
                 words=words, y_node_type=y_node_type,
                 emb_ids_dict=emb_ids_dict,
                 G_for_decode=e["G_for_decode"],
                 queries_features=e["queries_features"],
                 titles_features=e["titles_features"],
                 phrase_features=e["phrase_features"]))
        if debug and num_e >= debug_length:
            break
    # shuffle once after collecting all examples (the original shuffled
    # inside the loop, which re-shuffled the list on every iteration)
    if shuffle:
        random.shuffle(data_list)
    return data_list, num_relations, feature_dim
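# Usage sketch (file name illustrative): get_loader returns a plain list
# of torch_geometric Data objects plus metadata, not a batched loader,
# so callers iterate the list directly:
#
#   data_list, num_relations, feature_dim = get_loader(
#       "train-examples.pkl", batch_size=3, shuffle=True)
#   for data in data_list:
#       out = model(data)   # hypothetical forward call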
def main(args):
    # get revised args
    # NOTICE: this is our default data organization structure;
    # change it if yours differs.
    # original_data_folder = DATA_PATH + "original/" + args.data_type + "/"
    # (data_type is "event" or "concept")
    processed_data_folder = os.path.join(DATA_PATH, 'processed',
                                         args.data_type)
    args.train_examples_file = os.path.join(processed_data_folder,
                                            'train-examples.pkl')
    args.dev_examples_file = os.path.join(processed_data_folder,
                                          'dev-examples.pkl')
    args.test_examples_file = os.path.join(processed_data_folder,
                                           'test-examples.pkl')
    args.train_output_file = os.path.join(processed_data_folder,
                                          'train_output.txt')
    args.eval_output_file = os.path.join(processed_data_folder,
                                         'eval_output.txt')
    args.test_output_file = os.path.join(processed_data_folder,
                                         'test_output.txt')
    args.emb_mats_file = os.path.join(processed_data_folder, 'emb_mats.pkl')
    args.emb_dicts_file = os.path.join(processed_data_folder,
                                       'emb_dicts.pkl')
    args.counters_file = os.path.join(processed_data_folder, 'counters.pkl')

    # get checkpoint save path
    args_for_checkpoint_folder_name = [
        args.net, args.data_type, "_".join(args.tasks),
        "_".join(args.emb_tags), "_".join(args.edge_types_list),
        args.d_model, args.layers, args.num_bases, args.lr, args.debug
    ]  # NOTICE: change here
    save_dir = args.checkpoint_dir
    args.output_file_prefix = "_".join(
        [str(s) for s in args_for_checkpoint_folder_name])
    args.checkpoint_dir = get_checkpoint_dir(save_dir,
                                             args_for_checkpoint_folder_name)
    if args.mode != "train":
        # NOTICE: this overrides any --resume value
        args.resume = args.checkpoint_dir + "model_best.pth.tar"
    print(args)

    # set device, random seed, logger
    device, use_cuda, n_gpu = set_device(args.no_cuda)
    set_random_seed(args.seed)
    # logger = set_logger(args.log_file)

    # check whether data preprocessing is needed; if yes, preprocess
    if args.not_processed_data:
        # use --not_processed_data --spacy_not_processed_data for a
        # complete prepro
        prepro(args)

    # data
    emb_mats = load(args.emb_mats_file)
    emb_dicts = load(args.emb_dicts_file)
    data_list, num_relations, feature_dim = get_loader(
        args.train_examples_file, 3, shuffle=True, debug=args.debug)
    print("num_relations: ", num_relations)
    # num_relations = len(emb_dicts["edge_types"])  # !!!
    # split 80% / 10% / 10% into train / dev / test
    train_dataloader = data_list[0:math.floor(0.8 * len(data_list))]
    dev_dataloader = data_list[math.floor(0.8 * len(data_list)):
                               math.floor(0.9 * len(data_list))]
    test_dataloader = data_list[math.floor(0.9 * len(data_list)):]

    # model
    model = Model(config=args,
                  in_channels=feature_dim,
                  out_channels=args.d_model,
                  num_relations=num_relations,
                  num_bases=args.num_bases,
                  emb_mats=emb_mats,
                  emb_dicts=emb_dicts,
                  dropout=0.1)  # TODO: set these according to args
    summarize_model(model)
    if use_cuda and args.use_multi_gpu and n_gpu > 1:
        model = nn.DataParallel(model)
    model.to(device)
    print("successfully get model")

    # optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # for p in parameters:
    #     if p.dim() == 1:
    #         p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
    #     # elif list(p.shape) == [args.tgt_vocab_limit, 300]:
    #     #     print("omit embeddings.")
    #     else:
    #         nn.init.xavier_normal_(p, math.sqrt(3))
    optimizer = torch.optim.Adam(params=parameters,
                                 lr=args.lr,
                                 betas=(args.beta1, args.beta2),
                                 eps=1e-8,
                                 weight_decay=3e-7)
    # logarithmic learning-rate warm-up
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1)
        if ee < args.lr_warm_up_num else 1)
    loss = {}
    # loss["P"] = torch.nn.CrossEntropyLoss()
    # loss["D"] = torch.nn.BCEWithLogitsLoss(reduction="sum")

    # trainer
    trainer = Trainer(args, model,
                      train_dataloader=train_dataloader,
                      dev_dataloader=dev_dataloader,
                      test_dataloader=test_dataloader,
                      loss=loss, optimizer=optimizer, scheduler=scheduler,
                      device=device, emb_dicts=emb_dicts)

    # start train/eval/test model
    start = datetime.now()
    if args.mode.lower() == "train":
        trainer.train()
    elif args.mode.lower() == "eval_train":
        args.use_ema = False
        train_output_file = (RESULT_PATH + "train_output." +
                             args.output_file_prefix + ".txt")
        trainer.eval(train_dataloader, train_output_file)
    elif args.mode.lower() in [
            "eval", "evaluation", "valid", "validation", "eval_dev"
    ]:
        args.use_ema = False
        eval_output_file = (RESULT_PATH + "dev_output." +
                            args.output_file_prefix + ".txt")
        trainer.eval(dev_dataloader, eval_output_file)
    elif args.mode.lower() in ["eval_test"]:
        args.use_ema = False
        test_output_file = (RESULT_PATH + "test_output." +
                            args.output_file_prefix + ".txt")
        trainer.eval(test_dataloader, test_output_file)
    elif args.mode.lower() == "test":
        args.use_ema = False
        test_output_file = (RESULT_PATH + "test_output." +
                            args.output_file_prefix + ".txt")
        trainer.test(test_dataloader, test_output_file)
    else:
        print("Error: set mode to train, eval, test, or eval_train.")
    print("Time of {} model: {}".format(args.mode, datetime.now() - start))
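# Warm-up behavior of the LambdaLR scheduler above, e.g. with
# lr_warm_up_num = 1000 (so cr = 1 / ln(1000) ≈ 0.145):
#   step 0   -> lr factor cr * ln(1)    = 0.0
#   step 99  -> lr factor cr * ln(100)  ≈ 0.667
#   step 999 -> lr factor cr * ln(1000) = 1.0, held at 1.0 afterwards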
    'PP-FAC': Counter({'What': 3, 'Which': 1}),
    'UCP-UNK': Counter({'What': 20, 'Which': 3, 'Who': 2, 'Where': 1}),
    'ADVP-GPE': Counter({'Which': 2, 'What': 1, 'Where': 1}),
    'PP-LOC': Counter({'What': 4, 'Where': 1}),
    'UCP-DATE': Counter({'How': 1}),
    'ADVP-ORG': Counter({'What': 3, 'Who': 2, 'Which': 1}),
    'PP-LAW': Counter({'What': 1, 'Which': 1}),
    'ADVP-CARDINAL': Counter({'How': 1, 'When': 1}),
    'PP-EVENT': Counter({'What': 3}),
    'UCP-PERSON': Counter({'What': 1}),
    'PP-MONEY': Counter({'How': 3}),
    'PP-QUANTITY': Counter({'What': 3, 'How': 1}),
    'PP-NORP': Counter({'What': 2})}
"""

answertag2qtype_infos = load(ANSWERTAG2QTYPE_FILE_PATH)
# refined set: remove low-frequency question types
ANSWERTAG2QTYPE_SET = answertag2qtype_infos["answertag2qtype_set"]
ANSWERTAG2QTYPE_COUNTER = answertag2qtype_infos["answertag2qtype_counter"]

print("Before deleting low-frequency types: ==============")
for k in ANSWERTAG2QTYPE_SET:
    print(k)
    print(ANSWERTAG2QTYPE_SET[k])

for ans_tag in ANSWERTAG2QTYPE_SET:
    counter = ANSWERTAG2QTYPE_COUNTER[ans_tag]
    # drop question types that make up less than 3% of this tag's counts;
    # discard() is safe even if the type is not in the set
    threshold = 0.03 * sum(counter.values())
    for q_type in counter:
        if counter[q_type] < threshold:
            ANSWERTAG2QTYPE_SET[ans_tag].discard(q_type)
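# Worked example of the 3% pruning above. For
# Counter({'What': 20, 'Which': 3, 'Who': 2, 'Where': 1}) (the UCP-UNK
# entry in the docstring), total = 26 and threshold = 0.78, so nothing is
# pruned. For a skewed Counter({'What': 97, 'Who': 2, 'When': 1}),
# threshold = 3.0 and both 'Who' and 'When' are discarded.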