Example #1
    def __init__(self, config, emb_dicts, examples_file):
        self.examples = load(examples_file)
        self.num_auginfos_per_example = [
            len(e["selected_info_processed"]) for e in self.examples
        ]
        # cumulative counts: num_auginfos[i] covers examples[0..i];
        # the last entry is therefore the total dataset size
        self.num_auginfos = np.cumsum(self.num_auginfos_per_example)
        self.num = self.num_auginfos[-1]
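A minimal sketch of how the cumulative counts above could resolve a flat sample index back to its parent example, assuming a searchsorted-based lookup (the names and data here are illustrative, not from the repository):

import numpy as np

num_auginfos_per_example = [3, 1, 4]       # stand-in counts
cum = np.cumsum(num_auginfos_per_example)  # array([3, 4, 8])

def locate(flat_idx):
    # first example whose cumulative count exceeds flat_idx
    example_idx = int(np.searchsorted(cum, flat_idx, side="right"))
    prev = cum[example_idx - 1] if example_idx > 0 else 0
    return example_idx, flat_idx - prev    # (example, local aug-info index)

assert locate(0) == (0, 0)
assert locate(3) == (1, 0)  # first aug-info of the second example
assert locate(7) == (2, 3)  # last of the 8 total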
Example #2
    def eval(self, dataloader, eval_file, output_file):
        eval_dict = load(eval_file)
        result = self._valid(eval_dict, dataloader)
        print("eval: " + self._result2string(result, self.result_keys))
        if output_file is not None:
            with open(output_file, 'w', encoding='utf8') as outfile:
                json.dump(result, outfile)
        return result
Example #3
def main(args):
    # prepro files
    CURRENT_PATH = os.getcwd().split("/")
    DATA_PATH = "/".join(CURRENT_PATH[:-4]) + "/Datasets/"

    DATA_ACS_INFO_FILE_PATH = DATA_PATH + "processed/SQuAD1.1-Zhou/squad_ans_clue_style_info.pkl"
    SAMPLE_PROBS_FILE_PATH = DATA_PATH + "processed/SQuAD1.1-Zhou/squad_sample_probs.pkl"
    SQUAD_FILE = DATA_PATH + "original/SQuAD1.1-Zhou/train.txt"

    # NOTICE: delete these files to force regeneration; otherwise the cached versions are reused and nothing is re-calculated.
    if not os.path.isfile(
            SAMPLE_PROBS_FILE_PATH) or args.not_processed_sample_probs_file:
        print(SAMPLE_PROBS_FILE_PATH +
              " does not exist.\nGenerating these files now.\n")
        # generate the mapping dict and sample probs, then save them to file
        get_sample_probs(filename=SQUAD_FILE,
                         filetype="squad",
                         save_dataset_info_file=DATA_ACS_INFO_FILE_PATH,
                         save_sample_probs_file=SAMPLE_PROBS_FILE_PATH,
                         sent_limit=100,
                         ques_limit=50,
                         answer_limit=30,
                         is_clue_topN=20,
                         debug=args.debug,
                         debug_length=20,
                         answer_length_bin_width=3,
                         answer_length_min_val=0,
                         answer_length_max_val=30,
                         clue_dep_dist_bin_width=2,
                         clue_dep_dist_min_val=0,
                         clue_dep_dist_max_val=20)

    SAMPLE_PROBS = load(SAMPLE_PROBS_FILE_PATH)
    print(SAMPLE_PROBS_FILE_PATH + " loaded.\n")

    # execute tasks
    if args.debug:
        args.da_start_index = 0
        args.da_end_index = 10

    if args.da_task == "file2sentences":
        file2sentences(args.da_input_file,
                       args.da_input_type,
                       args.da_sentences_file,
                       args.da_paragraphs_file,
                       max_plength=args.para_limit,
                       max_length=args.sent_limit)
    if args.da_task == "sentences2augmented_sentences":
        sentences2augmented_sentences(
            args.da_sentences_file, args.da_augmented_sentences_file,
            args.da_start_index, args.da_end_index, SAMPLE_PROBS,
            args.num_sample_answer, args.num_sample_clue,
            args.num_sample_style, args.max_sample_times)
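A hedged sketch of driving the two tasks above programmatically instead of through argparse; args is a stand-in namespace, the paths are placeholders, and the numeric values are illustrative defaults, not the repository's:

from types import SimpleNamespace

args = SimpleNamespace(
    da_task="file2sentences",
    da_input_file="train.txt", da_input_type="squad",
    da_sentences_file="sentences.pkl", da_paragraphs_file="paragraphs.pkl",
    da_augmented_sentences_file="augmented.pkl",
    da_start_index=0, da_end_index=10000,
    para_limit=400, sent_limit=100,
    num_sample_answer=5, num_sample_clue=2, num_sample_style=2,
    max_sample_times=4, debug=False,
    not_processed_sample_probs_file=False)
main(args)  # runs file2sentences; change da_task to run the second stage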
Example #4
def prepro(config, augmented_sentences_pkl_file,
           processed_augmented_sentences_pkl_file):
    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get train spacy processed examples and counters
    examples = load(augmented_sentences_pkl_file)
    examples = get_spacy_processed_examples(config,
                                            examples,
                                            debug,
                                            debug_length,
                                            shuffle=False)

    # get emb_mats and emb_dicts
    emb_dicts = load(config.emb_dicts_file)

    # get featured examples
    examples = get_featured_examples(config, examples, emb_dicts)
    save(processed_augmented_sentences_pkl_file,
         examples,
         message="processed_augmented_sentences_pkl_file")
Example #5
def get_augmented_sents_examples(augmented_sentences_pkl_file,
                                 debug=False,
                                 debug_length=20,
                                 sent_limit=100,
                                 ans_limit=30):
    """
    This is used to load the augmented sentences data that generated by DA_main.py
    """
    examples = load(augmented_sentences_pkl_file)
    result = []

    para_id = 0
    for example in tqdm(examples):
        ans_sent = example["context"]

        for info in example["selected_infos"]:
            answer_text = info["answer"]["answer_text"]
            answer_start = info["answer"]["char_start"]
            # filter
            answer_bio_ids = info["answer"]["answer_bio_ids"]
            answer_length = answer_bio_ids.count("B") + answer_bio_ids.count(
                "I")
            if (len(example["ans_sent_doc"]) > sent_limit
                    or answer_length > ans_limit):
                continue
            for clue in info["clues"]:
                clue_text = clue["clue_text"]
                clue_start = ans_sent.find(clue_text)
                if clue_start < 0:  # str.find returns -1 when the clue is absent
                    continue

                for style_text in info["styles"]:
                    output_e = {
                        "paragraph": ans_sent,
                        "question": "",
                        "ques_type": style_text,  # string type
                        "answer": answer_text,
                        "answer_start": answer_start,
                        "clue": clue_text,
                        "clue_start": clue_start,
                        "para_id": example["sid"]
                    }  # our "paragraph" is actually a single sentence
                    result.append(output_e)
        para_id += 1
        if debug and para_id >= debug_length:
            break
    return result
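The nested loops above fan each retained answer out over its clues and styles, so one sentence can yield many rows. A worked count with made-up sizes:

# illustrative sizes only
num_answers, num_clues, num_styles = 3, 2, 4
num_rows = num_answers * num_clues * num_styles
print(num_rows)  # 24 examples from a single sentence (before filtering)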
Example #6
    def __init__(self, config, emb_dicts, examples_file):
        self.examples = load(examples_file)
        self.num = len(self.examples)
        # print(self.examples[0])

        # refine examples according to config here.
        start = datetime.now()

        # change 1: get is_clue by clue top N.
        for example in self.examples:
            # start revise ans_sent_is_clue by clue augmenter
            # TODO: consider moving this into prepro if the strategy performs well; that would also save time.
            clue_info = FQG_data_augmentor.get_clue_info(
                example["question"],
                example["ans_sent"],
                example["answer"],
                None,
                chunklist=None,
                y1_in_sent=example["y1_in_sent"],
                doc=example["ans_sent_doc"],
                ques_doc=example["ques_doc"],
                sent_limit=config.sent_limit)
            example["ans_sent_is_clue"] = clue_info[
                "selected_clue_binary_ids_padded"]
            example["ans_sent_is_clue_ids"] = feature2ids(
                example["ans_sent_is_clue"], emb_dicts["is_clue"],
                len(example["ans_sent_doc"]), config.sent_limit)
            # end revise ans_sent_is_clue by clue augmenter

            example["switch_soft"] = (
                ((example["switch_soft"] > 0).astype(float) +
                 (example["switch_soft"] <= config.soft_copy_topN +
                  1).astype(float)) == 2.0).astype(float)
            example["copy_position_soft"] = example[
                "copy_position_soft"] * example["switch_soft"]

            # if config.debug:
            #     print("clue_info: ", clue_info)
            #     print("sentence: ", example["ans_sent"])
            #     print("question: ", example["question"])
            #     print("answer: ", example["answer"])

        # change 2: refine vocabulary
        if (config.use_refine_copy_tgt or config.use_refine_copy_src
                or config.use_refine_copy_tgt_src):

            assert (config.refined_src_vocab_limit <= config.tgt_vocab_limit)
            assert (config.refined_tgt_vocab_limit <=
                    config.refined_src_vocab_limit)
            assert (config.refined_copy_vocab_limit <=
                    config.refined_tgt_vocab_limit)

            OOV_id = emb_dicts["word"]["<oov>"]

            for i in range(self.num):
                # refine switch and copy_position
                example = self.examples[i]
                switch = np.zeros(config.ques_limit, dtype=np.int32)
                copy_position = np.zeros(config.ques_limit, dtype=np.int32)
                tgt = np.zeros(config.ques_limit, dtype=np.int32)

                # iterate over question tokens
                for idx, tgt_word in enumerate(example["ques_tokens"]):
                    # get question token's word index
                    word_idx = None
                    for each in (tgt_word, tgt_word.lower(),
                                 tgt_word.capitalize(), tgt_word.upper()):
                        if each in emb_dicts["word"]:
                            word_idx = emb_dicts["word"][each]
                            break

                    # get refined copy
                    compare_idx = word_idx
                    OOV_idx = OOV_id  # same <oov> index as fetched above

                    # oov or low-freq as copy target
                    if (compare_idx is None) or \
                            (compare_idx >= config.refined_copy_vocab_limit) or \
                            compare_idx == OOV_idx:
                        if tgt_word.lower() in example["src_tokens"]:
                            switch[idx] = 1
                            # NOTICE: .index() returns only the first match;
                            # tgt_word may appear multiple times in src_tokens
                            copy_position[idx] = \
                                example["src_tokens"].index(tgt_word.lower())

                    # get refined tgt
                    if (config.use_refine_copy_tgt
                            or config.use_refine_copy_tgt_src):
                        if (compare_idx is None) or \
                                (compare_idx >= config.refined_tgt_vocab_limit) or \
                                compare_idx == OOV_idx:
                            tgt[idx] = OOV_id
                        else:
                            tgt[idx] = word_idx

                # assign new values
                self.examples[i]["switch_oov"] = switch
                self.examples[i]["copy_position_oov"] = copy_position

                # refine tgt ids
                if (config.use_refine_copy_tgt
                        or config.use_refine_copy_tgt_src):
                    self.examples[i]["tgt"] = tgt

                # refine src ids
                if (config.use_refine_copy_src
                        or config.use_refine_copy_tgt_src):
                    c_mask = (example['ans_sent_word_ids'] >=
                              config.refined_src_vocab_limit)
                    self.examples[i]['ans_sent_word_ids'] = \
                        c_mask * OOV_id + \
                        (1 - c_mask) * example['ans_sent_word_ids']
                    q_mask = (example['ques_word_ids'] >=
                              config.refined_src_vocab_limit)
                    self.examples[i]['ques_word_ids'] = \
                        q_mask * OOV_id + \
                        (1 - q_mask) * example['ques_word_ids']

        print("num_total_examples: ", len(self.examples))
        print(("Time of refine data: {}").format(datetime.now() - start))
Example #7
def prepro(config):
    emb_tags = config.emb_config.keys()
    emb_config = config.emb_config
    emb_mats = {}
    emb_dicts = {}

    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get train spacy processed examples and counters
    if not config.processed_by_spacy and not config.processed_example_features:
        train_examples = get_raw_examples(config.train_file, config.data_type,
                                          debug, debug_length)
        train_examples, train_meta, train_eval = get_spacy_processed_examples(
            config, train_examples, debug, debug_length, shuffle=False)

        dev_examples = get_raw_examples(config.dev_file, config.data_type,
                                        debug, debug_length)
        dev_examples, dev_meta, dev_eval = get_spacy_processed_examples(
            config, dev_examples, debug, debug_length, shuffle=False)

        test_examples = get_raw_examples(config.test_file, config.data_type,
                                         debug, debug_length)
        test_examples, test_meta, test_eval = get_spacy_processed_examples(
            config, test_examples, debug, debug_length, shuffle=False)

        counters = get_updated_counters_by_examples(config,
                                                    None,
                                                    train_examples,
                                                    increment=1,
                                                    init=True,
                                                    finish=True)
        # only use train data
        final_counters = copy.deepcopy(counters)

        save(config.train_examples_file,
             train_examples,
             message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
        save(config.counters_file, final_counters, message="counters")
    else:
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)

        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)

        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)

        final_counters = load(config.counters_file)
        counters = final_counters

    # get emb_mats and emb_dicts
    if not config.processed_emb:
        for tag in emb_tags:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                final_counters[tag],
                tag,
                emb_file=emb_config[tag]["emb_file"],
                size=emb_config[tag]["emb_size"],
                vec_size=emb_config[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    else:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    for k in emb_dicts:
        print("Embedding dict length: " + k + " " + str(len(emb_dicts[k])))

    # get related_words_dict and related_words_ids_mat
    if not config.processed_related_words:
        related_words_dict = get_related_words_dict(
            list(emb_dicts["word"].keys()), config.max_topN)
        related_words_ids_mat = get_related_words_ids_mat_with_related_words_dict(
            emb_dicts["word"], config.max_topN, related_words_dict)
        save(config.related_words_dict_file,
             related_words_dict,
             message="related words dict")
        save(config.related_words_ids_mat_file,
             related_words_ids_mat,
             message="related words ids mat")
    else:
        related_words_dict = load(config.related_words_dict_file)
        related_words_ids_mat = load(config.related_words_ids_mat_file)

    # get featured examples
    # TODO: handle potential insert SOS EOS problem when extracting tag features
    if not config.processed_example_features:
        train_examples, train_meta = get_featured_examples(
            config, train_examples, train_meta, "train", emb_dicts,
            related_words_ids_mat, related_words_dict)
        dev_examples, dev_meta = get_featured_examples(config, dev_examples,
                                                       dev_meta, "dev",
                                                       emb_dicts,
                                                       related_words_ids_mat,
                                                       related_words_dict)
        test_examples, test_meta = get_featured_examples(
            config, test_examples, test_meta, "test", emb_dicts,
            related_words_ids_mat, related_words_dict)

        save(config.train_examples_file,
             train_examples,
             message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
    else:
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)
        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)
        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)

    # print to txt to debug
    """
Example #8
    def __init__(self, config, emb_mats, emb_dicts, dropout=0.1):
        super().__init__()
        self.config = config
        self.config.n_best = 1
        self.dicts = emb_dicts
        self.dicts["idx2tgt"] = dict([[v, k]
                                      for k, v in emb_dicts["word"].items()])
        self.PAD = emb_dicts["word"]["<pad>"]

        # input, output embedder
        self.enc_embedder = Embedder(config, emb_mats, emb_dicts, dropout)
        if config.share_embedder:
            self.dec_embedder = self.enc_embedder
        else:
            self.dec_embedder = Embedder(config, emb_mats, emb_dicts, dropout)
        self.enc_emb_tags = config.emb_tags
        self.dec_emb_tags = ["word"]

        self.src_vocab_limit = config.emb_config["word"]["emb_size"]

        total_emb_size = self.enc_embedder.get_total_emb_dim(self.enc_emb_tags)

        if self.config.use_clue_info:
            self.clue_threshold = 0.5
            clue_embedding_dim = config.emb_config["is_overlap"]["emb_dim"]
            self.clue_embedder = nn.Embedding(
                num_embeddings=3,  # 0: PAD, 1: not overlap, 2: overlap
                embedding_dim=clue_embedding_dim,
                padding_idx=0)

        if self.config.use_style_info:
            style_embedding_dim = config.emb_config["is_overlap"][
                "emb_dim"]  # NOTICE: reuses the is_overlap embedding dim
            self.style_emb_mat = nn.Parameter(
                torch.randn(self.config.num_question_style,
                            style_embedding_dim)).to(DEVICE)
            nn.init.xavier_normal_(self.style_emb_mat, math.sqrt(3))

        # encoder
        enc_input_size = total_emb_size
        if self.config.use_clue_info:
            enc_input_size += clue_embedding_dim
        self.encoder = Encoder(config, enc_input_size, dropout)

        # decoder
        dec_input_size = config.emb_config["word"]["emb_dim"]
        self.decoder = Decoder(config, dec_input_size, dropout)
        self.decIniter = DecIniter(config)

        # generator
        self.predict_size = min(config.tgt_vocab_limit, len(emb_dicts["word"]))
        if config.use_refine_copy_tgt or config.use_refine_copy_tgt_src:
            self.predict_size = min(config.refined_tgt_vocab_limit,
                                    len(emb_dicts["word"]))
        self.generator = Generator(
            config.dec_rnn_size // config.maxout_pool_size, self.predict_size)

        if self.config.copy_type in ["soft", "soft-oov"]:
            self.related_words_ids_mat = load(
                config.related_words_ids_mat_file)
            self.related_words_mask = torch.zeros(
                [self.predict_size, self.predict_size]).to(DEVICE)
            for i in range(self.predict_size):
                # -1 entries are padding; use a distinct variable name so the
                # comprehension does not shadow the row index i
                related_ids = [
                    wid for wid in self.related_words_ids_mat[i] if wid != -1
                ][:self.config.soft_copy_topN]
                self.related_words_mask[i, related_ids] = 1
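A toy version of the related-words mask construction above: row i receives 1s at the ids of its top-N related words, with -1 entries treated as padding (values here are illustrative):

import torch

predict_size, topN = 4, 2
related_ids_mat = [[1, 2, 3], [0, -1, -1], [3, -1, -1], [0, 1, 2]]
mask = torch.zeros(predict_size, predict_size)
for row in range(predict_size):
    ids = [wid for wid in related_ids_mat[row] if wid != -1][:topN]
    mask[row, ids] = 1
print(mask[0])  # tensor([0., 1., 1., 0.]) -- only the first topN=2 ids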
Example #9
def main(args):
    # import model according to input args
    if args.net == "FQG":
        from model.FQG_model import FQG as Model
    else:
        print("Unrecognized net; using the default FQG model.")
        from model.FQG_model import FQG as Model

    # configure according to input args and some experience
    emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.brnn = True
    args.lower = True
    args.share_embedder = True

    # configure for complete experiment and ablation models

    # get checkpoint save path
    args_for_checkpoint_folder_name = [
        args.net, args.data_type, args.copy_type, args.copy_loss_type,
        args.soft_copy_topN, args.only_copy_content, args.use_vocab_mask,
        args.use_clue_info, args.use_style_info, args.use_refine_copy_tgt,
        args.use_refine_copy_src, args.use_refine_copy_tgt_src, args.beam_size
    ]  # NOTICE: change this list as needed. Also note that debug mode will overwrite the saved model.
    save_dir = args.checkpoint_dir
    args.checkpoint_dir = get_checkpoint_dir(save_dir,
                                             args_for_checkpoint_folder_name)
    if args.mode != "train":
        args.resume = args.checkpoint_dir + "model_best.pth.tar"  # NOTICE: this overrides any --resume value from the command line.

    print(args)

    # set device, random seed, logger
    device, use_cuda, n_gpu = set_device(args.no_cuda)
    set_random_seed(args.seed)
    # logger = set_logger(args.log_file)
    logger = None

    # check whether need data preprocessing. If yes, preprocess data
    if args.not_processed_data:  # use --not_processed_data --spacy_not_processed_data for complete prepro
        prepro(args)

    # data
    emb_mats = load(args.emb_mats_file)
    emb_dicts = load(args.emb_dicts_file)

    train_dataloader = get_loader(args,
                                  emb_dicts,
                                  args.train_examples_file,
                                  args.batch_size,
                                  shuffle=True)
    dev_dataloader = get_loader(args,
                                emb_dicts,
                                args.dev_examples_file,
                                args.batch_size,
                                shuffle=False)
    test_dataloader = get_loader(args,
                                 emb_dicts,
                                 args.test_examples_file,
                                 args.batch_size,
                                 shuffle=False)

    # model
    model = Model(args, emb_mats, emb_dicts)
    summarize_model(model)
    if use_cuda and args.use_multi_gpu and n_gpu > 1:
        if EXP_PLATFORM.lower() == "venus":
            pass
        else:
            model = nn.DataParallel(model)
    model.to(device)
    partial_models = None
    partial_resumes = None
    partial_trainables = None

    # optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    for p in parameters:
        if p.dim() == 1:
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        elif list(p.shape) == [args.tgt_vocab_limit, 300]:
            print("omit embeddings.")
        else:
            nn.init.xavier_normal_(p, math.sqrt(3))
    optimizer = Optim(args.optim,
                      args.learning_rate,
                      max_grad_norm=args.max_grad_norm,
                      max_weight_value=args.max_weight_value,
                      lr_decay=args.learning_rate_decay,
                      start_decay_at=args.start_decay_at,
                      decay_bad_count=args.halve_lr_bad_count)
    optimizer.set_parameters(model.parameters())
    scheduler = None

    loss = {}
    loss["P"] = torch.nn.CrossEntropyLoss()
    loss["D"] = torch.nn.BCEWithLogitsLoss(reduction="sum")

    # trainer
    trainer = Trainer(args,
                      model,
                      train_dataloader=train_dataloader,
                      dev_dataloader=dev_dataloader,
                      loss=loss,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      device=device,
                      emb_dicts=emb_dicts,
                      logger=logger,
                      partial_models=partial_models,
                      partial_resumes=partial_resumes,
                      partial_trainables=partial_trainables)

    # start train/eval/test model
    start = datetime.now()
    if args.mode == "train":
        trainer.train()
    elif args.mode == "eval_train":
        args.use_ema = False
        trainer.eval(train_dataloader, args.train_eval_file,
                     args.train_output_file)
    elif args.mode in ["eval", "evaluation", "valid", "validation"]:
        args.use_ema = False
        trainer.eval(dev_dataloader, args.dev_eval_file, args.eval_output_file)
    elif args.mode == "test":
        args.use_ema = False
        trainer.eval(test_dataloader, args.test_eval_file,
                     args.test_output_file)
    else:
        print("Error: set mode to be train or eval or test.")
    print(("Time of {} model: {}").format(args.mode, datetime.now() - start))
Example #10
def evaluate_and_filter(input_file, input_augmented_pkl_file, output_file):
    augmented_examples = load(input_augmented_pkl_file)
    pid_sid_ans2ans_labels = {}
    for e in augmented_examples:
        pid = e["pid"]
        sid = e["sid"]
        for info in e["selected_info_processed"]:
            ans = info["answer_text"]
            compound_id = str(pid) + "_" + str(sid) + "_" + str(ans)
            ans_chunk_label = info["answer_chunk_tag"]
            ans_ner_label = get_answer_ner_tag(e["ans_sent_doc"],
                                               ans,
                                               processed_by_spacy=True)
            if compound_id in pid_sid_ans2ans_labels:
                pass
            else:
                pid_sid_ans2ans_labels[compound_id] = {
                    "ans_chunk_label": ans_chunk_label,
                    "ans_ner_label": ans_ner_label
                }

    outfile = open(output_file, 'w', encoding='utf8')
    with codecs.open(input_file, encoding='utf8') as infile:
        lines = infile.readlines()
        i = 0
        for line in lines:
            line_split = str(line).rstrip().split("\t")
            # column order matches get_qa_input_file in QG_augment_main.py
            example_pid = line_split[0]
            example_sid = line_split[1]
            q = line_split[2]
            example_ans_sent = line_split[3]
            example_answer_text = line_split[4]
            example_paragraph = line_split[7]

            paragraph_readibility = get_readibility(example_paragraph)
            paragraph_perplexity = get_perplexity(example_paragraph)
            paragraph_length = len(example_paragraph.split())
            ans_sent_readibility = get_readibility(example_ans_sent)
            ans_sent_perplexity = get_perplexity(example_ans_sent)
            ans_sent_length = len(example_ans_sent.split())

            question_readibility = get_readibility(q)
            question_perplexity = get_perplexity(q)
            question_length = len(q.split())
            question_type_text, question_type_id = get_question_type(q)

            answer_readibility = get_readibility(example_answer_text)
            answer_perplexity = get_perplexity(example_answer_text)
            compound_id = str(example_pid) + "_" + str(
                example_sid) + "_" + str(example_answer_text)
            answer_chunk_tag = pid_sid_ans2ans_labels[compound_id][
                "ans_chunk_label"]
            answer_ner_tag = pid_sid_ans2ans_labels[compound_id][
                "ans_ner_label"]
            answer_length = len(example_answer_text.split())

            # TODO: filter here !!!

            if i == 0:
                head = "\t".join([
                    "pid", "sid", "question", "ans_sent", "answer",
                    "s_char_start", "s_char_end", "paragraph", "p_char_start",
                    "p_char_end", "entailment_score", "p_readibility",
                    "p_perplexity", "p_length", "s_readibility",
                    "s_perplexity", "s_length", "q_readibility",
                    "q_perplexity", "q_length", "q_type", "q_type_id",
                    "a_readibility", "a_perplexity", "a_length", "a_chunk_tag",
                    "a_ner_tag"
                ])
                outfile.write(head + "\n")
            line_split += [
                paragraph_readibility, paragraph_perplexity, paragraph_length,
                ans_sent_readibility, ans_sent_perplexity, ans_sent_length,
                question_readibility, question_perplexity, question_length,
                question_type_text, question_type_id, answer_readibility,
                answer_perplexity, answer_length, answer_chunk_tag,
                answer_ner_tag
            ]
            output_list = [str(item) for item in line_split]
            outfile.write(
                "\t".join(output_list).rstrip().replace("\n", "\\n") + "\n")
            i = i + 1
    outfile.close()  # infile is already closed by its with-block
Example #11
def main(args):
    # import model according to input args
    if args.net == "FQG":
        from model.FQG_model import FQG as Model
    else:
        print("Unrecognized net; using the default FQG model.")
        from model.FQG_model import FQG as Model

    # configure according to input args and some experience
    emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.emb_config["word"]["emb_size"] = args.tgt_vocab_limit
    args.brnn = True
    args.lower = True
    args.share_embedder = True

    # configure for complete experiment and ablation models

    # get checkpoint save path
    args_for_checkpoint_folder_name = [
        args.net, args.data_type, args.copy_type, args.copy_loss_type,
        args.soft_copy_topN, args.only_copy_content, args.use_vocab_mask,
        args.use_clue_info, args.use_style_info, args.use_refine_copy_tgt,
        args.use_refine_copy_src, args.use_refine_copy_tgt_src, args.beam_size
    ]  # NOTICE: change this list as needed, but keep it identical to QG_main.py; otherwise the checkpoint path will not match.
    save_dir = args.checkpoint_dir
    args.checkpoint_dir = get_checkpoint_dir(save_dir,
                                             args_for_checkpoint_folder_name)
    # args.mode = "test"
    # if args.mode != "train":
    args.resume = args.checkpoint_dir + "model_best.pth.tar"  # NOTICE: this overrides any --resume value from the command line.

    print(args)

    # set device, random seed, logger
    device, use_cuda, n_gpu = set_device(args.no_cuda)
    set_random_seed(args.seed)
    # logger = set_logger(args.log_file)
    logger = None

    # check whether need data preprocessing. If yes, preprocess data
    #if args.mode == "prepro":
    prepro(args, args.da_augmented_sentences_file,
           args.qg_augmented_sentences_file)
    #    return

    # data
    emb_mats = load(args.emb_mats_file)
    emb_dicts = load(args.emb_dicts_file)

    dataloader = get_loader(args,
                            emb_dicts,
                            args.qg_augmented_sentences_file,
                            args.batch_size,
                            shuffle=False)

    # model
    model = Model(args, emb_mats, emb_dicts)
    summarize_model(model)
    if use_cuda and args.use_multi_gpu and n_gpu > 1:
        model = nn.DataParallel(model)
    model.to(device)
    partial_models = None
    partial_resumes = None
    partial_trainables = None

    # optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    for p in parameters:
        if p.dim() == 1:
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        elif list(p.shape) == [args.tgt_vocab_limit, 300]:
            print("omit embeddings.")
        else:
            nn.init.xavier_normal_(p, math.sqrt(3))
    optimizer = Optim(args.optim,
                      args.learning_rate,
                      max_grad_norm=args.max_grad_norm,
                      max_weight_value=args.max_weight_value,
                      lr_decay=args.learning_rate_decay,
                      start_decay_at=args.start_decay_at,
                      decay_bad_count=args.halve_lr_bad_count)
    optimizer.set_parameters(model.parameters())
    scheduler = None

    loss = {}
    loss["P"] = torch.nn.CrossEntropyLoss()
    loss["D"] = torch.nn.BCEWithLogitsLoss(reduction="sum")

    # trainer
    trainer = Trainer(args,
                      model,
                      train_dataloader=None,
                      dev_dataloader=None,
                      loss=loss,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      device=device,
                      emb_dicts=emb_dicts,
                      logger=logger,
                      partial_models=partial_models,
                      partial_resumes=partial_resumes,
                      partial_trainables=partial_trainables)

    # start train/eval/test model
    start = datetime.now()
    args.use_ema = False
    trainer.test(dataloader, args.qg_result_file)
    get_qa_input_file(args.qg_result_file, args.da_paragraphs_file,
                      args.qa_data_file)
    # TODO: de-duplicate examples; different clue/style combinations may generate the same question.
    print(("Time of {} model: {}").format(args.mode, datetime.now() - start))
Example #12
def prepro(config):
    emb_tags = config.emb_tags
    emb_config = config.emb_config
    emb_mats = {}
    emb_dicts = {}

    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get examples and counters
    if not config.processed_example_features:
        examples = get_raw_examples(config, config.train_file, debug,
                                    debug_length)
        examples = get_featured_examples(config, examples)
        counters = get_counters(examples, config.emb_tags,
                                config.emb_not_count_tags)

        save(config.train_examples_file, (examples, 0), message="examples")
        save(config.counters_file, counters, message="counters")
    else:
        examples, num_relations = load(config.train_examples_file)
        counters = load(config.counters_file)

    # get emb_mats and emb_dicts
    if not config.processed_emb:
        for tag in emb_tags:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                counters[tag],
                tag,
                emb_file=emb_config[tag]["emb_file"],
                size=emb_config[tag]["emb_size"],
                vec_size=emb_config[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    else:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    for k in emb_dicts:
        print("Embedding dict length: " + k + " " + str(len(emb_dicts[k])))

    if not config.processed_example_graph_features:
        # NOTICE: set update_edge_types2ids=True only for the train dataset
        #if config.processed_emb and "edge_types" in emb_dicts:
        #    edge_types2ids = emb_dicts["edge_types"]
        #else:
        edge_types2ids = {}
        examples, num_relations, edge_types2ids = get_graph_examples(
            config,
            examples,
            config.edge_types_list,
            emb_dicts,
            edge_types2ids,
            update_edge_types2ids=True)
        emb_dicts["edge_types"] = edge_types2ids
        save(config.train_examples_file, (examples, num_relations),
             message="examples")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")

    # print to txt to debug
    for k in emb_dicts:
        write_dict(emb_dicts[k],
                   OUTPUT_PATH + "debug/emb_dicts_" + str(k) + ".txt")
    for k in counters:
        write_counter(counters[k],
                      OUTPUT_PATH + "debug/counters_" + str(k) + ".txt")
    write_example(examples[5], OUTPUT_PATH + "debug/example.txt")
Example #13
def get_loader(examples_file,
               batch_size,
               shuffle=False,
               debug=False,
               debug_length=20):
    examples, num_relations = load(examples_file)
    # print("num_relations: ", num_relations)
    data_list = []
    feature_dim = None

    num_e = 0
    for e in examples:
        num_e += 1
        feature_list = [
            e["G_data"].count_query,
            e["G_data"].count_title,
            e["G_data"].is_digit,
            e["G_data"].is_punct,
            e["G_data"].is_stop,
            e["G_data"].is_special,
            [word_len / 10.0
             for word_len in e["G_data"].word_len],  # normalize word length
            [id_val / 20.0
             for id_val in e["G_data"].id]  # normalize word position
        ]
        if feature_dim is None:
            feature_dim = len(feature_list)
        emb_ids_dict = {
            "word_id":
            torch.LongTensor(e["G_data"].word_id).unsqueeze(0),
            "tag_id":
            torch.LongTensor(e["G_data"].tag_id).unsqueeze(0),
            "is_digit_id":
            torch.LongTensor(e["G_data"].is_digit_id).unsqueeze(0),
            "is_punct_id":
            torch.LongTensor(e["G_data"].is_punct_id).unsqueeze(0),
            "is_stop_id":
            torch.LongTensor(e["G_data"].is_stop_id).unsqueeze(0),
            "is_special_id":
            torch.LongTensor(e["G_data"].is_special_id).unsqueeze(0)
        }
        x = torch.FloatTensor(feature_list).t().unsqueeze(0).contiguous()
        # shape: 1 * num_nodes * num_features
        edge_index = e["G_data"].edge_index
        edge_type = e["G_data"].edge_type_id
        #print("DEBUG node_idx: ", e["G_data"].node_index)
        #print("DEBUG edge_idx:  ", edge_index)
        #print("DEBUG edge_type: ", edge_type)
        y = torch.LongTensor(e["G_data"].y_phrase)
        y_node_type = torch.LongTensor(e["G_data"].y_node_type)
        words = e["G_data"].word
        data_list.append(
            Data(x=x,
                 edge_type=edge_type,
                 edge_index=edge_index,
                 y=y,
                 words=words,
                 y_node_type=y_node_type,
                 emb_ids_dict=emb_ids_dict,
                 G_for_decode=e["G_for_decode"],
                 queries_features=e["queries_features"],
                 titles_features=e["titles_features"],
                 phrase_features=e["phrase_features"]))
        if debug and num_e >= debug_length:
            break

    # shuffle once after the loop; shuffling inside the loop would reshuffle
    # the growing list on every iteration
    if shuffle:
        random.shuffle(data_list)

    return data_list, num_relations, feature_dim
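A quick shape check for the node-feature tensor built above: the per-node feature lists are stacked as rows (num_features x num_nodes), and .t().unsqueeze(0) turns that into 1 * num_nodes * num_features:

import torch

feature_list = [[1., 2., 3.], [4., 5., 6.]]  # 2 features over 3 nodes
x = torch.FloatTensor(feature_list).t().unsqueeze(0).contiguous()
print(x.shape)  # torch.Size([1, 3, 2])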
Example #14
def main(args):
    # get revised args
    # NOTICE: this is our default data organization structure; change it if yours differs.
    # original_data_folder = DATA_PATH + "original/" + args.data_type + "/"  # data_type is event or concept.

    processed_data_folder = os.path.join(DATA_PATH, 'processed',
                                         args.data_type)
    args.train_examples_file = os.path.join(processed_data_folder,
                                            'train-examples.pkl')
    args.dev_examples_file = os.path.join(processed_data_folder,
                                          'dev-examples.pkl')
    args.test_examples_file = os.path.join(processed_data_folder,
                                           'test-examples.pkl')
    args.train_output_file = os.path.join(processed_data_folder,
                                          'train_output.txt')
    args.eval_output_file = os.path.join(processed_data_folder,
                                         'eval_output.txt')
    args.test_output_file = os.path.join(processed_data_folder,
                                         'test_output.txt')
    args.emb_mats_file = os.path.join(processed_data_folder, 'emb_mats.pkl')
    args.emb_dicts_file = os.path.join(processed_data_folder, 'emb_dicts.pkl')
    args.counters_file = os.path.join(processed_data_folder, 'counters.pkl')

    # get checkpoint save path
    args_for_checkpoint_folder_name = [
        args.net, args.data_type, "_".join(args.tasks),
        "_".join(args.emb_tags), "_".join(args.edge_types_list), args.d_model,
        args.layers, args.num_bases, args.lr, args.debug
    ]  # NOTICE: change here
    save_dir = args.checkpoint_dir
    args.output_file_prefix = "_".join(
        [str(s) for s in args_for_checkpoint_folder_name])
    args.checkpoint_dir = get_checkpoint_dir(save_dir,
                                             args_for_checkpoint_folder_name)
    if args.mode != "train":
        args.resume = args.checkpoint_dir + "model_best.pth.tar"  # NOTICE: this overrides any --resume value from the command line.

    print(args)

    # set device, random seed, logger
    device, use_cuda, n_gpu = set_device(args.no_cuda)
    set_random_seed(args.seed)
    # logger = set_logger(args.log_file)

    # check whether need data preprocessing. If yes, preprocess data
    if args.not_processed_data:  # use --not_processed_data --spacy_not_processed_data for complete prepro
        prepro(args)

    # # data
    emb_mats = load(args.emb_mats_file)
    emb_dicts = load(args.emb_dicts_file)

    data_list, num_relations, feature_dim = get_loader(
        args.train_examples_file, 3, shuffle=True, debug=args.debug)
    print("num_relations: ", num_relations)
    # num_relations = len(emb_dicts["edge_types"]) #!!!
    train_dataloader = data_list[0:math.floor(0.8 * len(data_list))]
    dev_dataloader = data_list[math.floor(0.8 * len(data_list)):
                               math.floor(0.9 * len(data_list))]
    test_dataloader = data_list[math.floor(0.9 * len(data_list)):]

    # model
    model = Model(config=args,
                  in_channels=feature_dim,
                  out_channels=args.d_model,
                  num_relations=num_relations,
                  num_bases=args.num_bases,
                  emb_mats=emb_mats,
                  emb_dicts=emb_dicts,
                  dropout=0.1)  # TODO: set them according to args
    summarize_model(model)
    if use_cuda and args.use_multi_gpu and n_gpu > 1:
        model = nn.DataParallel(model)
    model.to(device)
    print("successfully get model")

    # optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # for p in parameters:
    #     if p.dim() == 1:
    #         p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
    #     # elif list(p.shape) == [args.tgt_vocab_limit, 300]:
    #     #     print("omit embeddings.")
    #     else:
    #         nn.init.xavier_normal_(p, math.sqrt(3))
    optimizer = torch.optim.Adam(params=parameters,
                                 lr=args.lr,
                                 betas=(args.beta1, args.beta2),
                                 eps=1e-8,
                                 weight_decay=3e-7)
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1)
        if ee < args.lr_warm_up_num else 1)

    loss = {}
    # loss["P"] = torch.nn.CrossEntropyLoss()
    # loss["D"] = torch.nn.BCEWithLogitsLoss(reduction="sum")

    # trainer
    trainer = Trainer(args,
                      model,
                      train_dataloader=train_dataloader,
                      dev_dataloader=dev_dataloader,
                      test_dataloader=test_dataloader,
                      loss=loss,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      device=device,
                      emb_dicts=emb_dicts)

    # start train/eval/test model
    start = datetime.now()
    if args.mode.lower() == "train":
        trainer.train()
    elif args.mode.lower() == "eval_train":
        args.use_ema = False
        train_output_file = RESULT_PATH + "train_output." + args.output_file_prefix + ".txt"
        trainer.eval(train_dataloader, train_output_file)
    elif args.mode.lower() in [
            "eval", "evaluation", "valid", "validation", "eval_dev"
    ]:
        args.use_ema = False
        eval_output_file = RESULT_PATH + "dev_output." + args.output_file_prefix + ".txt"
        trainer.eval(dev_dataloader, eval_output_file)
    elif args.mode.lower() in ["eval_test"]:
        args.use_ema = False
        test_output_file = RESULT_PATH + "test_output." + args.output_file_prefix + ".txt"
        trainer.eval(test_dataloader, test_output_file)
    elif args.mode.lower() == "test":
        args.use_ema = False
        test_output_file = RESULT_PATH + "test_output." + args.output_file_prefix + ".txt"
        trainer.test(test_dataloader, test_output_file)
    else:
        print("Error: set mode to be train or eval or test or eval_train.")
    print(("Time of {} model: {}").format(args.mode, datetime.now() - start))
Example #15
 'PP-FAC': Counter({'What': 3, 'Which': 1}),
 'UCP-UNK': Counter({'What': 20, 'Which': 3, 'Who': 2, 'Where': 1}),
 'ADVP-GPE': Counter({'Which': 2, 'What': 1, 'Where': 1}),
 'PP-LOC': Counter({'What': 4, 'Where': 1}),
 'UCP-DATE': Counter({'How': 1}),
 'ADVP-ORG': Counter({'What': 3, 'Who': 2, 'Which': 1}),
 'PP-LAW': Counter({'What': 1, 'Which': 1}),
 'ADVP-CARDINAL': Counter({'How': 1, 'When': 1}),
 'PP-EVENT': Counter({'What': 3}),
 'UCP-PERSON': Counter({'What': 1}),
 'PP-MONEY': Counter({'How': 3}),
 'PP-QUANTITY': Counter({'What': 3, 'How': 1}),
 'PP-NORP': Counter({'What': 2})}
"""

answertag2qtype_infos = load(ANSWERTAG2QTYPE_FILE_PATH)
ANSWERTAG2QTYPE_SET = answertag2qtype_infos["answertag2qtype_set"]
# refine the set below by removing low-frequency question types
ANSWERTAG2QTYPE_COUNTER = answertag2qtype_infos["answertag2qtype_counter"]
print("Before deleting low-frequency types: ==============")
for k in ANSWERTAG2QTYPE_SET:
    print(k)
    print(ANSWERTAG2QTYPE_SET[k])

for ans_tag in ANSWERTAG2QTYPE_SET:
    counter = ANSWERTAG2QTYPE_COUNTER[ans_tag]
    threshold = 0.03 * sum(counter.values())
    for q_type in counter:
        if counter[q_type] < threshold:
            # discard() is safe even when q_type is absent from the set
            ANSWERTAG2QTYPE_SET[ans_tag].discard(q_type)
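A worked example of the 3% pruning rule above with made-up counts: out of 44 total occurrences the threshold is 1.32, so only the singleton "Who" falls below it and is dropped:

from collections import Counter

qtype_set = {"What", "Which", "Who"}
counter = Counter({"What": 40, "Which": 3, "Who": 1})
threshold = 0.03 * sum(counter.values())  # 1.32
for q_type in counter:
    if counter[q_type] < threshold:
        qtype_set.discard(q_type)
print(qtype_set)  # {'What', 'Which'}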