Example #1
def main():
    opt = parse_args()

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_dir,
                                            'src')
    qa_nfeats = inputters.get_num_features(opt.data_type, opt.train_dir, 'qa')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_dir,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of qa features: %d." % qa_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(src_nfeats, qa_nfeats, tgt_nfeats,
                                  opt.data_type)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    # train_dataset_files = '/research/king3/yfgao/pycharm_deployment/CoQG/data/coref_flow/processed/coqg.turn3.train.pt'
    build_save_vocab(train_dataset_files, opt.data_type, fields, opt)
Example #2
def main():
    opt = parse_args()

    if opt.max_shard_size > 0:
        raise AssertionError("-max_shard_size is deprecated, please use "
                             "-shard_size (number of examples) instead.")

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)
Example #3
def main():
    opt = parse_args()

    if opt.max_shard_size > 0:
        raise AssertionError("-max_shard_size is deprecated, please use "
                             "-shard_size (number of examples) instead.")

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    # The code below was an attempt to fix multiprocessing failures during
    # prepare, but it had no effect.
    torch.multiprocessing.set_sharing_strategy('file_system')
    import resource
    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
    resource.setrlimit(resource.RLIMIT_NOFILE, (65535, rlimit[1]))
    # END

    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)
    myutils.add_more_field(fields)
    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)
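
The file-descriptor bump in Example #3 raises a ValueError on machines whose hard limit is below 65535, because an unprivileged process cannot lift its soft limit past the hard one. A safer variant, a minimal sketch using only the standard resource module, clamps the request to the hard limit:

import resource

# Raise the soft limit on open file descriptors, but never past the hard
# limit; RLIM_INFINITY means the hard limit imposes no cap of its own.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
target = 65535 if hard == resource.RLIM_INFINITY else min(65535, hard)
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))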
Example #4
def main():
    # Parse command-line options into opt
    opt = parse_args()
    # Initialize the logger
    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    # Generate the training, validation, and vocabulary data
    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    valid_dataset_files = build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files + valid_dataset_files, fields, opt)
Example #5
def main():
    opt = parse_args()
    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    ans_nfeats = inputters.get_num_features(opt.data_type, opt.train_ans,
                                            "ans")
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)
    logger.info(" * number of answer features: %d." % ans_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats,
                                  ans_nfeats)

    logger.info("fields src")
    logger.info(fields['src'].__dict__)
    logger.info(fields['tgt'].__dict__)
    logger.info(fields['src_map'].__dict__)
    logger.info(fields['ans'].__dict__)
    logger.info(fields['indices'].__dict__)
    logger.info(fields['alignment'].__dict__)
Example #6
def main():
    #pdb.set_trace()
    opt = parse_args()
    init_logger(opt.log_file)
    logger.info("Extracting features...")

    # Count any extra per-token features -- none in our case
    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    valid_dataset_files = build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files + valid_dataset_files, fields, opt)
Example #7
def main():
    opt = parse_args()

    assert opt.max_shard_size == 0, (
        "-max_shard_size is deprecated. Please use "
        "-shard_size (number of examples) instead.")
    assert opt.shuffle == 0, (
        "-shuffle is not implemented. Please shuffle "
        "your data before pre-processing.")

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)
Example #8
def main():
    opt = parse_args()

    if opt.max_shard_size > 0:
        raise AssertionError("-max_shard_size is deprecated, please use "
                             "-shard_size (number of examples) instead.")
    if opt.shuffle > 0:
        raise AssertionError("-shuffle is not implemented, please make sure "
                             "you shuffle your data before pre-processing.")

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            "src")
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            "tgt")
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset("train", fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset("valid", fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)
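
All of these variants lean on the same parse_args() helper from OpenNMT-py's preprocess.py. A minimal sketch of what the snippets assume it does, based only on the options they actually read (opt.train_src, opt.train_tgt, opt.data_type, opt.log_file, and opt.seed in Example #14); the exact parser setup may differ across these forks:

import argparse
import torch
import onmt.opts as opts

def parse_args():
    # Register OpenNMT-py's preprocessing options (-train_src, -train_tgt,
    # -data_type, -save_data, -shard_size, -log_file, -seed, ...).
    parser = argparse.ArgumentParser(description='preprocess.py')
    opts.preprocess_opts(parser)
    opt = parser.parse_args()
    # Example #14 seeds torch from the parsed options; do the same here.
    torch.manual_seed(opt.seed)
    return opt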
Example #9
def _get_fields(data_type, train_src, train_tgt):
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(data_type, train_src, 'src')
    tgt_nfeats = inputters.get_num_features(data_type, train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(data_type, src_nfeats, tgt_nfeats)

    return fields
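
Because _get_fields has no side effects beyond logging, it can be exercised on its own; a hypothetical invocation (the corpus paths are placeholders, not files from any of these repositories):

# Hypothetical usage; the paths are placeholders for tokenized corpora.
fields = _get_fields('text', 'data/src-train.txt', 'data/tgt-train.txt')
# get_fields returns the usual OpenNMT fields dict, keyed by names such as
# 'src' and 'tgt' (compare the dict-style accesses in Example #13).
print(sorted(fields))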
Example #10
def dump_dataset(savepath, save_dev=False):
    src_corpus = savepath + '/src-train.txt'
    tgt_corpus = savepath + '/tgt-train.txt'

    src_nfeats = inputters.get_num_features('text', src_corpus, 'src')
    tgt_nfeats = inputters.get_num_features('text', tgt_corpus, 'tgt')
    fields = inputters.get_fields('text', src_nfeats, tgt_nfeats)
    fields['graph'] = torchtext.data.Field(sequential=False)
    train_dataset_files = build_save_dataset('train', fields, src_corpus,
                                             tgt_corpus, savepath, args)

    if save_dev:
        src_corpus = savepath + '/src-dev.txt'
        tgt_corpus = savepath + '/tgt-dev.txt'
        build_save_dataset('dev', fields, src_corpus, tgt_corpus, savepath,
                           args)
    build_save_vocab(train_dataset_files, fields, savepath, args)
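
Example #10 also shows the general recipe for carrying extra per-example data through preprocessing: register an additional torchtext Field in the fields dict before building the dataset. A minimal sketch along the same lines, assuming the legacy torchtext.data API used above; the field name and use_vocab=False are illustrative assumptions for a value that is already numeric:

import torchtext

# One untokenized value per example, passed through without a vocabulary.
fields['graph_id'] = torchtext.data.Field(sequential=False, use_vocab=False)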
Example #11
def preprocess_main(opt):
    logger = get_logger(opt.log_file)
    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt, logger)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt, logger)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt, logger)
Example #12
def main():
    opt = parse_args()

    print("Extracting features...")
    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    print(" * number of source features: %d." % src_nfeats)
    print(" * number of target features: %d." % tgt_nfeats)

    print("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    print("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    print("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)

    print("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)
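
Example #12 swaps init_logger for bare print calls; the other variants assume a helper that wires Python's logging module to the console and, optionally, to opt.log_file. A sketch of a typical init_logger, not necessarily the exact implementation in any of these repositories:

import logging

def init_logger(log_file=None):
    # Log to the console, and also to log_file when one is given.
    fmt = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console = logging.StreamHandler()
    console.setFormatter(fmt)
    logger.addHandler(console)
    if log_file:
        fh = logging.FileHandler(log_file)
        fh.setFormatter(fmt)
        logger.addHandler(fh)
    return logger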
Example #13
def main():
    opt = parse_args()
    init_logger(opt.log_file)
    logger.info("Extracting features...")


    src_nfeats = inputters.get_num_features(
        opt.data_type, opt.train_src, 'src')
    tgt_nfeats = inputters.get_num_features(
        opt.data_type, opt.train_tgt, 'tgt')
    ans_nfeats = inputters.get_num_features(
        opt.data_type, opt.train_ans, "ans")
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)
    logger.info(" * number of answer features: %d." % ans_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats, ans_nfeats)

    logger.info("fields src")
    logger.info(fields['src'].__dict__)
    logger.info(fields['tgt'].__dict__)
    logger.info(fields['src_map'].__dict__)
    logger.info(fields['ans'].__dict__)
    logger.info(fields['indices'].__dict__)
    logger.info(fields['alignment'].__dict__)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)
    logger.info(train_dataset_files)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)
Example #14
        "-src_word_vec_size",
        "25",
        "-tgt_word_vec_size",
        "25",
    ])

    train_args.batch_size = 50
    #print(train_args)
    #sys.exit()
    try:
        torch.manual_seed(preproc_args.seed)

        opt = preproc_args
        logger = logging

        src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                                'src')
        tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                                'tgt')
        logger.info(" * number of source features: %d." % src_nfeats)
        logger.info(" * number of target features: %d." % tgt_nfeats)

        logger.info("Building `Fields` object...")
        fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

        logger.info("Building & saving training data...")
        train_dataset_files = build_save_dataset('train', fields, opt, logger)

        logger.info("Building & saving vocabulary...")
        build_save_vocab(train_dataset_files, fields, opt, logger)

        logger.info("Building & saving validation data...")