Example #1
def do_extract(args):
    # args.datapath is the save prefix used at data-creation time; the matching
    # <prefix>.voc and <prefix>.data.json.gz files are read back from it
    datapath, destination_fn = args.datapath, args.dest_fn
    voc_fn = datapath + ".voc"
    data_fn = datapath + ".data.json.gz"
    log.info("extracting data from %s using processor in %s", data_fn, voc_fn)

    # data maps split names (e.g. "train", "dev", "test") to lists of
    # (src, tgt) index sequences, as written by do_make_data below
    data = json.load(gzip.open(data_fn, "rb"))

    bi_pp = processors.load_pp_pair_from_file(voc_fn)
    tgt_processor = bi_pp.tgt_processor()
    for key in data:
        src_fn = destination_fn + ".%s.src.txt" % key
        tgt_fn = destination_fn + ".%s.tgt.txt" % key
        tgt_swallow_fn = destination_fn + ".%s.tgt.swallow.txt" % key
        log.info("extracting key %s into %s and %s and %s", key, src_fn,
                 tgt_fn, tgt_swallow_fn)

        src_f = codecs.open(src_fn, "w", encoding="utf8")
        tgt_f = codecs.open(tgt_fn, "w", encoding="utf8")
        tgt_swallow_f = codecs.open(tgt_swallow_fn, "w", encoding="utf8")

        for src, tgt in data[key]:
            src_dec, tgt_dec = bi_pp.deconvert(src, tgt)
            src_f.write(src_dec + "\n")
            tgt_f.write(tgt_dec + "\n")

            # deconvert_swallow returns a mix of strings and raw integer
            # indices; the integers are rendered as "[@<index>]" placeholders
            tgt_swallow = tgt_processor.deconvert_swallow(tgt)
            tgt_swallow_string = " ".join(
                ("[@%i]" % w if isinstance(w, int) else w)
                for w in tgt_swallow)
            tgt_swallow_f.write(tgt_swallow_string + "\n")
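
A minimal invocation sketch for this example; the attribute names datapath and dest_fn come from the function body above, while the CLI wiring itself is an assumption:

import argparse

parser = argparse.ArgumentParser(
    description="extract parallel text back out of a preprocessed dataset")
parser.add_argument("datapath", help="prefix of the .voc / .data.json.gz files")
parser.add_argument("dest_fn", help="prefix for the extracted .src.txt / .tgt.txt files")
do_extract(parser.parse_args())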
Example #2
def load_voc_and_update_training_config(config_training):
    data_prefix = config_training["training_management"]["data_prefix"]
    voc_fn = data_prefix + ".voc"
    data_fn = data_prefix + ".data.json.gz"

    log.info("loading voc from %s" % voc_fn)
#     src_voc, tgt_voc = json.load(open(voc_fn))

    bi_idx = processors.load_pp_pair_from_file(voc_fn)

    src_indexer, tgt_indexer = bi_idx.src_processor(), bi_idx.tgt_processor()
#     src_indexer = processors.PreProcessor.make_from_serializable(src_voc)
#     tgt_indexer = processors.PreProcessor.make_from_serializable(tgt_voc)
#     tgt_voc = None
#     src_voc = None

#     Vi = len(src_voc) + 1 # + UNK
#     Vo = len(tgt_voc) + 1 # + UNK

    Vi = len(src_indexer)  # indexer length already accounts for UNK
    Vo = len(tgt_indexer)  # indexer length already accounts for UNK

    config_training.add_section("data", keep_at_bottom="metadata", overwrite=False)
    config_training["data"]["data_fn"] = data_fn
    config_training["data"]["Vi"] = Vi
    config_training["data"]["Vo"] = Vo
    config_training["data"]["voc"] = voc_fn

    config_training.set_readonly()

    return src_indexer, tgt_indexer
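
A hedged usage sketch; config_training is assumed to be the library's training-config object, supporting the item access, add_section and set_readonly calls used above:

src_indexer, tgt_indexer = load_voc_and_update_training_config(config_training)
# the "data" section is now filled in and the config is read-only
assert config_training["data"]["Vi"] == len(src_indexer)
assert config_training["data"]["Vo"] == len(tgt_indexer)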
Example #3
def create_encdec_and_indexers_from_config_dict(config_dict, src_indexer=None, tgt_indexer=None, load_config_model="no",
                                                return_model_infos=False):
    assert load_config_model in "yes no if_exists".split()

    if src_indexer is None or tgt_indexer is None:
        voc_fn = config_dict.data["voc"]
        log.info("loading voc from %s" % voc_fn)
#         src_voc, tgt_voc = json.load(open(voc_fn))

        bi_idx = processors.load_pp_pair_from_file(voc_fn)

    if src_indexer is None:
        src_indexer = bi_idx.src_processor()

    if tgt_indexer is None:
        tgt_indexer = bi_idx.tgt_processor()

#     tgt_voc = None
#     src_voc = None

    encdec = create_encdec_from_config_dict(config_dict["model"], src_indexer, tgt_indexer)

    eos_idx = len(tgt_indexer)

    model_infos = None

    if load_config_model != "no":
        if "model_parameters" not in config_dict:
            if load_config_model == "yes":
                log.error("cannot find model parameters in config file")
                raise ValueError(
                    "Config file do not contain model_parameters section")
        else:
            if config_dict.model_parameters.type == "model":
                model_filename = config_dict.model_parameters.filename
                log.info(
                    "loading model parameters from file specified by config file:%s" %
                    model_filename)
                serializers.load_npz(model_filename, encdec)
                if return_model_infos:
                    model_infos = create_filename_infos(model_filename)
            else:
                if load_config_model == "yes":
                    log.error(
                        "model parameters in config file is of type snapshot, not model")
                    raise ValueError("Config file model is not of type model")

    result = encdec, eos_idx, src_indexer, tgt_indexer
    if return_model_infos:
        return result, model_infos
    else:
        return result
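
A call sketch grounded in the signature above; config_dict is assumed to be a loaded config with a "model" section and a data["voc"] entry:

encdec, eos_idx, src_indexer, tgt_indexer = create_encdec_and_indexers_from_config_dict(
    config_dict, load_config_model="if_exists")

# with return_model_infos=True, the same tuple comes back wrapped with the model infos
(encdec, eos_idx, src_indexer, tgt_indexer), model_infos = create_encdec_and_indexers_from_config_dict(
    config_dict, load_config_model="yes", return_model_infos=True)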
Example #4
def do_make_data(config):
    #     raw_input("Press Enter to Continue 222")

    save_prefix_dir, save_prefix_fn = os.path.split(config.data.save_prefix)
    ensure_path(save_prefix_dir)

    config_fn = config.data.save_prefix + ".data.config"
    voc_fn = config.data.save_prefix + ".voc"
    data_fn = config.data.save_prefix + ".data.json.gz"
    #     valid_data_fn = config.save_prefix + "." + config.model + ".valid.data.npz"

    #     voc_fn_src = config.save_prefix + ".src.voc"
    #     voc_fn_tgt = config.save_prefix + ".tgt.voc"

    files_that_will_be_created = [config_fn, voc_fn, data_fn]

    if config.processing.bpe_src is not None:
        bpe_data_file_src = config.data.save_prefix + ".src.bpe"
        files_that_will_be_created.append(bpe_data_file_src)

    if config.processing.bpe_tgt is not None:
        bpe_data_file_tgt = config.data.save_prefix + ".tgt.bpe"
        files_that_will_be_created.append(bpe_data_file_tgt)

    if config.processing.joint_bpe is not None:
        bpe_data_file_joint = config.data.save_prefix + ".joint.bpe"
        files_that_will_be_created.append(bpe_data_file_joint)

    already_existing_files = []
    for filename in files_that_will_be_created:  # , valid_data_fn]:
        if os.path.exists(filename):
            already_existing_files.append(filename)
    if len(already_existing_files) > 0 and not config.processing.force_overwrite:
        print "Warning: existing files are going to be replaced: ", already_existing_files
        raw_input("Press Enter to Continue")

    if config.processing.use_voc is not None:
        log.info("loading voc from %s" % config.processing.use_voc)
        #         src_voc, tgt_voc = json.load(open(config.use_voc))
        #         src_pp = processors.load_pp_from_data(json.load(open(src_voc)))
        #         tgt_pp = IndexingPrePostProcessor.make_from_serializable(tgt_voc)
        bi_idx = processors.load_pp_pair_from_file(config.processing.use_voc)
    else:

        bi_idx = processors.BiIndexingPrePostProcessor(
            voc_limit1=config.processing.src_voc_size,
            voc_limit2=config.processing.tgt_voc_size)
        pp = processors.BiProcessorChain()

        if config.processing.source_char_conversion is not None:
            log.info("using source char conversion %s",
                     config.processing.source_char_conversion)
            char_conv_dic = json.load(
                open(config.processing.source_char_conversion))
            pp.add_src_processor(
                processors.SourceCharacterConverter(char_conv_dic))

        if config.processing.latin_tgt:
            pp.add_tgt_processor(
                processors.LatinScriptProcess(config.processing.latin_type))

        if config.processing.latin_src:
            pp.add_src_processor(
                processors.LatinScriptProcess(config.processing.latin_type))

        pp.add_src_processor(
            processors.SimpleSegmenter(
                config.processing.src_segmentation_type))
        if config.processing.bpe_src is not None:
            pp.add_src_processor(
                processors.BPEProcessing(bpe_data_file=bpe_data_file_src,
                                         symbols=config.processing.bpe_src,
                                         separator="._@@@"))

        pp.add_tgt_processor(
            processors.SimpleSegmenter(
                config.processing.tgt_segmentation_type))
        if config.processing.bpe_tgt is not None:
            pp.add_tgt_processor(
                processors.BPEProcessing(bpe_data_file=bpe_data_file_tgt,
                                         symbols=config.processing.bpe_tgt,
                                         separator="._@@@"))

        if config.processing.joint_bpe is not None:
            pp.add_biprocessor(
                processors.JointBPEBiProcessor(
                    bpe_data_file=bpe_data_file_joint,
                    symbols=config.processing.joint_bpe,
                    separator="._@@@"))

        bi_idx.add_preprocessor(pp)

    def load_data(src_fn, tgt_fn, max_nb_ex=None, infos_dict=None):

        training_data, stats_src, stats_tgt = processors.build_dataset_pp(
            src_fn, tgt_fn, bi_idx, max_nb_ex=max_nb_ex)

        log.info("src data stats:\n%s", stats_src.make_report())
        log.info("tgt data stats:\n%s", stats_tgt.make_report())

        if infos_dict is not None:
            infos_dict["src"] = stats_src.report_as_obj()
            infos_dict["tgt"] = stats_tgt.report_as_obj()

        return training_data

    infos = collections.OrderedDict()
    infos["train"] = collections.OrderedDict()

    log.info("loading training data from %s and %s" %
             (config.data.src_fn, config.data.tgt_fn))
    training_data = load_data(config.data.src_fn,
                              config.data.tgt_fn,
                              max_nb_ex=config.data.max_nb_ex,
                              infos_dict=infos["train"])

    dev_data = None
    if config.data.dev_src is not None:
        log.info("loading dev data from %s and %s" %
                 (config.data.dev_src, config.data.dev_tgt))
        infos["dev"] = collections.OrderedDict()
        dev_data = load_data(config.data.dev_src,
                             config.data.dev_tgt,
                             infos_dict=infos["dev"])

    test_data = None
    if config.data.test_src is not None:
        log.info("loading test data from %s and %s" %
                 (config.data.test_src, config.data.test_tgt))
        infos["test"] = collections.OrderedDict()
        test_data = load_data(config.data.test_src,
                              config.data.test_tgt,
                              infos_dict=infos["test"])

    config.insert_section("infos",
                          infos,
                          even_if_readonly=True,
                          keep_at_bottom="metadata",
                          overwrite=False)

    #     if config.shuffle:
    #         log.info("shuffling data")
    #         if config.enable_fast_shuffle:
    #             shuffle_in_unison_faster(data_input, data_target)
    #         else:
    #             data_input, data_target = shuffle_in_unison(data_input, data_target)
    log.info("saving config to %s" % config_fn)
    config.save_to(config_fn)
    #     json.dump(config.__dict__, open(config_fn, "w"),
    #               indent=2, separators=(',', ': '))

    log.info("saving voc to %s" % voc_fn)
    processors.save_pp_pair_to_file(bi_idx, voc_fn)
    #     json.dump([src_pp.to_serializable(), tgt_pp.to_serializable()],
    #               open(voc_fn, "w"), indent=2, separators=(',', ': '))

    log.info("saving train_data to %s" % data_fn)
    data_all = {"train": training_data}
    if test_data is not None:
        data_all["test"] = test_data
    if dev_data is not None:
        data_all["dev"] = dev_data

    json.dump(data_all,
              gzip.open(data_fn, "wb"),
              indent=2,
              separators=(',', ': '))
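
A hedged read-back sketch for the files written above; prefix is a hypothetical variable standing for whatever config.data.save_prefix was:

import gzip
import json

data_all = json.load(gzip.open(prefix + ".data.json.gz", "rb"))
training_data = data_all["train"]   # always present
dev_data = data_all.get("dev")      # only present if config.data.dev_src was given
test_data = data_all.get("test")    # only present if config.data.test_src was given
bi_idx = processors.load_pp_pair_from_file(prefix + ".voc")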
Example #5
def create_encdec_and_indexers_from_config_dict(
        config_dict,
        src_indexer=None,
        tgt_indexer=None,
        load_config_model="no",
        return_model_infos=False,
        additional_models_parameters_for_averaging=None):
    assert load_config_model in "yes no if_exists".split()

    if src_indexer is None or tgt_indexer is None:
        voc_fn = config_dict.data["voc"]
        log.info("loading voc from %s" % voc_fn)
        #         src_voc, tgt_voc = json.load(open(voc_fn))

        bi_idx = processors.load_pp_pair_from_file(voc_fn)

    if src_indexer is None:
        src_indexer = bi_idx.src_processor()

    if tgt_indexer is None:
        tgt_indexer = bi_idx.tgt_processor()

#     tgt_voc = None
#     src_voc = None

    encdec = create_encdec_from_config_dict(config_dict["model"], src_indexer,
                                            tgt_indexer)

    eos_idx = len(tgt_indexer)

    model_infos = None

    if load_config_model != "no":
        if "model_parameters" not in config_dict:
            assert additional_models_parameters_for_averaging is None
            if load_config_model == "yes":
                log.error("cannot find model parameters in config file")
                raise ValueError(
                    "Config file do not contain model_parameters section")
        else:
            model_filename = config_dict.model_parameters.filename
            if additional_models_parameters_for_averaging is not None:
                load_model_flexible([model_filename] +
                                    additional_models_parameters_for_averaging,
                                    encdec)
            else:
                load_model_flexible(model_filename, encdec)


#             if config_dict.model_parameters.type == "model":
#                 log.info(
#                     "loading model parameters from file specified by config file:%s" %
#                     model_filename)
#                 serializers.load_npz(model_filename, encdec)
#                 if return_model_infos:
#                     model_infos = create_filename_infos(model_filename)
#             else:
#                 log.info("loading model parameters from snapshot file specified by config file:%s" %model_filename)
#                 with np.load(model_filename) as fs:
#                     dics = serializers.NpzDeserializer(fs, path="updater/model:main/")
#                     dics.load(encdec)
            if return_model_infos:
                model_infos = create_filename_infos(model_filename)
    else:
        assert additional_models_parameters_for_averaging is None

    result = encdec, eos_idx, src_indexer, tgt_indexer
    if return_model_infos:
        return result, model_infos
    else:
        return result
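
A hedged sketch of the parameter-averaging variant; the filenames in the averaging list are illustrative placeholders:

(encdec, eos_idx, src_indexer, tgt_indexer), model_infos = create_encdec_and_indexers_from_config_dict(
    config_dict,
    load_config_model="yes",  # must not be "no" when averaging (see the asserts above)
    return_model_infos=True,
    additional_models_parameters_for_averaging=["model_a.npz", "model_b.npz"])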