Example #1
    def test_shallow(self):
        test_obj = yaml.load("""
                             a: !DummyArgClass
                               arg1: !DummyArgClass2
                                 _xnmt_id: id1
                                 v: some_val
                               arg2: !Ref { name: id1 }
                             """)
        preloaded = persistence.YamlPreloader.preload_obj(root=test_obj,
                                                          exp_name="exp1",
                                                          exp_dir=self.out_dir)
        initialized = persistence.initialize_if_needed(preloaded)
        persistence.save_to_file(self.model_file, initialized)
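A minimal sketch of the Serializable test classes the YAML above presupposes; the class names and constructor arguments mirror the snippet, not any canonical xnmt definitions.

# Hypothetical stand-ins for the classes behind the !DummyArgClass tags;
# constructor arguments follow the YAML above, defaults are illustrative.
from xnmt import persistence

class DummyArgClass(persistence.Serializable):
    yaml_tag = "!DummyArgClass"

    @persistence.serializable_init
    def __init__(self, arg1=None, arg2=None):
        self.arg1 = arg1
        self.arg2 = arg2

class DummyArgClass2(persistence.Serializable):
    yaml_tag = "!DummyArgClass2"

    @persistence.serializable_init
    def __init__(self, v=None):
        self.v = v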
Example #2
def main(overwrite_args=None):

    with tee.Tee(), tee.Tee(error=True):
        argparser = argparse.ArgumentParser()
        argparser.add_argument("--dynet-mem", type=str)
        argparser.add_argument("--dynet-seed",
                               type=int,
                               help="set random seed for DyNet and XNMT.")
        argparser.add_argument("--dynet-autobatch", type=int)
        argparser.add_argument("--dynet-devices", type=str)
        argparser.add_argument("--dynet-viz",
                               action='store_true',
                               help="use visualization")
        argparser.add_argument("--dynet-gpu",
                               action='store_true',
                               help="use GPU acceleration")
        argparser.add_argument("--dynet-gpu-ids", type=int)
        argparser.add_argument("--dynet-gpus", type=int)
        argparser.add_argument("--dynet-weight-decay", type=float)
        argparser.add_argument("--dynet-profiling", type=int)
        argparser.add_argument("--settings",
                               type=str,
                               default="standard",
                               help="settings (standard, debug, or unittest)"
                               "must be given in '=' syntax, e.g."
                               " --settings=standard")
        argparser.add_argument("experiments_file")
        argparser.add_argument("experiment_name",
                               nargs='*',
                               help="Run only the specified experiments")
        argparser.set_defaults(generate_doc=False)
        args = argparser.parse_args(overwrite_args)

        if args.dynet_seed:
            random.seed(args.dynet_seed)
            np.random.seed(args.dynet_seed)

        if args.dynet_gpu:
            if settings.CHECK_VALIDITY:
                settings.CHECK_VALIDITY = False
                log_preamble(
                    "disabling CHECK_VALIDITY because it is currently not supported on GPU",
                    logging.WARNING)

        config_experiment_names = YamlPreloader.experiment_names_from_file(
            args.experiments_file)

        results = []

        # Check ahead of time that all experiments exist, to avoid bad surprises
        experiment_names = args.experiment_name or config_experiment_names

        if args.experiment_name:
            nonexistent = set(experiment_names).difference(config_experiment_names)
            if nonexistent:
                raise Exception(
                    "Experiments {} do not exist".format(", ".join(nonexistent)))

        log_preamble(
            f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )
        for experiment_name in experiment_names:

            ParamManager.init_param_col()

            uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(
                args.experiments_file, experiment_name)

            logger.info(f"=> Running {experiment_name}")

            glob_args = uninitialized_exp_args.data.exp_global
            log_file = glob_args.log_file

            if os.path.isfile(log_file) and not settings.OVERWRITE_LOG:
                logger.warning(
                    f"log file {log_file} already exists, skipping experiment; please delete the log file by hand if you want to overwrite it "
                    "(or activate OVERWRITE_LOG, by either setting the environment variable OVERWRITE_LOG=1, "
                    "specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)"
                )
                continue

            tee.set_out_file(log_file)

            model_file = glob_args.model_file

            uninitialized_exp_args.data.exp_global.commandline_args = args

            # Create the model
            experiment = initialize_if_needed(uninitialized_exp_args)
            ParamManager.param_col.model_file = experiment.exp_global.model_file
            ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
            ParamManager.populate()

            # Run the experiment
            eval_scores = experiment(save_fct=lambda: save_to_file(
                model_file, experiment, ParamManager.param_col))
            results.append((experiment_name, eval_scores))
            print_results(results)

            tee.unset_out_file()
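A minimal invocation sketch for this entry point via overwrite_args; the config path and experiment name are placeholders, not files from the original source.

if __name__ == "__main__":
    # Hypothetical: run only "exp1" from a config file named "my_experiments.yaml".
    main(["--settings=debug", "my_experiments.yaml", "exp1"])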
Example #3
    def run(self):
        seed = 13
        random.seed(seed)
        np.random.seed(seed)

        EXP_DIR = os.path.dirname(__file__)
        EXP = "annot"

        model_file = f"{EXP_DIR}/results/{EXP}.mod"
        log_file = f"{EXP_DIR}/results/{EXP}.log"
        # FIXME: this doesn't work; figure out how to set DyNet memory properly
        xnmt.tee.utils.dy.DynetParams().set_mem(1024)
        xnmt.tee.set_out_file(log_file, exp_name=EXP)

        ParamManager.init_param_col()
        ParamManager.param_col.model_file = model_file

        pre_runner = PreprocRunner(
            tasks=[
                PreprocTokenize(
                    in_files=[
                        # f'{EXP_DIR}/conala-corpus/conala-trainnodev.snippet',
                        # f'{EXP_DIR}/conala-corpus/conala-trainnodev.intent',
                        # f'{EXP_DIR}/conala-corpus/conala-dev.intent',
                        # f'{EXP_DIR}/conala-corpus/conala-dev.snippet',
                        # f'{EXP_DIR}/conala-corpus/conala-test.intent',
                        # f'{EXP_DIR}/conala-corpus/conala-test.snippet',
                        f'{EXP_DIR}/conala-corpus/attack_code_train.txt',
                        f'{EXP_DIR}/conala-corpus/attack_text_train.txt',
                        f'{EXP_DIR}/conala-corpus/attack_code_test.txt',
                        f'{EXP_DIR}/conala-corpus/attack_text_test.txt',
                        # f'{EXP_DIR}/conala-corpus/all.code',
                        # f'{EXP_DIR}/conala-corpus/all.anno'
                    ],
                    out_files=[
                        # f'{EXP_DIR}/conala-corpus/conala-trainnodev.tmspm4000.snippet',
                        # f'{EXP_DIR}/conala-corpus/conala-trainnodev.tmspm4000.intent',
                        # f'{EXP_DIR}/conala-corpus/conala-dev.tmspm4000.intent',
                        # f'{EXP_DIR}/conala-corpus/conala-dev.tmspm4000.snippet',
                        # f'{EXP_DIR}/conala-corpus/conala-test.tmspm4000.intent',
                        # f'{EXP_DIR}/conala-corpus/conala-test.tmspm4000.snippet',
                        f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet',
                        f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent',
                        f'{EXP_DIR}/conala-corpus/attack-test.tmspm4000.snippet',
                        f'{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent',
                        # f'{EXP_DIR}/conala-corpus/django.tmspm4000.snippet',
                        # f'{EXP_DIR}/conala-corpus/django.tmspm4000.intent'
                    ],
                    specs=[{
                        'filenum': 'all',
                        'tokenizers': [
                            SentencepieceTokenizer(
                                hard_vocab_limit=False,
                                train_files=[
                                    f'{EXP_DIR}/conala-corpus/attack_text_train.txt',
                                    f'{EXP_DIR}/conala-corpus/attack_code_train.txt'
                                ],
                                vocab_size=self.vocab_size,
                                model_type=self.model_type,
                                model_prefix='conala-corpus/attack-train.tmspm4000.spm')
                        ]
                    }]),
                PreprocVocab(
                    in_files=[
                        f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent',
                        f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet'
                    ],
                    out_files=[
                        f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent.vocab',
                        f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet.vocab'
                    ],
                    specs=[{
                        'filenum': 'all',
                        'filters': [VocabFiltererFreq(min_freq=self.min_freq)]
                    }])
            ],
            overwrite=False)

        src_vocab = Vocab(
            vocab_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent.vocab")
        trg_vocab = Vocab(
            vocab_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet.vocab")

        batcher = Batcher(batch_size=64)

        inference = AutoRegressiveInference(
            search_strategy=BeamSearch(
                len_norm=PolynomialNormalization(apply_during_search=True),
                beam_size=5),
            post_process='join-piece')

        layer_dim = self.layer_dim

        model = DefaultTranslator(
            src_reader=PlainTextReader(vocab=src_vocab),
            trg_reader=PlainTextReader(vocab=trg_vocab),
            src_embedder=SimpleWordEmbedder(emb_dim=layer_dim,
                                            vocab=src_vocab),
            encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                        hidden_dim=layer_dim,
                                        layers=self.layers),
            attender=MlpAttender(hidden_dim=layer_dim,
                                 state_dim=layer_dim,
                                 input_dim=layer_dim),
            trg_embedder=SimpleWordEmbedder(emb_dim=layer_dim,
                                            vocab=trg_vocab),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                rnn=UniLSTMSeqTransducer(
                    input_dim=layer_dim,
                    hidden_dim=layer_dim,
                ),
                transform=AuxNonLinear(input_dim=layer_dim,
                                       output_dim=layer_dim,
                                       aux_input_dim=layer_dim),
                scorer=Softmax(vocab_size=len(trg_vocab), input_dim=layer_dim),
                trg_embed_dim=layer_dim,
                input_feeding=False,
                bridge=CopyBridge(dec_dim=layer_dim)),
            inference=inference)

        #decoder = AutoRegressiveDecoder(bridge=CopyBridge(),inference=inference))

        train = SimpleTrainingRegimen(
            name=f"{EXP}",
            model=model,
            batcher=WordSrcBatcher(avg_batch_size=64),
            trainer=AdamTrainer(alpha=self.alpha),
            patience=3,
            lr_decay=0.5,
            restart_trainer=True,
            run_for_epochs=self.epochs,
            src_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent",
            trg_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet",
            dev_tasks=[
                LossEvalTask(
                    src_file=f"{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent",
                    ref_file=f"{EXP_DIR}/conala-corpus/attack-test.tmspm4000.snippet",
                    model=model,
                    batcher=WordSrcBatcher(avg_batch_size=64)),
                AccuracyEvalTask(
                    eval_metrics='bleu',
                    src_file=f'{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent',
                    ref_file=f'{EXP_DIR}/conala-corpus/attack_text_test.txt',
                    hyp_file=f'results/{EXP}.dev.hyp',
                    model=model)
            ])

        evaluate = [
            AccuracyEvalTask(
                eval_metrics="bleu",
                # src_file=f"{EXP_DIR}/conala-corpus/conala-test.tmspm4000.intent",
                src_file=f"{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent",
                # ref_file=f"{EXP_DIR}/conala-corpus/all.code",
                # ref_file=f"{EXP_DIR}/conala-corpus/conala-test.snippet",
                ref_file=f"{EXP_DIR}/conala-corpus/attack_text_test.txt",
                hyp_file=f"results/{EXP}.test.hyp",
                inference=inference,
                model=model)
        ]

        standard_experiment = Experiment(
            exp_global=ExpGlobal(default_layer_dim=512,
                                 dropout=0.3,
                                 log_file=log_file,
                                 model_file=model_file),
            name="annot",
            model=model,
            train=train,
            evaluate=evaluate)

        # run experiment
        standard_experiment(
            save_fct=lambda: save_to_file(model_file, standard_experiment))

        exit()
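The run() method above reads several hyperparameters from self; the enclosing class is not part of the snippet. A hypothetical constructor consistent with those reads, with illustrative name and defaults:

class AnnotExperimentRunner:
    # Hypothetical enclosing class; it defines exactly the attributes run() uses.
    def __init__(self, vocab_size=4000, model_type="unigram", min_freq=1,
                 layer_dim=512, layers=1, alpha=0.001, epochs=20):
        self.vocab_size = vocab_size  # SentencepieceTokenizer vocabulary size
        self.model_type = model_type  # sentencepiece model type
        self.min_freq = min_freq      # VocabFiltererFreq threshold
        self.layer_dim = layer_dim    # shared embedding/hidden dimension
        self.layers = layers          # BiLSTM encoder depth
        self.alpha = alpha            # AdamTrainer learning rate
        self.epochs = epochs          # run_for_epochs value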
Example #4
train = SimpleTrainingRegimen(
    model=model,
    batcher=batcher,
    trainer=AdamTrainer(alpha=0.001),
    run_for_epochs=2,
    src_file="examples/data/head.ja",
    trg_file="examples/data/head.en",
    dev_tasks=[
        LossEvalTask(src_file="examples/data/head.ja",
                     ref_file="examples/data/head.en",
                     model=model,
                     batcher=batcher)
    ],
)

evaluate = [
    AccuracyEvalTask(eval_metrics="bleu,wer",
                     src_file="examples/data/head.ja",
                     ref_file="examples/data/head.en",
                     hyp_file=f"examples/output/{EXP}.test_hyp",
                     inference=inference,
                     model=model)
]

standard_experiment = Experiment(model=model, train=train, evaluate=evaluate)

# run experiment
standard_experiment(save_fct=lambda: save_to_file(
    model_file, standard_experiment, ParamManager.param_col))

exit()
Example #5
train = SimpleTrainingRegimen(
    run_for_epochs=2,
    src_file="examples/data/head.ja",
    trg_file="examples/data/head.en",
    dev_tasks=[
        LossEvalTask(src_file="examples/data/head.ja",
                     ref_file="examples/data/head.en",
                     model=model,
                     batcher=batcher)
    ],
)

evaluate = [
    AccuracyEvalTask(eval_metrics="bleu,wer",
                     src_file="examples/data/head.ja",
                     ref_file="examples/data/head.en",
                     hyp_file=f"examples/output/{EXP}.test_hyp",
                     inference=inference,
                     model=model)
]

standard_experiment = Experiment(name="programmatic",
                                 model=model,
                                 train=train,
                                 evaluate=evaluate)

# run experiment
standard_experiment(
    save_fct=lambda: save_to_file(model_file, standard_experiment))

exit()
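Fragments #4 and #5 presuppose a preamble defining EXP, model_file, and the parameter collection; a sketch following the pattern of Example #7, with illustrative paths:

# Hypothetical preamble assumed by the two fragments above, mirroring Example #7.
import os
import xnmt.tee
from xnmt.param_collection import ParamManager

EXP_DIR = os.path.dirname(__file__)
EXP = "programmatic"

model_file = f"{EXP_DIR}/models/{EXP}.mod"
log_file = f"{EXP_DIR}/logs/{EXP}.log"

xnmt.tee.set_out_file(log_file)
ParamManager.init_param_col()
ParamManager.param_col.model_file = model_file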
Example #6
def main(overwrite_args: Optional[Sequence[str]] = None) -> None:

    with tee.Tee(), tee.Tee(error=True):
        argparser = argparse.ArgumentParser()
        utils.add_backend_argparse(argparser)
        argparser.add_argument("--settings",
                               type=str,
                               default="standard",
                               help="settings (standard, debug, or unittest)"
                               "must be given in '=' syntax, e.g."
                               " --settings=standard")
        argparser.add_argument(
            "--resume",
            action='store_true',
            help="whether a saved experiment is being resumed, and"
            "locations of output files should be re-used.")
        argparser.add_argument("--backend",
                               type=str,
                               default="dynet",
                               help="backend (dynet or torch)")
        argparser.add_argument("experiments_file")
        argparser.add_argument("experiment_name",
                               nargs='*',
                               help="Run only the specified experiments")
        argparser.set_defaults(generate_doc=False)
        args = argparser.parse_args(overwrite_args)

        if xnmt.backend_dynet and args.dynet_seed: args.seed = args.dynet_seed
        if getattr(args, "seed", None):
            random.seed(args.seed)
            np.random.seed(args.seed)
            if xnmt.backend_torch: torch.manual_seed(args.seed)

        if xnmt.backend_dynet and args.dynet_gpu and settings.CHECK_VALIDITY:
            settings.CHECK_VALIDITY = False
            log_preamble(
                "disabling CHECK_VALIDITY because it is not supported in the DyNet/GPU setting",
                logging.WARNING)

        config_experiment_names = YamlPreloader.experiment_names_from_file(
            args.experiments_file)

        results = []

        # Check ahead of time that all experiments exist, to avoid bad surprises
        experiment_names = args.experiment_name or config_experiment_names

        if args.experiment_name:
            nonexistent = set(experiment_names).difference(config_experiment_names)
            if nonexistent:
                raise Exception(
                    "Experiments {} do not exist".format(", ".join(nonexistent)))

        log_preamble(
            f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} with {'DyNet' if xnmt.backend_dynet else 'PyTorch'} on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )
        for experiment_name in experiment_names:

            ParamManager.init_param_col()

            uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(
                args.experiments_file, experiment_name, resume=args.resume)

            logger.info(f"=> Running {experiment_name}")

            glob_args = uninitialized_exp_args.data.exp_global
            log_file = glob_args.log_file

            if not settings.OVERWRITE_LOG:
                log_files_exist = []
                if os.path.isfile(log_file): log_files_exist.append(log_file)
                if os.path.isdir(log_file + ".tb"):
                    log_files_exist.append(log_file + ".tb/")
                if log_files_exist:
                    logger.warning(
                        f"log file(s) {' '.join(log_files_exist)} already exist, skipping experiment; "
                        "please delete the log file(s) by hand if you want to overwrite them "
                        "(or activate OVERWRITE_LOG, by either setting the environment variable OVERWRITE_LOG=1, "
                        "specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)"
                    )
                    continue
            elif os.path.isdir(log_file + ".tb"):
                # remove tensorboard logs from the previous run that is being overwritten
                shutil.rmtree(log_file + ".tb/")

            tee.set_out_file(log_file, exp_name=experiment_name)

            try:

                model_file = glob_args.model_file

                uninitialized_exp_args.data.exp_global.commandline_args = vars(
                    args)

                # Create the model
                experiment = initialize_if_needed(uninitialized_exp_args)
                ParamManager.param_col.model_file = experiment.exp_global.model_file
                ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
                ParamManager.populate()

                # Run the experiment
                eval_scores = experiment(
                    save_fct=lambda: save_to_file(model_file, experiment))
                results.append((experiment_name, eval_scores))
                print_results(results)

            except Exception as e:
                file_logger.error(traceback.format_exc())
                raise e
            finally:
                tee.unset_out_file()
Example #7
import os

import xnmt.tee
from xnmt.param_collection import ParamManager
from xnmt.persistence import initialize_if_needed, YamlPreloader, LoadSerialized, save_to_file

EXP_DIR = os.path.dirname(__file__)
EXP = "programmatic-load"

model_file = f"{EXP_DIR}/models/{EXP}.mod"
log_file = f"{EXP_DIR}/logs/{EXP}.log"

xnmt.tee.set_out_file(log_file)

ParamManager.init_param_col()

load_experiment = LoadSerialized(
  filename=f"{EXP_DIR}/models/programmatic.mod",
  overwrite=[
    {"path" : "train", "val" : None}
  ]
)

uninitialized_experiment = YamlPreloader.preload_obj(load_experiment, exp_dir=EXP_DIR, exp_name=EXP)
loaded_experiment = initialize_if_needed(uninitialized_experiment)

# if we were to continue training, we would need to set a save model file like this:
# ParamManager.param_col.model_file = model_file
ParamManager.populate()

# run experiment
loaded_experiment(save_fct=lambda: save_to_file(model_file, loaded_experiment))
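To continue training rather than only re-run evaluation, one would keep the saved train regimen and set a save model file before populating; a sketch under those assumptions:

# Hypothetical variant: omit the overwrite that nulls out "train" and point the
# parameter collection at a fresh model file so checkpoints can be written.
resume_spec = LoadSerialized(filename=f"{EXP_DIR}/models/programmatic.mod")
uninitialized = YamlPreloader.preload_obj(resume_spec, exp_dir=EXP_DIR, exp_name=EXP)
resumed_experiment = initialize_if_needed(uninitialized)
ParamManager.param_col.model_file = model_file
ParamManager.populate()
resumed_experiment(save_fct=lambda: save_to_file(model_file, resumed_experiment))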