def __init__(self, config):
        RLContextRemote = ray.remote(RLContext)
        ExperienceReplayBufferRemote = ray.remote(ExperienceReplayBuffer)

        self.config = config
        self.n_collectors = self.config.get('n_collectors', 3)

        if self.n_collectors == 0:
            self.rl_context = RLContext(config=self.config,
                                        gin_config_str=gin.config_str())
            self.replay_buffer = ExperienceReplayBuffer(config=self.config,
                                                        collectors=[])

        else:
            self.remote_rl_contexts = [
                RLContextRemote.remote(config=self.config,
                                       gin_config_str=gin.config_str())
                for _ in range(self.n_collectors)
            ]
            self.replay_buffer = ExperienceReplayBufferRemote.remote(
                config=self.config, collectors=self.remote_rl_contexts)
            self.next_batch_refs = set()
            self.stats_ref = None

        self.future_batch_size = self.config.get('future_batch_size', 10)
        self.collect_initial()
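
The example above wraps RLContext and ExperienceReplayBuffer with ray.remote and hands each remote collector the driver's gin.config_str(), because gin bindings are process-local and do not travel to Ray workers on their own. A minimal, self-contained sketch of that pattern (the Worker class and its methods are illustrative, not taken from the source):

import gin
import ray


class Worker:
    """Toy actor that rebuilds the driver's gin state in its own process."""

    def __init__(self, gin_config_str):
        # gin bindings live in process-local state, so the driver ships its
        # parsed config as a string and the actor re-parses it on startup.
        gin.parse_config(gin_config_str)

    def current_config(self):
        return gin.config_str()


ray.init(ignore_reinit_error=True)
WorkerRemote = ray.remote(Worker)
workers = [WorkerRemote.remote(gin.config_str()) for _ in range(3)]
print(ray.get(workers[0].current_config.remote()))
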
Example #2
def train_func(config):
    # Hyperparameters
    bindings = []
    for key, value in config.items():
        bindings.append(f'{key}={value}')

    # generate folder structures
    run_paths = utils_params.gen_run_folder(','.join(bindings))

    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config
    gin.parse_config_files_and_bindings(['configs/config.gin'], bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test = load_from_tfrecords()

    # model
    model = TransformerS2S()

    trainer = Trainer(model, ds_train, ds_val, run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)
Example #3
def descartes_builder(name='out', params=None):
    params = params or []
    print("Building grid search with parameters: ", params)

    directory = os.path.join('grids', name)
    if not os.path.exists(directory):
        os.makedirs(directory)

    all_values = []
    all_params = []
    for param in params:
        values = gin.query_parameter(param)
        all_params.append(param)
        all_values.append(values)
    descartes = itertools.product(*all_values)

    i = 0
    for one in descartes:

        exp_directory = os.path.join(directory, str(i))
        if not os.path.exists(exp_directory):
            os.makedirs(exp_directory)

        with gin.unlock_config():
            for param_idx in range(len(all_params)):
                gin.bind_parameter(all_params[param_idx], one[param_idx])

        config_str = gin.config_str()
        with open(os.path.join(exp_directory, 'config.gin'), 'w+') as f:
            f.write(config_str)
        i += 1
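
A hedged usage sketch for the builder above: every entry in params must name a gin parameter that is currently bound to a list, since itertools.product expands the Cartesian product of those lists into one grids/<name>/<i>/config.gin per grid point. The configurable and parameter names below are made up for illustration:

import gin


@gin.configurable
def train(lr=1e-3, batch_size=32):
    pass


# Bind each swept parameter to the list of values to try.
gin.parse_config("""
train.lr = [1e-3, 1e-4]
train.batch_size = [32, 64]
""")

# Writes grids/lr_bs_sweep/0/config.gin ... grids/lr_bs_sweep/3/config.gin,
# one file per (lr, batch_size) combination.
descartes_builder(name='lr_bs_sweep', params=['train.lr', 'train.batch_size'])
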
Example #4
def train_func(config):
    # Hyperparameters
    bindings = []
    for key, value in config.items():
        bindings.append(f'{key}={value}')

    # generate folder structures
    run_paths = utils_params.gen_run_folder(','.join(bindings))

    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config
    gin.parse_config_files_and_bindings(['/mnt/home/repos/dl-lab-skeleton/diabetic_retinopathy/configs/config.gin'], bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test, ds_info = load()

    # model
    model = vgg_like(input_shape=ds_info.features["image"].shape, n_classes=ds_info.features["label"].num_classes)

    trainer = Trainer(model, ds_train, ds_val, ds_info, run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)
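
The train_func variants in these examples receive a Ray Tune config dict whose keys are gin parameter names, so each key=value pair can be passed straight through as a gin binding. A rough sketch of how such a function is typically handed to Tune; the search space below is invented for illustration, not taken from the source:

from ray import tune

# Keys are gin parameter names; inside train_func each pair becomes a
# binding string such as "Trainer.total_steps=10000".
analysis = tune.run(
    train_func,
    num_samples=8,
    resources_per_trial={'cpu': 4, 'gpu': 1},
    config={
        'Trainer.total_steps': tune.grid_search([10000, 20000]),
        'vgg_like.base_filters': tune.choice([8, 16]),
        'vgg_like.dropout_rate': tune.uniform(0.1, 0.5),
    },
)
print(analysis.get_best_config(metric='val_accuracy', mode='max'))
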
Example #5
def train_func(config):
    # Hyperparameters
    bindings = []
    for key, value in config.items():
        bindings.append(f'{key}={value}')

    # generate folder structures
    run_paths = utils_params.gen_run_folder(bindings[2])

    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config
    # gin dir should be replaced by your own dir
    gin.parse_config_files_and_bindings([r'D:\Uni Stuttgart\Deep learning lab\Diabetic Retinopathy Detection\dl-lab-2020-team08\diabetic_retinopathy\configs\config.gin'],
                                        bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    train_ds, valid_ds, test_ds = datasets.load()

    # model
    model = DenseNet121(IMG_SIZE=256)

    trainer = Trainer(model=model, ds_train=train_ds, ds_val=test_ds, run_paths=run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)
Example #6
def setup_logger():
    # import os
    # Set run specific environment configurations
    timestamp = time.strftime("run_%Y_%m_%d_%H_%M_%S") + "_{machine}".format(
        machine=socket.gethostname())

    gin.bind_parameter(
        'multi_tasking_train.model_storage_directory',
        os.path.join(
            gin.query_parameter('multi_tasking_train.model_storage_directory'),
            timestamp))

    os.makedirs(
        gin.query_parameter('multi_tasking_train.model_storage_directory'),
        exist_ok=True)

    log.handlers.clear()
    formatter = logging.Formatter('%(message)s')
    fh = logging.FileHandler(
        os.path.join(
            gin.query_parameter('multi_tasking_train.model_storage_directory'),
            "log.txt"))
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    log.addHandler(fh)

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    log.setLevel(logging.INFO)
    log.addHandler(ch)

    # Set global GPU state
    if torch.cuda.is_available() and gin.query_parameter(
            'multi_tasking_train.device') == 'cuda':
        log.info("Using CUDA device:{0}".format(torch.cuda.current_device()))
    else:
        if gin.query_parameter('multi_tasking_train.device') == 'cpu':
            log.info("Utilizing CPU")
        else:
            raise Exception(
                f"Unrecognized device: {gin.query_parameter('multi_tasking_train.device')}"
            )

    # ML-Flow
    mlflow.set_tracking_uri(
        f"{gin.query_parameter('multi_tasking_train.ml_flow_directory')}")
    mlflow.set_experiment(
        f"/{gin.query_parameter('multi_tasking_train.experiment_name')}")

    mlflow.start_run()
    gin_parameters = gin.config._CONFIG.get(list(gin.config._CONFIG.keys())[0])
    mlflow.log_params(gin_parameters)

    # all_params = {x[1].split('.')[-1]: gin.config._CONFIG.get(x) for x in list(gin.config._CONFIG.keys())}
    all_params = gin.config_str()
    with open('config_log.txt', 'w') as f:
        f.write(all_params)
    mlflow.log_artifact("config_log.txt")
    mlflow.log_artifact(__file__)
Example #7
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_files,
                                        bindings=FLAGS.gin_bindings,
                                        skip_unknown=False)
    logging.info(gin.config_str())

    train_eval()
Example #8
def main(argv):
    del argv
    if not hasattr(FLAGS.hparams, "items"):
        FLAGS.hparams = utils.YAMLDictParser().parse(FLAGS.hparams)

    log_dir = FLAGS.neutra_log_dir
    utils.BindHParams(FLAGS.hparams)
    if FLAGS.restore_from_config:
        with tf.io.gfile.GFile(os.path.join(log_dir, "config")) as f:
            gin.parse_config(f.read())

    tf.io.gfile.makedirs(log_dir)
    summary_writer = tf.summary.create_file_writer(log_dir, flush_millis=10000)
    summary_writer.set_as_default()
    tf.summary.experimental.set_step(0)

    for i in range(10):
        try:
            checkpoint_log_dir = (FLAGS.checkpoint_log_dir
                                  if FLAGS.checkpoint_log_dir else
                                  FLAGS.neutra_log_dir)
            exp = neutra.NeuTraExperiment(log_dir=checkpoint_log_dir)
            with tf.io.gfile.GFile(os.path.join(log_dir, "config"), "w") as f:
                f.write(gin.config_str())
            logging.info("Config:\n%s", gin.config_str())

            checkpoint = checkpoint_log_dir + "/model.ckpt"
            if tf.io.gfile.exists(checkpoint + ".index"):
                logging.info("Restoring from %s", checkpoint)
                exp.checkpoint.restore(checkpoint)

            with utils.use_xla(False):
                if FLAGS.mode == "train":
                    Train(exp)
                elif FLAGS.mode == "objective":
                    TuneObjective(exp)
                elif FLAGS.mode == "benchmark":
                    Benchmark(exp)
                elif FLAGS.mode == "eval":
                    Eval(exp)
                break
        except tf.errors.InvalidArgumentError as e:
            if "NaN" in e.message:
                logging.error(e.message)
                logging.error("Got a NaN, try: %d", i)
            else:
                raise e
Example #9
def write_config(experiment_name):
    """Write output config"""
    base_path = os.getenv('AICROWD_OUTPUT_PATH', '../scratch/shared')
    path = os.path.join(base_path, experiment_name)
    if not os.path.exists(path):
        os.mkdir(path)
    with open(os.path.join(path, 'conf.gin'), 'w') as f:
        f.write(gin.config_str())
Example #10
 def _init_tboard_logging(self):
     self.summary_writer = SummaryWriter(
         log_dir=module_dirs.get_current_tboard_dir(),
         comment=self.agent_name,
     )
     # add config string to summary
     config_str = gin.config_str()
     # see https://stackoverflow.com/questions/45016458/tensorflow-tf-summary-text-and-linebreaks
     config_str = config_str.replace('\n', '  \n')
     self.summary_writer.add_text(tag='gin_config', text_string=config_str)
Example #11
def tuning(config):
    # set hyperparameters
    bindings = []
    for key, value in config.items():
        bindings.append('{}={}'.format(str(key), str(value)))

    # generate folder structures
    run_paths = utils_params.gen_run_folder(','.join(bindings))

    # gin-config
    gin.parse_config_files_and_bindings([config_path], bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test, ds_info = datasets.load(model_type=model_type)

    # setup model
    if model_name == 'VGG16':
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified Inception':
        model = simplified_inception(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified SEResNeXt':
        model = simplified_seresnext(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'RepVGG':
        model = rep_vgg(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'DenseNet201':
        model = densenet201(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'EfficientNetB3':
        model = efficientnetb3(input_shape=(256, 256, 3), model_type=model_type)
    else:
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)

    # set training loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # train the model
    trainer = Trainer(model, ds_train, ds_val, ds_info, model_type=model_type, run_paths=run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy * 100)

    # set validation loggers
    utils_misc.set_loggers(run_paths['path_logs_eval'], logging.INFO)

    # evaluate the model
    trained_model = trainer.model_output()
    if model_type == 'regression':
        trained_model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.Huber(delta=0.3), metrics=[BinaryAccuracy(model_type=model_type)])
    elif model_type == 'binary_classification':
        trained_model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=[BinaryAccuracy(model_type=model_type)])
    elif model_type == 'multi_classification':
        trained_model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=[BinaryAccuracy(model_type=model_type)])

    result = trained_model.evaluate(ds_test, return_dict=True)
    test_accuracy = result['binary_accuracy']
    tune.report(test_accuracy=test_accuracy * 100)
Example #12
 def __getstate__(self):
     result = {
         k: getattr(self, k)
         for k in self.PICKLE_DIRECTLY if hasattr(self, k)
     }
     result['trainables_weights'] = {
         k: v.state_dict()
         for k, v in self.trainables.items()
     }
     result['gin_config'] = gin.config_str()
     return result
Example #13
def load_config(save_config=True):
    """Loads config."""
    gin.parse_config_files_and_bindings(flags.FLAGS.gin_configs,
                                        flags.FLAGS.gin_bindings,
                                        skip_unknown=True)
    config = Config()
    if save_config and jax.host_id() == 0:
        os.makedirs(config.checkpoint_dir)
        with open(config.checkpoint_dir + '/config.gin', 'w') as f:
            f.write(gin.config_str())
    return config
Example #14
def _write_gin_configs(output_file, operative=True):
    """Writes current gin configs to `output_file`."""
    if operative:
        config_str = gin.operative_config_str()
    else:
        config_str = gin.config_str()

    logging.info('=' * 80)
    logging.info('Gin configs\n%s', config_str)
    logging.info('=' * 80)
    with tf.io.gfile.GFile(output_file, 'w') as f:
        f.write(config_str)
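
Example #14 switches between gin.config_str() and gin.operative_config_str(): the former serializes every binding that was parsed, while the latter includes only parameters of configurables that have actually been called. A small sketch of the difference, with an illustrative configurable:

import gin


@gin.configurable
def make_optimizer(learning_rate=0.001, momentum=0.9):
    return {'lr': learning_rate, 'momentum': momentum}


gin.parse_config('make_optimizer.learning_rate = 0.01')

print(gin.config_str())            # contains the binding as soon as it is parsed
make_optimizer()                   # the binding only becomes "operative" once used
print(gin.operative_config_str())  # now includes make_optimizer's parameters
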
Example #15
def main(argv):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param, skip_unknown=True)
    print("Gin parameter bindings:\n{}".format(gin.config_str()))

    use_neptune = "NEPTUNE_API_TOKEN" in os.environ
    exp_id = ''

    if use_neptune:
        neptune.init(project_qualified_name='bbeatrix/curl')
        exp = neptune.create_experiment(params=gin_config_to_dict(gin.config_str()),
                                        name=FLAGS.gin_file[0].split('/')[-1][:-4],
                                        upload_source_files=['./*.py'])
        exp_id = exp.id
    else:
        neptune.init('shared/onboarding', 'ANONYMOUS', backend=neptune.OfflineBackend())

    neptune.log_text('gin_config', gin.config_str())
    neptune.log_artifact(*FLAGS.gin_file, 'gin_config_{}.gin'.format(exp_id))

    exp_manager = ExperimentManager(prefix=exp_id)
    exp_manager.run_experiment()

    neptune.stop()
    print("Fin")
Example #16
def main(argv):
  del argv  # Unused.
  gin.parse_config_files_and_bindings(FLAGS.gin_configs, FLAGS.gin_bindings)

  tf.io.gfile.makedirs(FLAGS.summary_dir)
  save_gin_config(gin.config_str(),
                  os.path.join(FLAGS.summary_dir, "config.parsed.gin"))

  if FLAGS.run_train:
    trainer.train(ckpt_dir=FLAGS.ckpt_dir, summary_dir=FLAGS.summary_dir)
  if FLAGS.run_eval:
    evaluator.evaluate(ckpt_dir=FLAGS.ckpt_dir, summary_dir=FLAGS.summary_dir)

  save_gin_config(gin.operative_config_str(),
                  os.path.join(FLAGS.summary_dir, "config.operative.gin"))
Example #17
 def write_gin_config(output_folder, swp=''):
     if swp != '':
         swp = swp.lower()
         for c in " {}[]'":
             swp = swp.replace(c, '')
         for c in ",:":
             swp = swp.replace(c, '_')
         output_folder += f':{swp}'
     os.makedirs(output_folder, exist_ok=True)
     gin.bind_parameter('output_folder._output_folder',
                        os.path.abspath(output_folder))
     output_path = os.path.join(output_folder, 'config.gin')
     with open(output_path, 'w') as f:
         logging.info(f'{swp} -> {output_path}')
         f.write(gin.config_str())
     return output_path
Example #18
def main(argv):
  del argv

  save_dir = FLAGS.save_dir
  if FLAGS.index is not None:
    save_dir = os.path.join(save_dir, str(FLAGS.index))
  logging.info('SAVE DIR: %s', save_dir)

  gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_bindings)
  logging.info('CONFIG DIRS: %s', str(FLAGS.gin_config))

  gfile.makedirs(save_dir)
  with gfile.GFile(os.path.join(save_dir, 'config.gin'), 'w') as f:
    f.write(gin.config_str())

  train(save_dir)
Example #19
def main(argv):
    # generate folder structures
    run_paths = utils_params.gen_run_folder()

    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config
    gin.parse_config_files_and_bindings(['configs/config.gin'], [])
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test = load_tfrecords.load_from_tfrecords()

    # print number of available GPUs
    print("Num GPUs Available: ",
          len(tf.config.experimental.list_physical_devices('GPU')))

    if FLAGS.train:
        model = TransformerS2S()
        model.build((None, 250, 6))
        model.summary()
        trainer = Trainer(model, ds_train, ds_val, run_paths)
        for _ in trainer.train():
            continue

    else:
        # get one completely trained model to do evaluating
        opt = tf.keras.optimizers.Adam()
        model = TransformerS2S()
        ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                                   optimizer=opt,
                                   net=model)

        # change ckpt dir to load the ckpt you want
        manager = tf.train.CheckpointManager(
            ckpt,
            "/content/drive/MyDrive/experiments/run_2021-01-24T13-52-22-787253/ckpts",
            max_to_keep=3)
        ckpt.restore(manager.latest_checkpoint)
        print("Restored from {}".format(manager.latest_checkpoint))
        evaluate(model, ds_test)
Example #20
def run():
    """Run the beam pipeline to create synthetic dataset."""
    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options)
    with beam.Pipeline(options=pipeline_options) as pipeline:
        for gin_search_path in [GIN_PATH] + FLAGS.gin_search_path:
            gin.add_config_file_search_path(gin_search_path)
        gin.parse_config_files_and_bindings(FLAGS.gin_file,
                                            FLAGS.gin_param,
                                            skip_unknown=True)

        np.random.seed(FLAGS.random_seed)
        _ = (pipeline
             | beam.Create(np.random.randint(2**32, size=FLAGS.num_examples))
             | beam.ParDo(GenerateExampleFn(gin.config_str()))
             | beam.Reshuffle()
             | beam.Map(_float_dict_to_tfexample)
             | beam.io.tfrecordio.WriteToTFRecord(FLAGS.output_tfrecord_path,
                                                  num_shards=FLAGS.num_shards,
                                                  coder=beam.coders.ProtoCoder(
                                                      tf.train.Example)))
Example #21
def main(argv):
    # generate folder structures
    run_paths = utils_params.gen_run_folder(folder)

    # gin-config
    gin.parse_config_files_and_bindings(['configs/config.gin'], [])
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test, ds_info = datasets.load(model_type=model_type)

    # setup model
    if model_name == 'VGG16':
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified Inception':
        model = simplified_inception(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified SEResNeXt':
        model = simplified_seresnext(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'RepVGG':
        model = rep_vgg(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'DenseNet201':
        model = densenet201(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'EfficientNetB3':
        model = efficientnetb3(input_shape=(256, 256, 3), model_type=model_type)
    else:
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)
    model.summary()

    if FLAGS.train:
        # set training loggers
        utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)
        # train the model
        trainer = Trainer(model, ds_train, ds_val, ds_info, model_type=model_type, run_paths=run_paths)
        for _ in trainer.train():
            continue
    else:
        # set validation loggers
        utils_misc.set_loggers(run_paths['path_logs_eval'], logging.INFO)
        # evaluate the model
        evaluate(model, ds_test, ds_info, model_type=model_type, run_paths=run_paths)
Example #22
def preprocess(
        target_processor: DataProcessor.__class__,
        output_dir: str,
        ignore_asserts: bool,
        random_seed=None,
):
    os.makedirs(output_dir, exist_ok=True)
    setup_logger(output_dir, target_processor.__name__)

    LOGGER.info("GOT config: \n======config======\n %s \n========config=======" % gin.config_str())
    if random_seed is not None:
        LOGGER.info('Setting random seed to %d', random_seed)
        seed_everything(random_seed)
    for data_df, basename in parse():
        LOGGER.info("[Preprocess]: started processing a df with %d rows:" % len(data_df))
        processor: DataProcessor = target_processor(data_df=data_df,
                                                    output_dir=output_dir)

        generator = processor.generate_chunks_iterable()

        preprocessed_chunks = []
        try:
            for (idx, df_chunk) in tqdm(generator):
                try:
                    data_chunk = processor.construct_chunk(df_chunk)
                except AssertionError as ex:
                    if ignore_asserts:
                        LOGGER.warning("GOT ASSERT %r on idx %d" % (ex, idx))
                        continue
                    else:
                        raise ex
                preprocessed_chunks.append(
                    processor.preprocess_chunk(chunk=data_chunk, idx=basename)
                )
        except KeyboardInterrupt as ex:
            LOGGER.warning("BREAKING by interrupt. got %d processed chunks" % len(preprocessed_chunks))
        processed_data = processor.postprocess_chunks(preprocessed_chunks)
        processor.save_on_disk(processed_data)
Example #23
def main(argv):

    # generate folder structures
    run_paths = utils_params.gen_run_folder()
    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config
    gin.parse_config_files_and_bindings([
        r'D:\Uni Stuttgart\Deep learning lab\Diabetic Retinopathy Detection\dl-lab-2020-team08\diabetic_retinopathy\configs\config.gin'
    ], [])
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    train_ds, valid_ds, test_ds = datasets.load()

    # training including fine tuning
    if FLAGS.train:
        # model
        model = DenseNet121(IMG_SIZE=256)
        model.summary()

        # training and fine tuning
        trainer = Trainer(model=model,
                          ds_train=train_ds,
                          ds_val=valid_ds,
                          run_paths=run_paths)
        for _ in trainer.train():
            continue

    else:
        # evaluation
        # model dir should be replaced by saved model dir
        model_dir = r"\diabetic_retinopathy\logs\20201221-225335\saved_model_ft"
        model = tf.keras.models.load_model(model_dir)
        evaluate(model, valid_ds)
Example #24
def main(_):
    # https://github.com/google-research/text-to-text-transfer-transformer/blob/c0ea75dbe9e35a629ae2e3c964ef32adc0e997f3/t5/models/mesh_transformer_main.py#L149
    # Add search path for gin files stored in package.
    gin.add_config_file_search_path(
        pkg_resources.resource_filename(__name__, "gin"))
    gin.parse_config_files_and_bindings(FLAGS.gin_file,
                                        FLAGS.gin_param,
                                        finalize_config=True)
    pl.seed_everything(1234)
    with gin.config_scope('sroie_t5_baseline'):
        task_functions_maps = get_tasks_functions_maps()

    # Datasets
    with gin.config_scope('train_sroie'):
        train_keynames = get_all_keynames_from_dir()

    with gin.config_scope('validation_sroie'):
        val_keynames = get_all_keynames_from_dir()

    train_datasets = get_datasets_dict_from_task_functions_map(
        keynames=train_keynames, tasks_functions_maps=task_functions_maps)
    val_datasets = get_datasets_dict_from_task_functions_map(
        keynames=val_keynames, tasks_functions_maps=task_functions_maps)

    with gin.config_scope('task_train'):
        task_train = operative_macro()

    # Initializing model
    model = T5OCRBaseline()

    # Trainer
    if FLAGS.debug:
        logger = False
        trainer_callbacks = []
    else:
        logger = NeptuneLogger(
            close_after_fit=False,
            api_key=os.environ["NEPTUNE_API_TOKEN"],
            # project_name is set via gin file
            # params=None,
            tags=[model.t5_model_prefix, task_train, 't5_ocr_baseline'])
        with gin.config_scope('sroie_t5_baseline'):
            checkpoint_callback = config_model_checkpoint(
                monitor=None if FLAGS.best_model_run_mode else "val_f1",
                dirpath=("/home/marcospiau/final_project_ia376j/checkpoints/"
                         f"{logger.project_name.replace('/', '_')}/"
                         "t5_ocr_baseline/"),
                prefix=(
                    f"experiment_id={logger.experiment.id}-task={task_train}-"
                    "t5_model_prefix="
                    f"{model.t5_model_prefix.replace('-', '_')}"),
                filename=("{step}-{epoch}-{val_precision:.6f}-{val_recall:.6f}"
                          "-{val_f1:.6f}-{val_exact_match:.6f}"),
                mode="max",
                save_top_k=None if FLAGS.best_model_run_mode else 1,
                verbose=True)
        early_stop_callback = config_early_stopping_callback()
        trainer_callbacks = [checkpoint_callback, early_stop_callback]

    trainer = Trainer(
        checkpoint_callback=not (FLAGS.debug),
        log_gpu_memory=True,
        # profiler=FLAGS.debug,
        logger=logger,
        callbacks=trainer_callbacks,
        progress_bar_refresh_rate=1,
        log_every_n_steps=1)
    # Dataloaders
    train_loader_kwargs = {
        'num_workers': mp.cpu_count(),
        'shuffle': trainer.overfit_batches == 0,
        'pin_memory': True
    }

    if trainer.overfit_batches != 0:
        with gin.unlock_config():
            gin.bind_parameter(
                'get_dataloaders_dict_from_datasets_dict.batch_size', 1)

    eval_loader_kwargs = {**train_loader_kwargs, **{'shuffle': False}}

    train_dataloaders = get_dataloaders_dict_from_datasets_dict(
        datasets_dict=train_datasets, dataloader_kwargs=train_loader_kwargs)
    val_dataloaders = get_dataloaders_dict_from_datasets_dict(
        datasets_dict=val_datasets, dataloader_kwargs=eval_loader_kwargs)

    # Logging important artifacts and params
    if logger:
        to_upload = {
            'gin_operative_config.gin': gin.operative_config_str(),
            'gin_complete_config.gin': gin.config_str(),
            'abseil_flags.txt': FLAGS.flags_into_string()
        }
        for destination, content in to_upload.items():
            buffer = StringIO(initial_value=content)
            buffer.seek(0)
            logger.log_artifact(buffer, destination=destination)
        params_to_log = dict()
        params_to_log['str_replace_newlines'] = gin.query_parameter(
            'sroie_t5_baseline/get_default_preprocessing_functions.'
            'str_replace_newlines')
        params_to_log['task_train'] = task_train
        params_to_log['patience'] = early_stop_callback.patience
        params_to_log['max_epochs'] = trainer.max_epochs
        params_to_log['min_epochs'] = trainer.min_epochs
        params_to_log[
            'accumulate_grad_batches'] = trainer.accumulate_grad_batches
        params_to_log['batch_size'] = train_dataloaders[task_train].batch_size

        for k, v in params_to_log.items():
            logger.experiment.set_property(k, v)

    trainer.fit(model,
                train_dataloader=train_dataloaders[task_train],
                val_dataloaders=val_dataloaders[task_train])

    # Logging best metrics and saving best checkpoint on Neptune experiment
    if logger:
        trainer.logger.experiment.log_text(
            log_name='best_model_path',
            x=trainer.checkpoint_callback.best_model_path)
        if not (FLAGS.best_model_run_mode):
            trainer.logger.experiment.log_metric(
                'best_model_val_f1',
                trainer.checkpoint_callback.best_model_score.item())
        if FLAGS.upload_best_checkpoint:
            trainer.logger.experiment.log_artifact(
                trainer.checkpoint_callback.best_model_path)

        trainer.logger.experiment.stop()
Example #25
def get_options_dict(n_epochs=None,
                     log_dir=gin.REQUIRED,
                     log_name=gin.REQUIRED,
                     trn_files=gin.REQUIRED,
                     tst_files=gin.REQUIRED,
                     input_shape=gin.REQUIRED,
                     test_only=False,
                     restore=None,
                     restore_resnet_features=None,
                     original_transnet=False,
                     transition_only_trn_files=None,
                     create_dir_and_summaries=True,
                     transition_only_data_fraction=0.3,
                     c3d_net=False,
                     bi_tempered_loss=False,
                     bi_tempered_loss_temp2=1.,
                     learning_rate_schedule=None,
                     learning_rate_decay=None):
    trn_files_ = []
    for fn in trn_files:
        trn_files_.extend(glob.glob(fn))

    if transition_only_trn_files is not None:
        transition_trn_files_ = []
        for fn in transition_only_trn_files:
            transition_trn_files_.extend(glob.glob(fn))

    tst_files_ = {}
    for k, v in tst_files.items():
        tst_files_[k] = []
        for fn in v:
            tst_files_[k].extend(glob.glob(fn))

    log_dir = os.path.join(log_dir, log_name + "_" + datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"))
    summary_writer = tf.summary.create_file_writer(log_dir) if create_dir_and_summaries else None

    config_str = gin.config_str().replace("# ", "### ").split("\n")
    config_str = "\n\n".join([l for l in config_str if not l.startswith("### =====")])

    if create_dir_and_summaries:
        with summary_writer.as_default():
            tf.summary.text("config", config_str, step=0)
        with open(os.path.join(log_dir, "config.gin"), "w") as f:
            f.write(gin.config_str())

    print("\n{}\n".format(log_name.upper()))

    return {
        "n_epochs": n_epochs,
        "log_dir": log_dir,
        "summary_writer": summary_writer,
        "trn_files": trn_files_,
        "tst_files": tst_files_,
        "input_shape": input_shape,
        "test_only": test_only,
        "restore": restore,
        "restore_resnet_features": restore_resnet_features,
        "original_transnet": original_transnet,
        "transition_only_trn_files": transition_trn_files_ if transition_only_trn_files is not None else None,
        "transition_only_data_fraction": transition_only_data_fraction,
        "c3d_net": c3d_net,
        "bi_tempered_loss": bi_tempered_loss,
        "bi_tempered_loss_temp2": bi_tempered_loss_temp2,
        "learning_rate_schedule": learning_rate_schedule,
        "learning_rate_decay": learning_rate_decay
    }
Example #26
def train(experiment_name,
          ml_flow_directory,
          data_directory,
          ablation_amount,
          transformer_weights,
          use_pretrained_heads,
          model_storage_directory,
          device,
          learning_rate,
          seed,
          repeat_in_epoch_sampling,
          evaluation_interval=1,
          checkpoint_interval=1,
          shuffle=True,
          num_workers=0,
          num_epochs=1,
          transformer_hidden_size=768,
          transformer_dropout_prob=.1,
          eval_on_dev=True,
          write_predictions=False):
    """

    Args:
        eval_on_dev: If True, evaluation is performed on using the development
            dataset during training/fine-tuning.
        write_predictions: If True, model output predictions and probabilities
            will be written to file.
        ablation_amount (float): the proportion of examples to ablate from
            training dataset, leaving 1-ablation_amount remaining.
    """
    log.info(gin.config_str())
    mlflow.set_tag('ablation', ablation_amount)
    torch.random.manual_seed(seed)
    ablation_amount = float(ablation_amount)  # type check this

    heads_and_datasets = prepare_datasets(
        ablation_amount,
        data_directory,
        num_workers,
        shuffle,
        transformer_dropout_prob,
        transformer_hidden_size,
        eval_on_dev=True)  # uses dev set for training eval
    validation_heads_and_dataloaders = prepare_datasets(
        ablation_amount,
        data_directory,
        num_workers,
        shuffle,
        transformer_dropout_prob,
        transformer_hidden_size,
        eval_on_dev=False)  # uses test set for training eval
    validation_heads_and_dataloaders = [
        (head, training_set, test_set)
        for (head, training_set, _), (_, _, test_set) in zip(
            heads_and_datasets, validation_heads_and_dataloaders)
    ]

    print(f'Data loaded. Begin Training {len(heads_and_datasets)} Tasks.')
    print(
        f'ABLATION AMOUNT {ablation_amount} ({(1 - ablation_amount) * 100} percent of data retained)'
    )

    # TRAIN MODEL
    heads = [head for head, _, _ in heads_and_datasets]

    mlflow.set_tag('number_tasks', str(len(heads)))
    mtb = MultiTaskingBert(heads,
                           model_storage_directory=model_storage_directory,
                           transformer_weights=transformer_weights,
                           device=device,
                           learning_rate=learning_rate,
                           use_pretrained_heads=use_pretrained_heads,
                           write_predictions=write_predictions,
                           time_stamp=time.strftime("%Y%m%d%H%M%S"))
    # heads = [(head, test_loader) for head, _, test_loader in validation_heads_and_dataloaders]
    # mtb.predict(heads, partition="dev")

    mtb.fit(heads_and_datasets,
            num_epochs=num_epochs,
            evaluation_interval=evaluation_interval,
            checkpoint_interval=checkpoint_interval,
            repeat_in_epoch_sampling=repeat_in_epoch_sampling,
            validation_heads_and_dataloaders=validation_heads_and_dataloaders)
Example #27
    train(experiment_dir, model, optimizer, train_loader, val_loader, loss_fn,
          metric_fns, num_epochs, device, save_every, phases, dump_predictions)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        required=True,
                        help='Configuration file')
    parser.add_argument('-p',
                        '--parameter',
                        action='append',
                        help='Override parameters (\'parameter=value\')')
    args = parser.parse_args()

    experiment_name = '{0:%Y-%m-%d-%H-%M-%S}'.format(datetime.now())

    experiment_dir = path.join('.', 'experiments', experiment_name)
    os.makedirs(path.join(experiment_dir, 'weights'))

    # Load configuration file if specified
    gin.parse_config_file(args.config)
    if args.parameter:
        gin.parse_config(args.parameter)

    with open(path.join(experiment_dir, 'config.gin'), 'w') as f:
        f.write(gin.config_str())

    main(experiment_dir)
Example #28
def train(params: AttrDict) -> Any:
    """Main training function."""
    torch.manual_seed(params.seed)  #type: ignore
    np.random.seed(params.seed)

    ############################
    mfpNet = import_mfp_net(params)
    ############################

    batch_size = 1
    data_hz = 10
    ns_between_samples = (1.0 / data_hz) * 1e9
    d_s = params.subsampling
    t_h = params.hist_len_orig_hz
    t_f = params.fut_len_orig_hz
    NUM_WORKERS = 1

    ROOT_PATH = 'multiple_futures_prediction/'
    DATA_PATH = os.path.join(ROOT_PATH, 'carla_data_cfo')
    if params.scenario == 0:
        DATASET_DIR = os.path.join(DATA_PATH, 'Left_Turn_Dataset')
    elif params.scenario == 1:
        DATASET_DIR = os.path.join(DATA_PATH, 'Overtake_Dataset')
    elif params.scenario == 2:
        DATASET_DIR = os.path.join(DATA_PATH, 'Right_Turn_Dataset')
    else:
        raise ValueError(params.scenario)
    print("Loading dataset from:", str(os.path.join(DATASET_DIR, 'train')))

    # Loading the dataset.
    train_set = CarlaDataset(str(os.path.join(DATASET_DIR, 'train')),
                             t_h,
                             t_f,
                             d_s,
                             params.encoder_size,
                             params.use_gru,
                             params.self_norm,
                             params.data_aug,
                             params.use_context,
                             params.nbr_search_depth,
                             rotate_fut=params.rotate_pov)
    val_set = CarlaDataset(
        str(os.path.join(DATASET_DIR, 'train')),  # NOTE: using train for val
        t_h,
        t_f,
        d_s,
        params.encoder_size,
        params.use_gru,
        params.self_norm,
        params.data_aug,
        params.use_context,
        params.nbr_search_depth,
        rotate_fut=params.rotate_pov)

    train_data_loader = DataLoader(train_set,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=NUM_WORKERS,
                                   collate_fn=train_set.collate_fn,
                                   drop_last=True)  # type: ignore
    val_data_loader = DataLoader(val_set,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=NUM_WORKERS,
                                 collate_fn=val_set.collate_fn,
                                 drop_last=True)  #type: ignore

    # Compute or load existing mean over future trajectories.
    y_mean = get_mean(train_data_loader)

    # Initialize network
    net = mfpNet(params)
    if params.use_cuda:
        net = net.cuda()  #type: ignore

    net.y_mean = y_mean
    y_mean = torch.tensor(net.y_mean)

    if params.log:
        logger_file, logging_dir = setup_logger(ROOT_PATH + "./checkpts/",
                                                'CARLA')
        # Save the gin config used in the checkpoint dir
        f = os.path.join(logging_dir, 'config.gin')
        print("Saving gin params to", f)
        with open(f, 'w') as fh:
            fh.write(gin.config_str())

    train_loss = []  # removed typing for oatomobile / py35 compat
    val_loss = []

    MODE = 'Pre'  # For efficiency, we first pre-train w/o interactive rollouts.
    num_updates = 0
    optimizer = None

    # Save a checkpoint of the initial (untrained) model
    if params.log:
        msg_str = '\nSaving state, update iter:%d %s' % (num_updates,
                                                         logging_dir)
        print(msg_str)
        logger_file.write(msg_str)
        logger_file.flush()
        torch.save(net.state_dict(),
                   logging_dir + '/checkpoints/carla_%06d' % num_updates +
                   '.pth')  #type: ignore

    for epoch_num in range(20):
        if MODE == 'EndPre':
            MODE = 'Train'
            print('Training with interactive rollouts.')
            bStepByStep = True
        else:
            print('Pre-training without interactive rollouts.')
            bStepByStep = False

        # Average losses.
        avg_tr_loss = 0.
        avg_tr_time = 0.
        loss_counter = 0.0

        for i, data in enumerate(train_data_loader):
            if num_updates > params.pre_train_num_updates and MODE == 'Pre':
                MODE = 'EndPre'
                break

            # Implements the decaying noise on the NLL mentioned in email correspondence
            if params.log_posterior_unnorm_noise:
                if num_updates < params.init_noise_iters:
                    nll_noise = params.init_noise_value
                else:
                    nll_noise_fac = np.power(
                        0.1, num_updates // params.updates_div_by_10)
                    nll_noise = max(params.final_noise_value,
                                    params.init_noise_value * nll_noise_fac)
            else:
                nll_noise = 0.0

            lr_fac = np.power(0.1, num_updates // params.updates_div_by_10)
            lr = max(params.min_lr, params.lr_init * lr_fac)
            if optimizer is None:
                optimizer = torch.optim.Adam(net.parameters(),
                                             lr=lr)  #type: ignore
            elif lr != optimizer.defaults['lr']:
                optimizer = torch.optim.Adam(net.parameters(), lr=lr)

            st_time = time.time()
            hist, nbrs, mask, fut, mask, context, yaws, nbrs_info = data

            if params.remove_y_mean:
                fut = fut - y_mean.unsqueeze(1)

            if params.use_cuda:
                hist = hist.cuda()
                nbrs = nbrs.cuda()
                mask = mask.cuda()
                fut = fut.cuda()
                mask = mask.cuda()
                if context is not None:
                    context = context.cuda()
                if yaws is not None:
                    yaws = yaws.cuda()

            # Forward pass.
            visualize = False
            if params.no_atten_model:
                # New version of model (model_simple) supports rotating with yaws
                fut_preds, modes_pred = net.forward_mfp(
                    hist,
                    nbrs,
                    mask,
                    context,
                    nbrs_info,
                    fut,
                    bStepByStep,
                    use_forcing=params.use_forcing,  # NOTE missing from ngsim
                    visualize=visualize,
                    yaws=yaws,
                    rotate_hist=params.rotate_pov)
            else:
                fut_preds, modes_pred = net.forward_mfp(
                    hist,
                    nbrs,
                    mask,
                    context,
                    nbrs_info,
                    fut,
                    bStepByStep,
                    use_forcing=params.use_forcing,  # NOTE missing from ngsim
                    visualize=visualize)

            if params.modes == 1:
                if nll_noise != 0.0:
                    raise ValueError(
                        "k=1 does not support non-zero noise (%f)" %
                        nll_noise)
                l = nll_loss(fut_preds[0], fut, mask)
            else:
                l = nll_loss_multimodes(fut_preds,
                                        fut,
                                        mask,
                                        modes_pred,
                                        noise=nll_noise)  # type: ignore

            # Backprop.
            optimizer.zero_grad()
            l.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 10)  #type: ignore
            optimizer.step()
            num_updates += 1

            batch_time = time.time() - st_time
            avg_tr_loss += l.item()
            avg_tr_time += batch_time

            effective_batch_sz = float(hist.shape[1])
            if num_updates % params.iter_per_err == params.iter_per_err - 1:
                print("Epoch no:", epoch_num, "update:", num_updates,
                      "| Avg train loss:", format(avg_tr_loss / 100, '0.4f'),
                      "learning_rate:%.5f" % lr, "nll_noise:%.5f" % nll_noise)
                train_loss.append(avg_tr_loss / 100)

                if params.log:
                    msg_str_ = ("Epoch no:", epoch_num, "update:", num_updates,
                                "| Avg train loss:",
                                format(avg_tr_loss / 100,
                                       '0.4f'), "learning_rate:%.5f" % lr,
                                "nll_noise:%.5f" % nll_noise)
                    msg_str = str([str(ss) for ss in msg_str_])
                    logger_file.write(msg_str + '\n')
                    logger_file.flush()

                avg_tr_loss = 0.
                if num_updates % params.iter_per_eval == params.iter_per_eval - 1:
                    print("Starting eval")
                    val_nll_err = eval('nll',
                                       net,
                                       params,
                                       val_data_loader,
                                       bStepByStep,
                                       use_forcing=params.use_forcing,
                                       y_mean=y_mean,
                                       num_batches=500,
                                       dataset_name='val_dl nll')

                    if params.log:
                        logger_file.write('val nll: ' + str(val_nll_err) +
                                          '\n')
                        logger_file.flush()

            # Save weights.
            if params.log and num_updates % params.iters_per_save == params.iters_per_save - 1:
                msg_str = '\nSaving state, update iter:%d %s' % (num_updates,
                                                                 logging_dir)
                print(msg_str)
                logger_file.write(msg_str)
                logger_file.flush()
                torch.save(net.state_dict(), logging_dir +
                           '/checkpoints/carla_%06d' % num_updates +
                           '.pth')  #type: ignore
Example #29
def train(experiment_name,
          ml_flow_directory,
          transformer_weights,
          model_storage_directory,
          device,
          repeat_in_epoch_sampling,
          learning_rate,
          seed,
          evaluation_interval=1,
          checkpoint_interval=1,
          shuffle=True,
          num_workers=1,
          num_epochs=1,
          transformer_hidden_size=768,
          transformer_dropout_prob=.1):

    log.info(gin.config_str())

    torch.random.manual_seed(seed)
    heads_and_datasets = []
    load_clinical_configured_tasks()

    print("MT training with the following tasks:")
    pprint(TASKS)

    for task in TASKS:
        for dataset in TASKS[task]:
            train_dataset = DATASETS[task](TASKS[task][dataset]['train'])
            test_dataset = DATASETS[task](TASKS[task][dataset]['test'])

            labels = train_dataset.entity_labels if hasattr(
                train_dataset, 'entity_labels') else None
            if hasattr(train_dataset, 'class_labels'):
                labels = train_dataset.class_labels

            head = HEADS[TASKS[task][dataset]['head']](
                dataset,
                labels=labels,
                hidden_size=transformer_hidden_size,
                hidden_dropout_prob=transformer_dropout_prob)

            if TASKS[task][dataset]['head'] == 'subword_classification':
                if 'evaluate_biluo' in TASKS[task][dataset]:
                    head.config.evaluate_biluo = TASKS[task][dataset][
                        'evaluate_biluo']
                else:
                    head.config.evaluate_biluo = False
            heads_and_datasets.append(
                (head,
                 DataLoader(train_dataset,
                            batch_size=TASKS[task][dataset]['batch_size'],
                            num_workers=num_workers),
                 DataLoader(test_dataset,
                            batch_size=TASKS[task][dataset]['batch_size'],
                            shuffle=shuffle,
                            num_workers=num_workers)))

    heads = [head for head, _, _ in heads_and_datasets]
    mlflow.set_tag('number_tasks', str(len(heads)))
    mtb = MultiTaskingBert(heads,
                           model_storage_directory=model_storage_directory,
                           transformer_weights=transformer_weights,
                           device=device,
                           learning_rate=learning_rate)
    mtb.fit(heads_and_datasets,
            num_epochs=num_epochs,
            evaluation_interval=evaluation_interval,
            checkpoint_interval=checkpoint_interval,
            repeat_in_epoch_sampling=repeat_in_epoch_sampling)
Example #30
def main(argv):
    """ Run evaluation script.

    :param argv: Command line arguments.
    """
    # Configure information displayed to terminal.
    np.set_printoptions(precision=2)
    warnings.filterwarnings("ignore")

    # Set-up the result directory.
    run_dir = settings.get_run_dir()
    if osp.exists(run_dir):
        print("Cannot resume previously saved run, overwriting data.")
    else:
        os.mkdir(run_dir)

    # Set-up logging.
    logger = logging.getLogger("attackgraph")
    logger.setLevel(logging.INFO)
    logger.propagate = False
    logger.handlers = []  # absl has a default handler that we need to remove.
    # logger.propagate = False
    formatter = logging.Formatter(
        "%(asctime)s %(name)s %(levelname)s %(message)s")
    # Log to terminal.
    terminal_handler = logging.StreamHandler()
    terminal_handler.setFormatter(formatter)
    # Log to file.
    file_handler = logging.FileHandler(osp.join(run_dir, "out.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    # Debug output.
    debug_handler = logging.FileHandler(osp.join(run_dir, "debug.log"))
    debug_handler.setLevel(logging.DEBUG)
    debug_handler.setFormatter(formatter)
    # Register handlers.
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)
    logger.addHandler(debug_handler)

    logger.info(f"Saving results to: {run_dir}")

    # Set-up gin configuration.
    gin_files = [
        osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
        for x in FLAGS.config_files
    ]
    gin.parse_config_files_and_bindings(config_files=gin_files,
                                        bindings=FLAGS.config_overrides,
                                        skip_unknown=False)

    # Save program flags.
    with open(osp.join(run_dir, "flags.txt"), "w") as flag_file:
        # We want only flags relevant to this module to be saved, no extra flags.
        # See: https://github.com/abseil/abseil-py/issues/92
        key_flags = FLAGS.get_key_flags_for_module(argv[0])
        key_flags = "\n".join(flag.serialize() for flag in key_flags)
        flag_file.write(key_flags)
    with open(osp.join(run_dir, "config.txt"), "w") as config_file:
        config_file.write(gin.config_str())

    # Properly restrict pytorch to not consume extra resources.
    #  - https://github.com/pytorch/pytorch/issues/975
    #  - https://github.com/ray-project/ray/issues/3609
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"

    evaluate_qmix([
        player2_policies.Player2v0(),
        player2_policies.Player2v1(),
        player2_policies.Player2v2(),
        player2_policies.Player2v3(),
        player2_policies.Player2v4()
    ])