def __init__(self, config):
    # Wrap the collector and replay-buffer classes as Ray actors.
    RLContextRemote = ray.remote(RLContext)
    ExperienceReplayBufferRemote = ray.remote(ExperienceReplayBuffer)
    self.config = config
    self.n_collectors = self.config.get('n_collectors', 3)
    if self.n_collectors == 0:
        # Run everything in-process (no Ray workers).
        self.rl_context = RLContext(config=self.config, gin_config_str=gin.config_str())
        self.replay_buffer = ExperienceReplayBuffer(config=self.config, collectors=[])
    else:
        # The gin config string is passed explicitly so each remote worker
        # can re-parse it in its own process.
        self.remote_rl_contexts = [
            RLContextRemote.remote(config=self.config, gin_config_str=gin.config_str())
            for _ in range(self.n_collectors)
        ]
        self.replay_buffer = ExperienceReplayBufferRemote.remote(
            config=self.config, collectors=self.remote_rl_contexts)
    self.next_batch_refs = set()
    self.stats_ref = None
    self.future_batch_size = self.config.get('future_batch_size', 10)
    self.collect_initial()
def train_func(config):
    # Hyperparameter bindings in gin syntax ('key=value')
    bindings = []
    for key, value in config.items():
        bindings.append(f'{key}={value}')

    # generate folder structures
    run_paths = utils_params.gen_run_folder(','.join(bindings))

    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config
    gin.parse_config_files_and_bindings(['configs/config.gin'], bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test = load_from_tfrecords()

    # model
    model = TransformerS2S()
    trainer = Trainer(model, ds_train, ds_val, run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)
def descartes_builder(name='out', params=None):
    # A mutable default ([]) is avoided here to sidestep the shared-default pitfall.
    params = params or []
    print("Building grid search with parameters: ", params)
    directory = os.path.join('grids', name)
    os.makedirs(directory, exist_ok=True)

    # Collect the candidate values for every gin parameter.
    all_values = []
    all_params = []
    for param in params:
        values = gin.query_parameter(param)
        all_params.append(param)
        all_values.append(values)

    # Cartesian product over all parameter value lists; one config per grid point.
    descartes = itertools.product(*all_values)
    for i, one in enumerate(descartes):
        exp_directory = os.path.join(directory, str(i))
        os.makedirs(exp_directory, exist_ok=True)
        with gin.unlock_config():
            for param_idx in range(len(all_params)):
                gin.bind_parameter(all_params[param_idx], one[param_idx])
        config_str = gin.config_str()
        with open(os.path.join(exp_directory, 'config.gin'), 'w+') as f:
            f.write(config_str)
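# Hypothetical consumer of the grid written above: parse each saved
# config.gin in turn (run_experiment stands in for the project's entry point).
import glob
import os

import gin

for cfg in sorted(glob.glob(os.path.join('grids', 'out', '*', 'config.gin'))):
    gin.clear_config()            # drop bindings from the previous grid point
    gin.parse_config_file(cfg)    # re-bind this grid point's parameters
    run_experiment()              # hypothetical entry point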
def train_func(config):
    # Hyperparameters
    bindings = []
    for key, value in config.items():
        bindings.append(f'{key}={value}')

    # generate folder structures
    run_paths = utils_params.gen_run_folder(','.join(bindings))

    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config
    gin.parse_config_files_and_bindings(
        ['/mnt/home/repos/dl-lab-skeleton/diabetic_retinopathy/configs/config.gin'],
        bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test, ds_info = load()

    # model
    model = vgg_like(input_shape=ds_info.features["image"].shape,
                     n_classes=ds_info.features["label"].num_classes)
    trainer = Trainer(model, ds_train, ds_val, ds_info, run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)
def train_func(config):
    # Hyperparameter bindings in gin syntax ('key=value')
    bindings = []
    for key, value in config.items():
        bindings.append(f'{key}={value}')

    # generate folder structures; the run folder is named after the third binding
    run_paths = utils_params.gen_run_folder(bindings[2])

    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # gin-config; the gin dir should be replaced by your own dir
    gin.parse_config_files_and_bindings(
        [r'D:\Uni Stuttgart\Deep learning lab\Diabetic Retinopathy Detection\dl-lab-2020-team08\diabetic_retinopathy\configs\config.gin'],
        bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    train_ds, valid_ds, test_ds = datasets.load()

    # model
    model = DenseNet121(IMG_SIZE=256)
    # validate on the validation split (test_ds is reserved for final evaluation)
    trainer = Trainer(model=model, ds_train=train_ds, ds_val=valid_ds,
                      run_paths=run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)
def setup_logger():
    # Set run-specific environment configurations.
    timestamp = time.strftime("run_%Y_%m_%d_%H_%M_%S") + "_{machine}".format(
        machine=socket.gethostname())
    gin.bind_parameter(
        'multi_tasking_train.model_storage_directory',
        os.path.join(
            gin.query_parameter('multi_tasking_train.model_storage_directory'),
            timestamp))
    os.makedirs(
        gin.query_parameter('multi_tasking_train.model_storage_directory'),
        exist_ok=True)

    # File and console logging.
    log.handlers.clear()
    formatter = logging.Formatter('%(message)s')
    fh = logging.FileHandler(
        os.path.join(
            gin.query_parameter('multi_tasking_train.model_storage_directory'),
            "log.txt"))
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    log.addHandler(fh)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    log.setLevel(logging.INFO)
    log.addHandler(ch)

    # Set global GPU state.
    if torch.cuda.is_available() and gin.query_parameter(
            'multi_tasking_train.device') == 'cuda':
        log.info("Using CUDA device:{0}".format(torch.cuda.current_device()))
    else:
        if gin.query_parameter('multi_tasking_train.device') == 'cpu':
            log.info("Utilizing CPU")
        else:
            raise Exception(
                f"Unrecognized device: {gin.query_parameter('multi_tasking_train.device')}")

    # ML-Flow tracking: log all gin parameters and the full config string.
    mlflow.set_tracking_uri(
        f"{gin.query_parameter('multi_tasking_train.ml_flow_directory')}")
    mlflow.set_experiment(
        f"/{gin.query_parameter('multi_tasking_train.experiment_name')}")
    mlflow.start_run()
    gin_parameters = gin.config._CONFIG.get(list(gin.config._CONFIG.keys())[0])
    mlflow.log_params(gin_parameters)
    all_params = gin.config_str()
    with open('config_log.txt', 'w') as f:
        f.write(all_params)
    mlflow.log_artifact("config_log.txt")
    mlflow.log_artifact(__file__)
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_files,
                                        bindings=FLAGS.gin_bindings,
                                        skip_unknown=False)
    logging.info(gin.config_str())
    train_eval()
def main(argv):
    del argv
    if not hasattr(FLAGS.hparams, "items"):
        FLAGS.hparams = utils.YAMLDictParser().parse(FLAGS.hparams)
    log_dir = FLAGS.neutra_log_dir
    utils.BindHParams(FLAGS.hparams)
    if FLAGS.restore_from_config:
        with tf.io.gfile.GFile(os.path.join(log_dir, "config")) as f:
            gin.parse_config(f.read())

    tf.io.gfile.makedirs(log_dir)
    summary_writer = tf.summary.create_file_writer(log_dir, flush_millis=10000)
    summary_writer.set_as_default()
    tf.summary.experimental.set_step(0)

    # Retry up to ten times if training diverges with a NaN.
    for i in range(10):
        try:
            checkpoint_log_dir = (FLAGS.checkpoint_log_dir
                                  if FLAGS.checkpoint_log_dir else FLAGS.neutra_log_dir)
            exp = neutra.NeuTraExperiment(log_dir=checkpoint_log_dir)
            with tf.io.gfile.GFile(os.path.join(log_dir, "config"), "w") as f:
                f.write(gin.config_str())
            logging.info("Config:\n%s", gin.config_str())

            checkpoint = checkpoint_log_dir + "/model.ckpt"
            if tf.io.gfile.exists(checkpoint + ".index"):
                logging.info("Restoring from %s", checkpoint)
                exp.checkpoint.restore(checkpoint)
            with utils.use_xla(False):
                if FLAGS.mode == "train":
                    Train(exp)
                elif FLAGS.mode == "objective":
                    TuneObjective(exp)
                elif FLAGS.mode == "benchmark":
                    Benchmark(exp)
                elif FLAGS.mode == "eval":
                    Eval(exp)
            break
        except tf.errors.InvalidArgumentError as e:
            if "NaN" in e.message:
                logging.error(e.message)
                logging.error("Got a NaN, try: %d", i)
            else:
                raise e
def write_config(experiment_name):
    """Write the current gin config to the experiment's output directory."""
    base_path = os.getenv('AICROWD_OUTPUT_PATH', '../scratch/shared')
    path = os.path.join(base_path, experiment_name)
    if not os.path.exists(path):
        os.mkdir(path)
    with open(os.path.join(path, 'conf.gin'), 'w') as f:
        f.write(gin.config_str())
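# Since write_config stores the full config string, a run can be reproduced
# later with gin.parse_config. Minimal sketch, assuming the same directory
# layout and a hypothetical experiment name:
import os

import gin

base_path = os.getenv('AICROWD_OUTPUT_PATH', '../scratch/shared')
with open(os.path.join(base_path, 'my_experiment', 'conf.gin')) as f:
    gin.parse_config(f.read())  # re-binds every parameter recorded at save time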
def _init_tboard_logging(self):
    self.summary_writer = SummaryWriter(
        log_dir=module_dirs.get_current_tboard_dir(),
        comment=self.agent_name,
    )
    # Add the config string to the summary. Markdown needs two trailing spaces
    # before '\n' for a line break, see
    # https://stackoverflow.com/questions/45016458/tensorflow-tf-summary-text-and-linebreaks
    config_str = gin.config_str()
    config_str = config_str.replace('\n', '  \n')
    self.summary_writer.add_text(tag='gin_config', text_string=config_str)
def tuning(config):
    # set hyperparameters as gin bindings ('key=value')
    bindings = []
    for key, value in config.items():
        bindings.append('{}={}'.format(str(key), str(value)))

    # generate folder structures
    run_paths = utils_params.gen_run_folder(','.join(bindings))

    # gin-config (config_path, model_name and model_type are presumably
    # module-level settings in the original source)
    gin.parse_config_files_and_bindings([config_path], bindings)
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    ds_train, ds_val, ds_test, ds_info = datasets.load(model_type=model_type)

    # setup model
    if model_name == 'VGG16':
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified Inception':
        model = simplified_inception(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified SEResNeXt':
        model = simplified_seresnext(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'RepVGG':
        model = rep_vgg(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'DenseNet201':
        model = densenet201(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'EfficientNetB3':
        model = efficientnetb3(input_shape=(256, 256, 3), model_type=model_type)
    else:
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)

    # set training loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)

    # train the model
    trainer = Trainer(model, ds_train, ds_val, ds_info,
                      model_type=model_type, run_paths=run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy * 100)

    # set validation loggers
    utils_misc.set_loggers(run_paths['path_logs_eval'], logging.INFO)

    # evaluate the model
    trained_model = trainer.model_output()
    if model_type == 'regression':
        trained_model.compile(optimizer=tf.keras.optimizers.Adam(),
                              loss=tf.keras.losses.Huber(delta=0.3),
                              metrics=[BinaryAccuracy(model_type=model_type)])
    elif model_type == 'binary_classification':
        trained_model.compile(optimizer=tf.keras.optimizers.Adam(),
                              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                              metrics=[BinaryAccuracy(model_type=model_type)])
    elif model_type == 'multi_classification':
        trained_model.compile(optimizer=tf.keras.optimizers.Adam(),
                              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                              metrics=[BinaryAccuracy(model_type=model_type)])
    result = trained_model.evaluate(ds_test, return_dict=True)
    test_accuracy = result['binary_accuracy']
    tune.report(test_accuracy=test_accuracy * 100)
def __getstate__(self):
    # Snapshot plainly picklable attributes, module weights, and the full
    # gin config so the object can be reconstructed elsewhere.
    result = {
        k: getattr(self, k)
        for k in self.PICKLE_DIRECTLY if hasattr(self, k)
    }
    result['trainables_weights'] = {
        k: v.state_dict()
        for k, v in self.trainables.items()
    }
    result['gin_config'] = gin.config_str()
    return result
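# A __setstate__ counterpart would restore the same three pieces. Minimal
# sketch under the attribute layout assumed above; it presumes
# self.trainables has already been rebuilt (pickle does not call __init__):
def __setstate__(self, state):
    # Restore the directly pickled attributes.
    for k in self.PICKLE_DIRECTLY:
        if k in state:
            setattr(self, k, state[k])
    # Re-bind the gin configuration captured at save time.
    with gin.unlock_config():
        gin.parse_config(state['gin_config'])
    # Reload module weights; assumes torch-style modules in self.trainables.
    for k, weights in state['trainables_weights'].items():
        self.trainables[k].load_state_dict(weights)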
def load_config(save_config=True):
    """Loads config."""
    gin.parse_config_files_and_bindings(flags.FLAGS.gin_configs,
                                        flags.FLAGS.gin_bindings,
                                        skip_unknown=True)
    config = Config()
    # Only the first JAX host writes the config to disk.
    if save_config and jax.host_id() == 0:
        os.makedirs(config.checkpoint_dir)
        with open(config.checkpoint_dir + '/config.gin', 'w') as f:
            f.write(gin.config_str())
    return config
def _write_gin_configs(output_file, operative=True):
    """Writes current gin configs to `output_file`."""
    if operative:
        config_str = gin.operative_config_str()
    else:
        config_str = gin.config_str()
    logging.info('=' * 80)
    logging.info('Gin configs\n%s', config_str)
    logging.info('=' * 80)
    with tf.io.gfile.GFile(output_file, 'w') as f:
        f.write(config_str)
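# The `operative` switch matters: gin.config_str() serializes everything that
# was parsed, while gin.operative_config_str() keeps only the bindings actually
# consumed by configurables that ran. Illustrative calls (paths hypothetical):
_write_gin_configs('/tmp/config.full.gin', operative=False)       # full parsed config
_write_gin_configs('/tmp/config.operative.gin', operative=True)   # used bindings only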
def main(argv):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param,
                                        skip_unknown=True)
    print("Gin parameter bindings:\n{}".format(gin.config_str()))

    use_neptune = "NEPTUNE_API_TOKEN" in os.environ
    exp_id = ''
    if use_neptune:
        neptune.init(project_qualified_name='bbeatrix/curl')
        exp = neptune.create_experiment(
            params=gin_config_to_dict(gin.config_str()),
            name=FLAGS.gin_file[0].split('/')[-1][:-4],
            upload_source_files=['./*.py'])
        exp_id = exp.id
    else:
        neptune.init('shared/onboarding', 'ANONYMOUS',
                     backend=neptune.OfflineBackend())

    neptune.log_text('gin_config', gin.config_str())
    # NOTE: assumes a single gin file was passed on the command line.
    neptune.log_artifact(*FLAGS.gin_file, 'gin_config_{}.gin'.format(exp_id))

    exp_manager = ExperimentManager(prefix=exp_id)
    exp_manager.run_experiment()
    neptune.stop()
    print("Fin")
def main(argv):
    del argv  # Unused.
    gin.parse_config_files_and_bindings(FLAGS.gin_configs, FLAGS.gin_bindings)
    tf.io.gfile.makedirs(FLAGS.summary_dir)
    save_gin_config(gin.config_str(),
                    os.path.join(FLAGS.summary_dir, "config.parsed.gin"))
    if FLAGS.run_train:
        trainer.train(ckpt_dir=FLAGS.ckpt_dir, summary_dir=FLAGS.summary_dir)
    if FLAGS.run_eval:
        evaluator.evaluate(ckpt_dir=FLAGS.ckpt_dir, summary_dir=FLAGS.summary_dir)
    save_gin_config(gin.operative_config_str(),
                    os.path.join(FLAGS.summary_dir, "config.operative.gin"))
def write_gin_config(output_folder, swp=''):
    if swp != '':
        # Turn the sweep spec into a filesystem-friendly suffix.
        swp = swp.lower()
        for c in " {}[]'":
            swp = swp.replace(c, '')
        for c in ",:":
            swp = swp.replace(c, '_')
        output_folder += f':{swp}'
    os.makedirs(output_folder, exist_ok=True)
    gin.bind_parameter('output_folder._output_folder', os.path.abspath(output_folder))
    output_path = os.path.join(output_folder, 'config.gin')
    with open(output_path, 'w') as f:
        logging.info(f'{swp} -> {output_path}')
        f.write(gin.config_str())
    return output_path
def main(argv):
    del argv
    save_dir = FLAGS.save_dir
    if FLAGS.index is not None:
        save_dir = os.path.join(save_dir, str(FLAGS.index))
    logging.info('SAVE DIR: %s', save_dir)
    gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_bindings)
    logging.info('CONFIG DIRS: %s', str(FLAGS.gin_config))
    gfile.makedirs(save_dir)
    with gfile.GFile(os.path.join(save_dir, 'config.gin'), 'w') as f:
        f.write(gin.config_str())
    train(save_dir)
def main(argv):
    # generate folder structures
    run_paths = utils_params.gen_run_folder()
    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)
    # gin-config
    gin.parse_config_files_and_bindings(['configs/config.gin'], [])
    utils_params.save_config(run_paths['path_gin'], gin.config_str())
    # setup pipeline
    ds_train, ds_val, ds_test = load_tfrecords.load_from_tfrecords()
    # print number of available GPUs
    print("Num GPUs Available: ",
          len(tf.config.experimental.list_physical_devices('GPU')))

    if FLAGS.train:
        model = TransformerS2S()
        model.build((None, 250, 6))
        model.summary()
        trainer = Trainer(model, ds_train, ds_val, run_paths)
        for _ in trainer.train():
            continue
    else:
        # restore a fully trained model for evaluation
        opt = tf.keras.optimizers.Adam()
        model = TransformerS2S()
        ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=model)
        # change ckpt dir to load the ckpt you want
        manager = tf.train.CheckpointManager(
            ckpt,
            "/content/drive/MyDrive/experiments/run_2021-01-24T13-52-22-787253/ckpts",
            max_to_keep=3)
        ckpt.restore(manager.latest_checkpoint)
        print("Restored from {}".format(manager.latest_checkpoint))
        evaluate(model, ds_test)
def run():
    """Run the beam pipeline to create synthetic dataset."""
    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options)
    with beam.Pipeline(options=pipeline_options) as pipeline:
        for gin_search_path in [GIN_PATH] + FLAGS.gin_search_path:
            gin.add_config_file_search_path(gin_search_path)
        gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param,
                                            skip_unknown=True)
        np.random.seed(FLAGS.random_seed)
        _ = (pipeline
             | beam.Create(np.random.randint(2**32, size=FLAGS.num_examples))
             | beam.ParDo(GenerateExampleFn(gin.config_str()))
             | beam.Reshuffle()
             | beam.Map(_float_dict_to_tfexample)
             | beam.io.tfrecordio.WriteToTFRecord(
                 FLAGS.output_tfrecord_path,
                 num_shards=FLAGS.num_shards,
                 coder=beam.coders.ProtoCoder(tf.train.Example)))
def main(argv):
    # generate folder structures
    run_paths = utils_params.gen_run_folder(folder)
    # gin-config
    gin.parse_config_files_and_bindings(['configs/config.gin'], [])
    utils_params.save_config(run_paths['path_gin'], gin.config_str())
    # setup pipeline
    ds_train, ds_val, ds_test, ds_info = datasets.load(model_type=model_type)

    # setup model
    if model_name == 'VGG16':
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified Inception':
        model = simplified_inception(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'Simplified SEResNeXt':
        model = simplified_seresnext(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'RepVGG':
        model = rep_vgg(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'DenseNet201':
        model = densenet201(input_shape=(256, 256, 3), model_type=model_type)
    elif model_name == 'EfficientNetB3':
        model = efficientnetb3(input_shape=(256, 256, 3), model_type=model_type)
    else:
        model = vgg_like(input_shape=(256, 256, 3), model_type=model_type)
    model.summary()

    if FLAGS.train:
        # set training loggers
        utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)
        # train the model
        trainer = Trainer(model, ds_train, ds_val, ds_info,
                          model_type=model_type, run_paths=run_paths)
        for _ in trainer.train():
            continue
    else:
        # set validation loggers
        utils_misc.set_loggers(run_paths['path_logs_eval'], logging.INFO)
        # evaluate the model
        evaluate(model, ds_test, ds_info, model_type=model_type, run_paths=run_paths)
def preprocess(
    target_processor: DataProcessor.__class__,
    output_dir: str,
    ignore_asserts: bool,
    random_seed=None,
):
    os.makedirs(output_dir, exist_ok=True)
    setup_logger(output_dir, target_processor.__name__)
    LOGGER.info("GOT config: \n======config======\n %s \n========config======="
                % gin.config_str())
    if random_seed is not None:
        LOGGER.info('Setting random seed to %d', random_seed)
        seed_everything(random_seed)

    for data_df, basename in parse():
        LOGGER.info("[Preprocess]: started processing a df with %d rows:" % len(data_df))
        processor: DataProcessor = target_processor(data_df=data_df, output_dir=output_dir)
        generator = processor.generate_chunks_iterable()
        preprocessed_chunks = []
        try:
            for (idx, df_chunk) in tqdm(generator):
                try:
                    data_chunk = processor.construct_chunk(df_chunk)
                except AssertionError as ex:
                    if ignore_asserts:
                        LOGGER.warning("GOT ASSERT %r on idx %d" % (ex, idx))
                        continue
                    else:
                        raise ex
                preprocessed_chunks.append(
                    processor.preprocess_chunk(chunk=data_chunk, idx=basename))
        except KeyboardInterrupt:
            LOGGER.warning("BREAKING by interrupt. got %d processed chunks"
                           % len(preprocessed_chunks))
        processed_data = processor.postprocess_chunks(preprocessed_chunks)
        processor.save_on_disk(processed_data)
def main(argv):
    # generate folder structures
    run_paths = utils_params.gen_run_folder()
    # set loggers
    utils_misc.set_loggers(run_paths['path_logs_train'], logging.INFO)
    # gin-config
    gin.parse_config_files_and_bindings([
        r'D:\Uni Stuttgart\Deep learning lab\Diabetic Retinopathy Detection\dl-lab-2020-team08\diabetic_retinopathy\configs\config.gin'
    ], [])
    utils_params.save_config(run_paths['path_gin'], gin.config_str())
    # setup pipeline
    train_ds, valid_ds, test_ds = datasets.load()

    if FLAGS.train:
        # training including fine-tuning
        model = DenseNet121(IMG_SIZE=256)
        model.summary()
        trainer = Trainer(model=model, ds_train=train_ds, ds_val=valid_ds,
                          run_paths=run_paths)
        for _ in trainer.train():
            continue
    else:
        # evaluation; model dir should be replaced by the saved model dir
        model_dir = r"\diabetic_retinopathy\logs\20201221-225335\saved_model_ft"
        model = tf.keras.models.load_model(model_dir)
        evaluate(model, valid_ds)
def main(_):
    # https://github.com/google-research/text-to-text-transfer-transformer/blob/c0ea75dbe9e35a629ae2e3c964ef32adc0e997f3/t5/models/mesh_transformer_main.py#L149
    # Add search path for gin files stored in package.
    gin.add_config_file_search_path(
        pkg_resources.resource_filename(__name__, "gin"))
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param,
                                        finalize_config=True)
    pl.seed_everything(1234)

    with gin.config_scope('sroie_t5_baseline'):
        task_functions_maps = get_tasks_functions_maps()

    # Datasets
    with gin.config_scope('train_sroie'):
        train_keynames = get_all_keynames_from_dir()
    with gin.config_scope('validation_sroie'):
        val_keynames = get_all_keynames_from_dir()
    train_datasets = get_datasets_dict_from_task_functions_map(
        keynames=train_keynames, tasks_functions_maps=task_functions_maps)
    val_datasets = get_datasets_dict_from_task_functions_map(
        keynames=val_keynames, tasks_functions_maps=task_functions_maps)
    with gin.config_scope('task_train'):
        task_train = operative_macro()

    # Initializing model
    model = T5OCRBaseline()

    # Trainer
    if FLAGS.debug:
        logger = False
        trainer_callbacks = []
    else:
        logger = NeptuneLogger(
            close_after_fit=False,
            api_key=os.environ["NEPTUNE_API_TOKEN"],
            # project_name is set via gin file
            # params=None,
            tags=[model.t5_model_prefix, task_train, 't5_ocr_baseline'])
        with gin.config_scope('sroie_t5_baseline'):
            checkpoint_callback = config_model_checkpoint(
                monitor=None if FLAGS.best_model_run_mode else "val_f1",
                dirpath=("/home/marcospiau/final_project_ia376j/checkpoints/"
                         f"{logger.project_name.replace('/', '_')}/"
                         "t5_ocr_baseline/"),
                prefix=(f"experiment_id={logger.experiment.id}-task={task_train}-"
                        "t5_model_prefix="
                        f"{model.t5_model_prefix.replace('-', '_')}"),
                filename=("{step}-{epoch}-{val_precision:.6f}-{val_recall:.6f}"
                          "-{val_f1:.6f}-{val_exact_match:.6f}"),
                mode="max",
                save_top_k=None if FLAGS.best_model_run_mode else 1,
                verbose=True)
            early_stop_callback = config_early_stopping_callback()
        trainer_callbacks = [checkpoint_callback, early_stop_callback]

    trainer = Trainer(
        checkpoint_callback=not FLAGS.debug,
        log_gpu_memory=True,
        # profiler=FLAGS.debug,
        logger=logger,
        callbacks=trainer_callbacks,
        progress_bar_refresh_rate=1,
        log_every_n_steps=1)

    # Dataloaders
    train_loader_kwargs = {
        'num_workers': mp.cpu_count(),
        'shuffle': trainer.overfit_batches == 0,
        'pin_memory': True
    }
    if trainer.overfit_batches != 0:
        with gin.unlock_config():
            gin.bind_parameter(
                'get_dataloaders_dict_from_datasets_dict.batch_size', 1)
    eval_loader_kwargs = {**train_loader_kwargs, **{'shuffle': False}}
    train_dataloaders = get_dataloaders_dict_from_datasets_dict(
        datasets_dict=train_datasets, dataloader_kwargs=train_loader_kwargs)
    val_dataloaders = get_dataloaders_dict_from_datasets_dict(
        datasets_dict=val_datasets, dataloader_kwargs=eval_loader_kwargs)

    # Logging important artifacts and params
    if logger:
        to_upload = {
            'gin_operative_config.gin': gin.operative_config_str(),
            'gin_complete_config.gin': gin.config_str(),
            'abseil_flags.txt': FLAGS.flags_into_string()
        }
        for destination, content in to_upload.items():
            buffer = StringIO(initial_value=content)
            buffer.seek(0)
            logger.log_artifact(buffer, destination=destination)
        params_to_log = dict()
        params_to_log['str_replace_newlines'] = gin.query_parameter(
            'sroie_t5_baseline/get_default_preprocessing_functions.'
            'str_replace_newlines')
        params_to_log['task_train'] = task_train
        params_to_log['patience'] = early_stop_callback.patience
        params_to_log['max_epochs'] = trainer.max_epochs
        params_to_log['min_epochs'] = trainer.min_epochs
        params_to_log['accumulate_grad_batches'] = trainer.accumulate_grad_batches
        params_to_log['batch_size'] = train_dataloaders[task_train].batch_size
        for k, v in params_to_log.items():
            logger.experiment.set_property(k, v)

    trainer.fit(model,
                train_dataloader=train_dataloaders[task_train],
                val_dataloaders=val_dataloaders[task_train])

    # Logging best metrics and saving best checkpoint on Neptune experiment
    if logger:
        trainer.logger.experiment.log_text(
            log_name='best_model_path',
            x=trainer.checkpoint_callback.best_model_path)
        if not FLAGS.best_model_run_mode:
            trainer.logger.experiment.log_metric(
                'best_model_val_f1',
                trainer.checkpoint_callback.best_model_score.item())
        if FLAGS.upload_best_checkpoint:
            trainer.logger.experiment.log_artifact(
                trainer.checkpoint_callback.best_model_path)
        trainer.logger.experiment.stop()
def get_options_dict(n_epochs=None,
                     log_dir=gin.REQUIRED,
                     log_name=gin.REQUIRED,
                     trn_files=gin.REQUIRED,
                     tst_files=gin.REQUIRED,
                     input_shape=gin.REQUIRED,
                     test_only=False,
                     restore=None,
                     restore_resnet_features=None,
                     original_transnet=False,
                     transition_only_trn_files=None,
                     create_dir_and_summaries=True,
                     transition_only_data_fraction=0.3,
                     c3d_net=False,
                     bi_tempered_loss=False,
                     bi_tempered_loss_temp2=1.,
                     learning_rate_schedule=None,
                     learning_rate_decay=None):
    # Expand glob patterns into concrete file lists.
    trn_files_ = []
    for fn in trn_files:
        trn_files_.extend(glob.glob(fn))

    if transition_only_trn_files is not None:
        transition_trn_files_ = []
        for fn in transition_only_trn_files:
            transition_trn_files_.extend(glob.glob(fn))

    tst_files_ = {}
    for k, v in tst_files.items():
        tst_files_[k] = []
        for fn in v:
            tst_files_[k].extend(glob.glob(fn))

    log_dir = os.path.join(
        log_dir, log_name + "_" + datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"))
    summary_writer = (tf.summary.create_file_writer(log_dir)
                      if create_dir_and_summaries else None)

    # Reformat the gin config so it renders as markdown in TensorBoard
    # (comments become headings, separator lines are dropped).
    config_str = gin.config_str().replace("# ", "### ").split("\n")
    config_str = "\n\n".join([l for l in config_str if not l.startswith("### =====")])

    if create_dir_and_summaries:
        with summary_writer.as_default():
            tf.summary.text("config", config_str, step=0)
        with open(os.path.join(log_dir, "config.gin"), "w") as f:
            f.write(gin.config_str())

    print("\n{}\n".format(log_name.upper()))
    return {
        "n_epochs": n_epochs,
        "log_dir": log_dir,
        "summary_writer": summary_writer,
        "trn_files": trn_files_,
        "tst_files": tst_files_,
        "input_shape": input_shape,
        "test_only": test_only,
        "restore": restore,
        "restore_resnet_features": restore_resnet_features,
        "original_transnet": original_transnet,
        "transition_only_trn_files":
            transition_trn_files_ if transition_only_trn_files is not None else None,
        "transition_only_data_fraction": transition_only_data_fraction,
        "c3d_net": c3d_net,
        "bi_tempered_loss": bi_tempered_loss,
        "bi_tempered_loss_temp2": bi_tempered_loss_temp2,
        "learning_rate_schedule": learning_rate_schedule,
        "learning_rate_decay": learning_rate_decay,
    }
def train(experiment_name,
          ml_flow_directory,
          data_directory,
          ablation_amount,
          transformer_weights,
          use_pretrained_heads,
          model_storage_directory,
          device,
          learning_rate,
          seed,
          repeat_in_epoch_sampling,
          evaluation_interval=1,
          checkpoint_interval=1,
          shuffle=True,
          num_workers=0,
          num_epochs=1,
          transformer_hidden_size=768,
          transformer_dropout_prob=.1,
          eval_on_dev=True,
          write_predictions=False):
    """
    Args:
        eval_on_dev: If True, evaluation is performed on the development dataset
            during training/fine-tuning.
        write_predictions: If True, model output predictions and probabilities
            will be written to file.
        ablation_amount (float): the proportion of examples to ablate from the
            training dataset, leaving 1 - ablation_amount remaining.
    """
    log.info(gin.config_str())
    mlflow.set_tag('ablation', ablation_amount)
    torch.random.manual_seed(seed)
    ablation_amount = float(ablation_amount)  # type check this

    heads_and_datasets = prepare_datasets(
        ablation_amount, data_directory, num_workers, shuffle,
        transformer_dropout_prob, transformer_hidden_size,
        eval_on_dev=True)  # uses dev set for training eval
    validation_heads_and_dataloaders = prepare_datasets(
        ablation_amount, data_directory, num_workers, shuffle,
        transformer_dropout_prob, transformer_hidden_size,
        eval_on_dev=False)  # uses test set for training eval
    validation_heads_and_dataloaders = [
        (head, training_set, test_set)
        for (head, training_set, _), (_, _, test_set) in zip(
            heads_and_datasets, validation_heads_and_dataloaders)
    ]
    print(f'Data loaded. Begin Training {len(heads_and_datasets)} Tasks.')
    print(f'ABLATION AMOUNT {ablation_amount} '
          f'({(1 - ablation_amount) * 100} percent of data retained)')

    # TRAIN MODEL
    heads = [head for head, _, _ in heads_and_datasets]
    mlflow.set_tag('number_tasks', str(len(heads)))
    mtb = MultiTaskingBert(heads,
                           model_storage_directory=model_storage_directory,
                           transformer_weights=transformer_weights,
                           device=device,
                           learning_rate=learning_rate,
                           use_pretrained_heads=use_pretrained_heads,
                           write_predictions=write_predictions,
                           time_stamp=time.strftime("%Y%m%d%H%M%S"))
    # heads = [(head, test_loader) for head, _, test_loader in validation_heads_and_dataloaders]
    # mtb.predict(heads, partition="dev")
    mtb.fit(heads_and_datasets,
            num_epochs=num_epochs,
            evaluation_interval=evaluation_interval,
            checkpoint_interval=checkpoint_interval,
            repeat_in_epoch_sampling=repeat_in_epoch_sampling,
            validation_heads_and_dataloaders=validation_heads_and_dataloaders)
    train(experiment_dir, model, optimizer, train_loader, val_loader, loss_fn,
          metric_fns, num_epochs, device, save_every, phases, dump_predictions)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', required=True, help='Configuration file')
    parser.add_argument('-p', '--parameter', action='append',
                        help='Override parameters (\'parameter=value\')')
    args = parser.parse_args()

    experiment_name = '{0:%Y-%m-%d-%H-%M-%S}'.format(datetime.now())
    experiment_dir = path.join('.', 'experiments', experiment_name)
    os.makedirs(path.join(experiment_dir, 'weights'))

    # Load configuration file if specified
    gin.parse_config_file(args.config)
    if args.parameter:
        gin.parse_config(args.parameter)
    with open(path.join(experiment_dir, 'config.gin'), 'w') as f:
        f.write(gin.config_str())

    main(experiment_dir)
def train(params: AttrDict) -> Any:
    """Main training function."""
    torch.manual_seed(params.seed)  # type: ignore
    np.random.seed(params.seed)

    mfpNet = import_mfp_net(params)

    batch_size = 1
    data_hz = 10
    ns_between_samples = (1.0 / data_hz) * 1e9
    d_s = params.subsampling
    t_h = params.hist_len_orig_hz
    t_f = params.fut_len_orig_hz

    NUM_WORKERS = 1
    ROOT_PATH = 'multiple_futures_prediction/'
    DATA_PATH = os.path.join(ROOT_PATH, 'carla_data_cfo')
    if params.scenario == 0:
        DATASET_DIR = os.path.join(DATA_PATH, 'Left_Turn_Dataset')
    elif params.scenario == 1:
        DATASET_DIR = os.path.join(DATA_PATH, 'Overtake_Dataset')
    elif params.scenario == 2:
        DATASET_DIR = os.path.join(DATA_PATH, 'Right_Turn_Dataset')
    else:
        raise ValueError(params.scenario)
    print("Loading dataset from:", str(os.path.join(DATASET_DIR, 'train')))

    # Loading the dataset.
    train_set = CarlaDataset(str(os.path.join(DATASET_DIR, 'train')), t_h, t_f, d_s,
                             params.encoder_size, params.use_gru, params.self_norm,
                             params.data_aug, params.use_context,
                             params.nbr_search_depth, rotate_fut=params.rotate_pov)
    val_set = CarlaDataset(str(os.path.join(DATASET_DIR, 'train')),  # NOTE: using train for val
                           t_h, t_f, d_s, params.encoder_size, params.use_gru,
                           params.self_norm, params.data_aug, params.use_context,
                           params.nbr_search_depth, rotate_fut=params.rotate_pov)
    train_data_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                                   num_workers=NUM_WORKERS,
                                   collate_fn=train_set.collate_fn, drop_last=True)
    val_data_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                                 num_workers=NUM_WORKERS,
                                 collate_fn=val_set.collate_fn, drop_last=True)

    # Compute or load existing mean over future trajectories.
    y_mean = get_mean(train_data_loader)

    # Initialize network.
    net = mfpNet(params)
    if params.use_cuda:
        net = net.cuda()  # type: ignore
    net.y_mean = y_mean
    y_mean = torch.tensor(net.y_mean)

    if params.log:
        logger_file, logging_dir = setup_logger(ROOT_PATH + "./checkpts/", 'CARLA')
        # Save the gin config used in the checkpoint dir.
        f = os.path.join(logging_dir, 'config.gin')
        print("Saving gin params to", f)
        with open(f, 'w') as fh:
            fh.write(gin.config_str())

    train_loss = []  # removed typing for oatomobile / py35 compat
    val_loss = []
    MODE = 'Pre'  # For efficiency, we first pre-train w/o interactive rollouts.
    num_updates = 0
    optimizer = None

    # Save a checkpoint of the initial (untrained) model.
    if params.log:
        msg_str = '\nSaving state, update iter:%d %s' % (num_updates, logging_dir)
        print(msg_str)
        logger_file.write(msg_str)
        logger_file.flush()
        torch.save(net.state_dict(),
                   logging_dir + '/checkpoints/carla_%06d' % num_updates + '.pth')

    for epoch_num in range(20):
        if MODE == 'EndPre':
            MODE = 'Train'
            print('Training with interactive rollouts.')
            bStepByStep = True
        else:
            print('Pre-training without interactive rollouts.')
            bStepByStep = False

        # Average losses.
        avg_tr_loss = 0.
        avg_tr_time = 0.
        loss_counter = 0.0

        for i, data in enumerate(train_data_loader):
            if num_updates > params.pre_train_num_updates and MODE == 'Pre':
                MODE = 'EndPre'
                break

            # Implements the decaying noise on the NLL mentioned in email correspondence.
            if params.log_posterior_unnorm_noise:
                if num_updates < params.init_noise_iters:
                    nll_noise = params.init_noise_value
                else:
                    nll_noise_fac = np.power(0.1, num_updates // params.updates_div_by_10)
                    nll_noise = max(params.final_noise_value,
                                    params.init_noise_value * nll_noise_fac)
            else:
                nll_noise = 0.0

            # Step-decayed learning rate; the optimizer is rebuilt when lr changes.
            lr_fac = np.power(0.1, num_updates // params.updates_div_by_10)
            lr = max(params.min_lr, params.lr_init * lr_fac)
            if optimizer is None:
                optimizer = torch.optim.Adam(net.parameters(), lr=lr)
            elif lr != optimizer.defaults['lr']:
                optimizer = torch.optim.Adam(net.parameters(), lr=lr)

            st_time = time.time()
            # NOTE: 'mask' appears twice in the original unpacking; the second
            # value overwrites the first.
            hist, nbrs, mask, fut, mask, context, yaws, nbrs_info = data
            if params.remove_y_mean:
                fut = fut - y_mean.unsqueeze(1)
            if params.use_cuda:
                hist = hist.cuda()
                nbrs = nbrs.cuda()
                mask = mask.cuda()
                fut = fut.cuda()
                if context is not None:
                    context = context.cuda()
                if yaws is not None:
                    yaws = yaws.cuda()

            # Forward pass.
            visualize = False
            if params.no_atten_model:
                # New version of model (model_simple) supports rotating with yaws.
                fut_preds, modes_pred = net.forward_mfp(
                    hist, nbrs, mask, context, nbrs_info, fut, bStepByStep,
                    use_forcing=params.use_forcing,  # NOTE missing from ngsim
                    visualize=visualize, yaws=yaws, rotate_hist=params.rotate_pov)
            else:
                fut_preds, modes_pred = net.forward_mfp(
                    hist, nbrs, mask, context, nbrs_info, fut, bStepByStep,
                    use_forcing=params.use_forcing,  # NOTE missing from ngsim
                    visualize=visualize)

            if params.modes == 1:
                if nll_noise != 0.0:
                    raise ValueError("k=1 does not support non-zero noise (%f)" % nll_noise)
                l = nll_loss(fut_preds[0], fut, mask)
            else:
                l = nll_loss_multimodes(fut_preds, fut, mask, modes_pred,
                                        noise=nll_noise)  # type: ignore

            # Backprop.
            optimizer.zero_grad()
            l.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 10)  # type: ignore
            optimizer.step()
            num_updates += 1
            batch_time = time.time() - st_time
            avg_tr_loss += l.item()
            avg_tr_time += batch_time
            effective_batch_sz = float(hist.shape[1])

            if num_updates % params.iter_per_err == params.iter_per_err - 1:
                print("Epoch no:", epoch_num, "update:", num_updates,
                      "| Avg train loss:", format(avg_tr_loss / 100, '0.4f'),
                      "learning_rate:%.5f" % lr, "nll_noise:%.5f" % nll_noise)
                train_loss.append(avg_tr_loss / 100)
                if params.log:
                    msg_str_ = ("Epoch no:", epoch_num, "update:", num_updates,
                                "| Avg train loss:", format(avg_tr_loss / 100, '0.4f'),
                                "learning_rate:%.5f" % lr, "nll_noise:%.5f" % nll_noise)
                    msg_str = str([str(ss) for ss in msg_str_])
                    logger_file.write(msg_str + '\n')
                    logger_file.flush()
                avg_tr_loss = 0.

            if num_updates % params.iter_per_eval == params.iter_per_eval - 1:
                print("Starting eval")
                val_nll_err = eval('nll', net, params, val_data_loader, bStepByStep,
                                   use_forcing=params.use_forcing, y_mean=y_mean,
                                   num_batches=500, dataset_name='val_dl nll')
                if params.log:
                    logger_file.write('val nll: ' + str(val_nll_err) + '\n')
                    logger_file.flush()

            # Save weights.
            if params.log and num_updates % params.iters_per_save == params.iters_per_save - 1:
                msg_str = '\nSaving state, update iter:%d %s' % (num_updates, logging_dir)
                print(msg_str)
                logger_file.write(msg_str)
                logger_file.flush()
                torch.save(net.state_dict(),
                           logging_dir + '/checkpoints/carla_%06d' % num_updates + '.pth')
def train(experiment_name,
          ml_flow_directory,
          transformer_weights,
          model_storage_directory,
          device,
          repeat_in_epoch_sampling,
          learning_rate,
          seed,
          evaluation_interval=1,
          checkpoint_interval=1,
          shuffle=True,
          num_workers=1,
          num_epochs=1,
          transformer_hidden_size=768,
          transformer_dropout_prob=.1):
    log.info(gin.config_str())
    torch.random.manual_seed(seed)

    heads_and_datasets = []
    load_clinical_configured_tasks()
    print("MT training with the following tasks:")
    pprint(TASKS)
    for task in TASKS:
        for dataset in TASKS[task]:
            train_dataset = DATASETS[task](TASKS[task][dataset]['train'])
            test_dataset = DATASETS[task](TASKS[task][dataset]['test'])
            labels = train_dataset.entity_labels if hasattr(
                train_dataset, 'entity_labels') else None
            if hasattr(train_dataset, 'class_labels'):
                labels = train_dataset.class_labels
            head = HEADS[TASKS[task][dataset]['head']](
                dataset,
                labels=labels,
                hidden_size=transformer_hidden_size,
                hidden_dropout_prob=transformer_dropout_prob)
            if TASKS[task][dataset]['head'] == 'subword_classification':
                if 'evaluate_biluo' in TASKS[task][dataset]:
                    head.config.evaluate_biluo = TASKS[task][dataset]['evaluate_biluo']
                else:
                    head.config.evaluate_biluo = False
            heads_and_datasets.append(
                (head,
                 DataLoader(train_dataset,
                            batch_size=TASKS[task][dataset]['batch_size'],
                            num_workers=num_workers),
                 DataLoader(test_dataset,
                            batch_size=TASKS[task][dataset]['batch_size'],
                            shuffle=shuffle,
                            num_workers=num_workers)))

    heads = [head for head, _, _ in heads_and_datasets]
    mlflow.set_tag('number_tasks', str(len(heads)))
    mtb = MultiTaskingBert(heads,
                           model_storage_directory=model_storage_directory,
                           transformer_weights=transformer_weights,
                           device=device,
                           learning_rate=learning_rate)
    mtb.fit(heads_and_datasets,
            num_epochs=num_epochs,
            evaluation_interval=evaluation_interval,
            checkpoint_interval=checkpoint_interval,
            repeat_in_epoch_sampling=repeat_in_epoch_sampling)
def main(argv):
    """
    Run evaluation script.

    :param argv: Command line arguments.
    """
    # Configure information displayed to terminal.
    np.set_printoptions(precision=2)
    warnings.filterwarnings("ignore")

    # Set up the result directory.
    run_dir = settings.get_run_dir()
    if osp.exists(run_dir):
        print("Cannot resume previously saved run, overwriting data.")
    else:
        os.mkdir(run_dir)

    # Set up logging.
    logger = logging.getLogger("attackgraph")
    logger.setLevel(logging.INFO)
    logger.propagate = False
    logger.handlers = []  # absl has a default handler that we need to remove.
    formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
    # Log to terminal.
    terminal_handler = logging.StreamHandler()
    terminal_handler.setFormatter(formatter)
    # Log to file.
    file_handler = logging.FileHandler(osp.join(run_dir, "out.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    # Debug output.
    debug_handler = logging.FileHandler(osp.join(run_dir, "debug.log"))
    debug_handler.setLevel(logging.DEBUG)
    debug_handler.setFormatter(formatter)
    # Register handlers.
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)
    logger.addHandler(debug_handler)
    logger.info(f"Saving results to: {run_dir}")

    # Set up gin configuration.
    gin_files = [
        osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
        for x in FLAGS.config_files
    ]
    gin.parse_config_files_and_bindings(config_files=gin_files,
                                        bindings=FLAGS.config_overrides,
                                        skip_unknown=False)

    # Save program flags.
    with open(osp.join(run_dir, "flags.txt"), "w") as flag_file:
        # We want only flags relevant to this module to be saved, no extra flags.
        # See: https://github.com/abseil/abseil-py/issues/92
        key_flags = FLAGS.get_key_flags_for_module(argv[0])
        key_flags = "\n".join(flag.serialize() for flag in key_flags)
        flag_file.write(key_flags)
    with open(osp.join(run_dir, "config.txt"), "w") as config_file:
        config_file.write(gin.config_str())

    # Properly restrict pytorch to not consume extra resources.
    # - https://github.com/pytorch/pytorch/issues/975
    # - https://github.com/ray-project/ray/issues/3609
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"

    evaluate_qmix([
        player2_policies.Player2v0(),
        player2_policies.Player2v1(),
        player2_policies.Player2v2(),
        player2_policies.Player2v3(),
        player2_policies.Player2v4()
    ])