)
from detectron2.modeling import build_model
from detectron2.solver import build_lr_scheduler, build_optimizer
from detectron2.utils.events import (
    CommonMetricPrinter,
    EventStorage,
    JSONWriter,
    TensorboardXWriter,
)

import pytorch_warmup as warmup
import pfa
import wandb

wandb.init(project="deepscribe-detectron", sync_tensorboard=True)

logger = logging.getLogger("detectron2")


def get_evaluator(cfg, dataset_name, output_folder=None):
    """
    Create evaluator(s) for a given dataset.
    This uses the special metadata "evaluator_type" associated with each builtin dataset.
    For your own dataset, you can simply create an evaluator manually in your
    script and do not have to worry about the hacky if-else logic here.
    """
    if output_folder is None:
        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
    return COCOEvaluator(dataset_name, cfg, True, output_folder)
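# --- Hedged example (not from the original file) ---
# A minimal sketch of how get_evaluator() is typically wired into an evaluation loop,
# following detectron2's plain_train_net convention. `do_test` and the use of
# cfg.DATASETS.TEST are assumptions here, not code taken from the snippet above.
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import inference_on_dataset


def do_test(cfg, model):
    results = {}
    for dataset_name in cfg.DATASETS.TEST:
        data_loader = build_detection_test_loader(cfg, dataset_name)
        evaluator = get_evaluator(cfg, dataset_name)
        # runs the model over the whole loader and aggregates COCO-style metrics
        results[dataset_name] = inference_on_dataset(model, data_loader, evaluator)
    return results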
import os

import numpy as np
import pandas as pd
from utils import args_util, plmodel_util, dataloading
import argparse
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from pytorch_lightning.core.lightning import LightningModule
import pytorch_lightning as pl

os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
# os.environ['CUDA_VISIBLE_DEVICES'] = "1,2"

from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from pytorch_lightning.utilities.distributed import rank_zero_only
import wandb

wandb.init(project="fp_lightning")


@rank_zero_only
def wandb_save(wandb_logger, config):
    wandb_logger.log_hyperparams(config)
    wandb_logger.experiment.save('./pl_fingerprint.py', policy="now")


def main():
    arg_parser = args_util.add_general_args()
    arg_parser = args_util.add_train_args(arg_parser)
    arg_parser = args_util.add_model_args(arg_parser)
    args = arg_parser.parse_args()
    # update parameters
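# --- Hedged example (not from the original file) ---
# A plausible continuation of main(): attach a WandbLogger to a Lightning Trainer so the
# run initialized above also captures Lightning's metrics. The model/dataloader arguments
# and `args.epochs` are placeholders; the project's real classes live in plmodel_util and
# dataloading and are not reproduced here.
from pytorch_lightning.loggers import WandbLogger


def fit_with_wandb(model: LightningModule, train_loader: DataLoader, args):
    wandb_logger = WandbLogger(project="fp_lightning")
    wandb_save(wandb_logger, vars(args))  # rank-zero-only hyperparameter logging from above
    checkpoint_cb = ModelCheckpoint(monitor="val_loss", mode="min")
    trainer = pl.Trainer(logger=wandb_logger,
                         callbacks=[checkpoint_cb],
                         max_epochs=args.epochs)
    trainer.fit(model, train_loader)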
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.utils import np_utils
from keras.callbacks import Callback
import json

from wandb.keras import WandbCallback
import wandb

run = wandb.init()
config = run.config
config.optimizer = "adam"
config.epochs = 50
config.dropout = 10
config.hidden_nodes = 100

# load data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
img_width = X_train.shape[1]
img_height = X_train.shape[2]

X_train = X_train.astype('float32')
X_train /= 255.
X_test = X_test.astype('float32')
X_test /= 255.

# one hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
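# --- Hedged example (not from the original file) ---
# A sketch of how such a script typically continues: a small dense classifier trained with
# WandbCallback so metrics stream into the run configured above. The layer layout and the
# dropout/100 interpretation are assumptions, not taken from the original file.
num_classes = y_train.shape[1]

model = Sequential()
model.add(Flatten(input_shape=(img_width, img_height)))
model.add(Dense(config.hidden_nodes, activation='relu'))
model.add(Dropout(config.dropout / 100.0))  # config.dropout reads like a percentage
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=config.optimizer,
              metrics=['accuracy'])
model.fit(X_train, y_train,
          epochs=config.epochs,
          validation_data=(X_test, y_test),
          callbacks=[WandbCallback()])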
                    default='O2',
                    help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                         "See details at https://nvidia.github.io/apex/amp.html")
parser.add_argument("--adam_epsilon",
                    default=1e-8,
                    type=float,
                    help="Epsilon for Adam optimizer.")
parser.add_argument("--warmup_steps",
                    default=1000,
                    type=int,
                    help="Linear warmup over warmup_steps.")
args = parser.parse_args()

if use_wandb:
    wandb.init(project='grammar', name=args.exp_name, config=args)

if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')

logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
    device, n_gpu, bool(args.local_rank != -1), args.fp16))
def main(argv): args = parser.parse_args() print('Load test starting') project_name = args.project if project_name is None: project_name = 'artifacts-load-test-%s' % str(datetime.now()).replace( ' ', '-').replace(':', '-').replace('.', '-') env_project = os.environ.get('WANDB_PROJECT') sweep_id = os.environ.get('WANDB_SWEEP_ID') if sweep_id: del os.environ['WANDB_SWEEP_ID'] wandb_config_paths = os.environ.get('WANDB_CONFIG_PATHS') if wandb_config_paths: del os.environ['WANDB_CONFIG_PATHS'] wandb_run_id = os.environ.get('WANDB_RUN_ID') if wandb_run_id: del os.environ['WANDB_RUN_ID'] # set global entity and project before chdir'ing from wandb.apis import InternalApi api = InternalApi() settings_entity = api.settings('entity') settings_base_url = api.settings('base_url') os.environ['WANDB_ENTITY'] = (os.environ.get('LOAD_TEST_ENTITY') or settings_entity) os.environ['WANDB_PROJECT'] = project_name os.environ['WANDB_BASE_URL'] = (os.environ.get('LOAD_TEST_BASE_URL') or settings_base_url) # Change dir to avoid litering code directory pwd = os.getcwd() tempdir = tempfile.TemporaryDirectory() os.chdir(tempdir.name) artifact_name = 'load-artifact-' + ''.join( random.choices(string.ascii_lowercase + string.digits, k=10)) print('Generating source data') source_file_names = gen_files(args.gen_n_files, args.gen_max_small_size, args.gen_max_large_size) print('Done generating source data') procs = [] stop_queue = multiprocessing.Queue() stats_queue = multiprocessing.Queue() # start all processes # writers for i in range(args.num_writers): file_names = source_file_names if args.non_overlapping_writers: chunk_size = int(len(source_file_names) / args.num_writers) file_names = source_file_names[i * chunk_size:(i + 1) * chunk_size] p = multiprocessing.Process( target=proc_version_writer, args=(stop_queue, stats_queue, project_name, file_names, artifact_name, args.files_per_version_min, args.files_per_version_max)) p.start() procs.append(p) # readers for i in range(args.num_readers): p = multiprocessing.Process(target=proc_version_reader, args=(stop_queue, stats_queue, project_name, artifact_name, i)) p.start() procs.append(p) # deleters for i in range(args.num_deleters): p = multiprocessing.Process( target=proc_version_deleter, args=(stop_queue, stats_queue, artifact_name, args.min_versions_before_delete, args.delete_period_max)) p.start() procs.append(p) # cache garbage collector if args.cache_gc_period_max is None: print('Testing cache GC process not enabled!') else: p = multiprocessing.Process(target=proc_cache_garbage_collector, args=(stop_queue, args.cache_gc_period_max)) p.start() procs.append(p) # reset environment os.environ['WANDB_ENTITY'] = settings_entity os.environ['WANDB_BASE_URL'] = settings_base_url os.environ if env_project is None: del os.environ['WANDB_PROJECT'] else: os.environ['WANDB_PROJECT'] = env_project if sweep_id: os.environ['WANDB_SWEEP_ID'] = sweep_id if wandb_config_paths: os.environ['WANDB_CONFIG_PATHS'] = wandb_config_paths if wandb_run_id: os.environ['WANDB_RUN_ID'] = wandb_run_id # go back to original dir os.chdir(pwd) # test phase start_time = time.time() stats = defaultdict(int) run = wandb.init(job_type='main-test-phase') run.config.update(args) while time.time() - start_time < args.test_phase_seconds: stat_update = None try: stat_update = stats_queue.get(True, 5000) except queue.Empty: pass print('** Test time: %s' % (time.time() - start_time)) if stat_update: for k, v in stat_update.items(): stats[k] += v wandb.log(stats) print('Test phase time expired') # stop all 
processes and wait til all are done for i in range(len(procs)): stop_queue.put(True) print('Waiting for processes to stop') fail = False for proc in procs: proc.join() if proc.exitcode != 0: print('FAIL! Test phase failed') fail = True sys.exit(1) # drain remaining stats while True: try: stat_update = stats_queue.get_nowait() except queue.Empty: break for k, v in stat_update.items(): stats[k] += v print('Stats') import pprint pprint.pprint(dict(stats)) if fail: print('FAIL! Test phase failed') sys.exit(1) else: print('Test phase successfully completed') print('Starting verification phase') os.environ['WANDB_ENTITY'] = (os.environ.get('LOAD_TEST_ENTITY') or settings_entity) os.environ['WANDB_PROJECT'] = project_name os.environ['WANDB_BASE_URL'] = (os.environ.get('LOAD_TEST_BASE_URL') or settings_base_url) data_api = wandb.Api() # we need list artifacts by walking runs, accessing via # project.artifactType.artifacts only returns committed artifacts for run in data_api.runs('%s/%s' % (api.settings('entity'), project_name)): for v in run.logged_artifacts(): # TODO: allow deleted once we build deletion support if v.state != 'COMMITTED' and v.state != 'DELETED': print('FAIL! Artifact version not committed or deleted: %s' % v) sys.exit(1) print('Verification succeeded')
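# --- Hedged example (not from the original file) ---
# The writer/reader processes driven by this load test are not shown in this chunk; a
# minimal sketch of the wandb Artifacts calls such workers revolve around looks like this
# (the function names and the 'dataset' artifact type are illustrative).
import wandb


def write_one_version(artifact_name, file_names):
    with wandb.init(job_type='writer') as run:
        artifact = wandb.Artifact(artifact_name, type='dataset')
        for path in file_names:
            artifact.add_file(path)
        run.log_artifact(artifact)  # commits a new version of the artifact


def read_latest_version(artifact_name):
    with wandb.init(job_type='reader') as run:
        artifact = run.use_artifact(f'{artifact_name}:latest')
        return artifact.download()  # local directory containing that version's files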
def main(args):
    parser = get_config()
    all_args = parse_args(args, parser)

    if all_args.algorithm_name == "rmappo" or all_args.algorithm_name == "rmappg":
        assert (all_args.use_recurrent_policy
                or all_args.use_naive_recurrent_policy), ("check recurrent policy!")
    elif all_args.algorithm_name == "mappo" or all_args.algorithm_name == "mappg":
        assert (all_args.use_recurrent_policy == False
                and all_args.use_naive_recurrent_policy == False), ("check recurrent policy!")
    else:
        raise NotImplementedError

    # cuda
    if all_args.cuda and torch.cuda.is_available():
        print("choose to use gpu...")
        device = torch.device("cuda:0")
        torch.set_num_threads(all_args.n_training_threads)
        if all_args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        print("choose to use cpu...")
        device = torch.device("cpu")
        torch.set_num_threads(all_args.n_training_threads)

    # run dir
    run_dir = Path(
        os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/results"
    ) / all_args.env_name / all_args.hanabi_name / all_args.algorithm_name / all_args.experiment_name
    if not run_dir.exists():
        os.makedirs(str(run_dir))

    # wandb
    if all_args.use_wandb:
        run = wandb.init(config=all_args,
                         project=all_args.env_name,
                         entity=all_args.user_name,
                         notes=socket.gethostname(),
                         name=str(all_args.algorithm_name) + "_" +
                              str(all_args.experiment_name) + "_seed" + str(all_args.seed),
                         group=all_args.hanabi_name,
                         dir=str(run_dir),
                         job_type="training",
                         reinit=True)
    else:
        if not run_dir.exists():
            curr_run = 'run1'
        else:
            exst_run_nums = [
                int(str(folder.name).split('run')[1])
                for folder in run_dir.iterdir()
                if str(folder.name).startswith('run')
            ]
            if len(exst_run_nums) == 0:
                curr_run = 'run1'
            else:
                curr_run = 'run%i' % (max(exst_run_nums) + 1)
        run_dir = run_dir / curr_run
        if not run_dir.exists():
            os.makedirs(str(run_dir))

    setproctitle.setproctitle(
        str(all_args.algorithm_name) + "-" + str(all_args.env_name) + "-" +
        str(all_args.experiment_name) + "@" + str(all_args.user_name))

    # seed
    torch.manual_seed(all_args.seed)
    torch.cuda.manual_seed_all(all_args.seed)
    np.random.seed(all_args.seed)

    # env init
    envs = make_train_env(all_args)
    eval_envs = make_eval_env(all_args) if all_args.use_eval else None
    num_agents = all_args.num_agents

    config = {
        "all_args": all_args,
        "envs": envs,
        "eval_envs": eval_envs,
        "num_agents": num_agents,
        "device": device,
        "run_dir": run_dir
    }

    # run experiments
    if all_args.share_policy:
        from onpolicy.runner.shared.hanabi_runner_backward import HanabiRunner as Runner
    else:
        from onpolicy.runner.separated.hanabi_runner_backward import HanabiRunner as Runner

    runner = Runner(config)
    runner.run()

    # post process
    envs.close()
    if all_args.use_eval and eval_envs is not envs:
        eval_envs.close()

    if all_args.use_wandb:
        run.finish()
    else:
        runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
        runner.writter.close()
def run_training(args): print('---------- Initialize W&B run for experiment tracking----------\n') run = wandb.init(entity=args.wandb_entity, project=args.wandb_project, job_type='train') wandb.config.update(args) print('---------- Perform Training ----------') savedir = args.savepath if not os.path.exists(savedir): os.mkdir(savedir) head_tail = os.path.split(args.dataset) savedir = os.path.join(savedir, head_tail[1]) if not os.path.exists(savedir): os.mkdir(savedir) if not os.path.exists(os.path.join(savedir, "trained model")): os.mkdir(os.path.join(savedir, "trained model")) print('creating directory %s' % (os.path.join(savedir, "trained model"))) if not os.path.exists(os.path.join(savedir, "saved training")): os.mkdir(os.path.join(savedir, "saved training")) print('creating directory %s' % (os.path.join(savedir, "saved training"))) print('XField type: %s' % (args.type)) print('Dimension of input xfield: %s' % (args.dim)) #loading images images, coordinates, all_pairs, h_res, w_res = load_imgs(args) dims = args.dim num_n = args.num_n # number of neighbors min_ = np.min(coordinates) max_ = np.max(coordinates) print('\n ------- Creating the model -------') # batch size is num_n + 1 (number of neighbors + target) inputs = tf.placeholder(tf.float32, shape=[num_n + 1, 1, 1, len(dims)]) # Jacobian network num_output = len(args.type) * 2 with tf.variable_scope("gen_flows"): flows = Flow(inputs, h_res, w_res, num_output, args.nfg, min_, max_) nparams_decoder = np.sum([ np.prod(v.get_shape().as_list()) for v in tf.trainable_variables() if v.name.startswith("gen_flows") ]) print('Number of learnable parameters (decoder): %d' % (nparams_decoder)) # learnt albedo # The albedos are initialized with constant 1.0 if args.type == ['light', 'view', 'time']: with tf.variable_scope("gen_flows"): # For light-view-time interpolation, we consider num_views*num_times albedos albedos = tf.Variable(tf.constant( 1.0, shape=[dims[1] * dims[2], h_res, w_res, 3]), name='albedo') index_albedo = tf.placeholder(tf.int32, shape=(1, )) albedo = tf.gather(albedos, index_albedo, 0) nparams = np.sum([ np.prod(v.get_shape().as_list()) for v in tf.trainable_variables() if v.name.startswith("gen_flows") ]) print( 'Number of learnable parameters (%d albedos with res %d x %d ): %d' % (dims[1] * dims[2], h_res, w_res, nparams - nparams_decoder)) elif args.type == ['light']: with tf.variable_scope("gen_flows"): # For light interpolation, we consider just one albedo albedo = tf.Variable(tf.constant(1.0, shape=[1, h_res, w_res, 3]), name='albedo') nparams = np.sum([ np.prod(v.get_shape().as_list()) for v in tf.trainable_variables() if v.name.startswith("gen_flows") ]) print( 'Number of learnable parameters (%d albedos with res %d x %d ): %d' % (1, h_res, w_res, nparams - nparams_decoder)) else: # For view and time interpolation, we do not train for albedo, we consider it as a constant non-learnable parameter albedo = tf.constant(1.0, shape=[1, h_res, w_res, 3]) Neighbors = tf.placeholder(tf.float32, shape=[num_n, h_res, w_res, 3]) # soft blending interpolated = Blending_train(inputs, Neighbors, flows, albedo, h_res, w_res, args) Reference = tf.placeholder(tf.float32, shape=[1, h_res, w_res, 3]) # L1 loss loss = tf.reduce_mean((tf.abs(interpolated - Reference))) gen_tvars = [ var for var in tf.trainable_variables() if var.name.startswith("gen_flows") ] learning_rate = tf.placeholder(tf.float32, shape=()) gen_optim = tf.train.AdamOptimizer(learning_rate) gen_grads = gen_optim.compute_gradients(loss, var_list=gen_tvars) gen_train = 
gen_optim.apply_gradients(gen_grads) saver = tf.train.Saver(max_to_keep=1000) sess = tf.Session() sess.run(tf.global_variables_initializer()) if args.load_pretrained: ckpt = tf.train.get_checkpoint_state("%s/trained model" % (savedir)) if ckpt: print('\n loading pretrained model ' + ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) print('------------ Start Training ------------') lr = args.lr print('Starting learning rate with %0.4f' % (lr)) stop_l1_thr = 0.01 iter_end = 100000 # total number of iterations indices = np.array([i for i in range(len(all_pairs))]) if len(indices ) < 500: # we considered around 500 iterations per each epoch indices = np.repeat(indices, 500 // len(indices)) epoch_size = len(indices) epoch_end = iter_end // epoch_size # total number of epochs if args.type == ['light', 'view', 'time']: st = time.time() min_loss = 1000 l1_loss_t = 1 epoch = 0 while l1_loss_t > stop_l1_thr and epoch <= epoch_end: l1_loss_t = 0 np.random.shuffle(indices) for id in range(epoch_size): pair = all_pairs[indices[id], ::] input_coords = coordinates[pair[:num_n + 1], ::] reference_img = images[pair[:1], ::] Neighbors_img = images[pair[1:num_n + 1], ::] _index = [pair[-1]] _, l1loss = sess.run( [gen_train, loss], feed_dict={ inputs: input_coords, Reference: reference_img, Neighbors: Neighbors_img, learning_rate: lr, index_albedo: _index }) l1_loss_t = l1_loss_t + l1loss print( '\r Epoch %3.0d Iteration %3.0d of %3.0d Cumulative L1 loss = %3.3f' % (epoch, id + 1, epoch_size, l1_loss_t), end=" ") wandb.log({'Cumulative L1 loss': l1_loss_t}) l1_loss_t = l1_loss_t / epoch_size print(" elapsed time %3.1f m Averaged L1 loss = %3.5f " % ((time.time() - st) / 60, l1_loss_t)) wandb.log({'epoch': epoch, 'Averaged L1 loss': l1_loss_t}) if l1_loss_t < min_loss: saver.save(sess, "%s/trained model/model.ckpt" % (savedir)) min_loss = l1_loss_t center = np.prod(dims) // 2 cv2.imwrite("%s/saved training/reference.png" % (savedir), np.uint8(images[center, ::] * 255)) pair = all_pairs[3 * center + 0, ::] out_img, flows_out = sess.run( [interpolated, flows], feed_dict={ inputs: coordinates[pair[:num_n + 1], ::], Neighbors: images[pair[1:num_n + 1], ::], index_albedo: [pair[-1]] }) out_img = np.minimum(np.maximum(out_img, 0.0), 1.0) cv2.imwrite("%s/saved training/recons_light.png" % (savedir), np.uint8(out_img[0, ::] * 255)) wandb.log({ 'Reconstructed Light': [ wandb.Image("%s/saved training/recons_light.png" % (savedir)) ] }) flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 0:2], convert_to_bgr=False) cv2.imwrite("%s/saved training/flow_light.png" % (savedir), np.uint8(flow_color)) wandb.log({ 'Flow Light': [wandb.Image("%s/saved training/flow_light.png" % (savedir))] }) flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 2:4], convert_to_bgr=False) cv2.imwrite("%s/saved training/flow_view.png" % (savedir), np.uint8(flow_color)) wandb.log({ 'Flow View': [wandb.Image("%s/saved training/flow_view.png" % (savedir))] }) flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 4:6], convert_to_bgr=False) cv2.imwrite("%s/saved training/flow_time.png" % (savedir), np.uint8(flow_color)) wandb.log({ 'Flow Time': [wandb.Image("%s/saved training/flow_time.png" % (savedir))] }) pair = all_pairs[3 * center + 1, ::] out_img = sess.run(interpolated, feed_dict={ inputs: coordinates[pair[:num_n + 1], ::], Neighbors: images[pair[1:num_n + 1], ::], index_albedo: [pair[-1]] }) out_img = np.minimum(np.maximum(out_img, 0.0), 1.0) cv2.imwrite("%s/saved training/recons_view.png" % (savedir), 
np.uint8(out_img[0, ::] * 255)) wandb.log({ 'Reconstructed View': [wandb.Image("%s/saved training/recons_view.png" % (savedir))] }) pair = all_pairs[3 * center + 2, ::] out_img = sess.run(interpolated, feed_dict={ inputs: coordinates[pair[:num_n + 1], ::], Neighbors: images[pair[1:num_n + 1], ::], index_albedo: [pair[-1]] }) out_img = np.minimum(np.maximum(out_img, 0.0), 1.0) cv2.imwrite("%s/saved training/recons_time.png" % (savedir), np.uint8(out_img[0, ::] * 255)) wandb.log({ 'Reconstructed Time': [wandb.Image("%s/saved training/recons_time.png" % (savedir))] }) epoch = epoch + 1 if epoch == epoch_end // 2: lr = 0.00005 if args.type == ['view'] or args.type == ['time' ] or args.type == ['light']: st = time.time() img_mov = cv2.VideoWriter( '%s/saved training/epoch_recons.mp4' % (savedir), cv2.VideoWriter_fourcc(*'mp4v'), 10, (w_res, h_res)) flow_mov = cv2.VideoWriter( '%s/saved training/epoch_flows.mp4' % (savedir), cv2.VideoWriter_fourcc(*'mp4v'), 10, (w_res, h_res)) min_loss = 1000 l1_loss_t = 1 epoch = 0 while l1_loss_t > stop_l1_thr and epoch <= epoch_end: l1_loss_t = 0 np.random.shuffle(indices) for id in range(epoch_size): pair = all_pairs[indices[id], ::] input_coords = coordinates[pair[:num_n + 1], ::] reference_img = images[pair[:1], ::] Neighbors_img = images[pair[1:num_n + 1], ::] _, l1loss = sess.run( [gen_train, loss], feed_dict={ inputs: input_coords, Reference: reference_img, Neighbors: Neighbors_img, learning_rate: lr, }) l1_loss_t = l1_loss_t + l1loss print( '\r Epoch %3.0d Iteration %3.0d of %3.0d Cumulative L1 loss = %3.3f' % (epoch, id + 1, epoch_size, l1_loss_t), end=" ") wandb.log({'Cumulative L1 loss': l1_loss_t}) l1_loss_t = l1_loss_t / epoch_size print(" elapsed time %3.1f m Averaged L1 loss = %3.5f" % ((time.time() - st) / 60, l1_loss_t)) wandb.log({'epoch': epoch, 'Averaged L1 loss': l1_loss_t}) if l1_loss_t < min_loss: saver.save(sess, "%s/trained model/model.ckpt" % (savedir)) min_loss = l1_loss_t if args.type == ['light']: albedo_out = np.minimum(np.maximum(sess.run(albedo), 0.0), 1.0) cv2.imwrite("%s/saved training/albedo.png" % (savedir), np.uint8(albedo_out[0, :, :, :] * 255)) wandb.log({ 'Albedo': [wandb.Image("%s/saved training/albedo.png" % (savedir))] }) center = np.prod(dims) // 2 cv2.imwrite("%s/saved training/reference.png" % (savedir), np.uint8(images[center, ::] * 255)) wandb.log({ 'Reference': [wandb.Image("%s/saved training/reference.png" % (savedir))] }) pair = all_pairs[(len(all_pairs) // len(images)) * center, ::] out_img, flows_out = sess.run( [interpolated, flows], feed_dict={ inputs: coordinates[pair[:num_n + 1], ::], Neighbors: images[pair[1:num_n + 1], ::] }) out_img = np.minimum(np.maximum(out_img, 0.0), 1.0) cv2.imwrite("%s/saved training/recons.png" % (savedir), np.uint8(out_img[0, ::] * 255)) wandb.log({ 'Reconstruction': [wandb.Image("%s/saved training/recons.png" % (savedir))] }) flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 0:2], convert_to_bgr=False) cv2.imwrite("%s/saved training/flow.png" % (savedir), np.uint8(flow_color)) wandb.log({ 'Flow': [wandb.Image("%s/saved training/flow.png" % (savedir))] }) img_mov.write(np.uint8(out_img[0, ::] * 255)) flow_mov.write(np.uint8(flow_color)) epoch = epoch + 1 if epoch == epoch_end // 2: lr = 0.00005 img_mov.release() flow_mov.release() wandb.log({ "epoch_recons": wandb.Video('%s/saved training/epoch_recons.mp4' % (savedir), fps=4, format="gif") }) wandb.log({ "epoch_flows": wandb.Video('%s/saved training/epoch_flows.mp4' % (savedir), fps=4, format="gif") })
    # Commence training
    model = transfer_utils.train_model(model,
                                       dataloaders,
                                       dataset_sizes,
                                       class_names,
                                       criterion,
                                       optimizer,
                                       scheduler,
                                       num_epochs=args.epochs,
                                       curr_epoch=curr_epoch,
                                       checkpoint_dir=args.checkpoint_dir)


if __name__ == "__main__":
    wandb.init(project="tm-poverty-prediction")
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser(description='Philippine Poverty Prediction')
    parser.add_argument('--batch-size',
                        type=int,
                        default=32,
                        metavar='N',
                        help='input batch size for training (default: 32)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-6,
                        metavar='LR',
                        help='learning rate (default: 1e-6)')
    parser.add_argument('--epochs',
def run_training(command_history: utils.CommandHistory, game_params: GameParams, model_params: ModelParams, optim_params: OptimParams, simulation_params: SimulationParams, execution_params: ExecutionParams, run_group="Default Group") -> None: wandb.init(project="thesis-az", group=run_group) cfg = { **asdict(game_params), **asdict(model_params), **asdict(optim_params), **asdict(simulation_params), **asdict(execution_params), } wandb.config.update(cfg) start_time = time.time() logger_path = os.path.join(execution_params.checkpoint_dir, "train.log") sys.stdout = utils.Logger(logger_path) print("#" * 70) print("#" + "TRAINING".center(68) + "#") print("#" * 70) print("setting-up pseudo-random generator...") seed_generator = utils.generate_random_seeds(seed=execution_params.seed) # checkpoint, resume from where it stops epoch = 0 ckpts = list( utils.gen_checkpoints(checkpoint_dir=execution_params.checkpoint_dir, only_last=True, real_time=False)) checkpoint = {} if ckpts: checkpoint = ckpts[0] former_command_history = checkpoint["command_history"] command_history.build_history(former_command_history) optim_params = command_history.update_params_from_checkpoint( checkpoint_params=checkpoint["optim_params"], resume_params=optim_params) simulation_params = command_history.update_params_from_checkpoint( checkpoint_params=checkpoint["simulation_params"], resume_params=simulation_params, ) execution_params = command_history.update_params_from_checkpoint( checkpoint_params=checkpoint["execution_params"], resume_params=execution_params, ) if command_history.last_command_contains("init_checkpoint"): if ckpts: raise RuntimeError( "Cannot restart from init_checkpoint, already restarting from non-empty checkpoint_dir" ) # pretrained model, consider new training from epoch zero print("loading pretrained model from checkpoint...") checkpoint = utils.load_checkpoint( checkpoint_path=model_params.init_checkpoint) if checkpoint: # game_params and model_params cannot change on a checkpoint # either write the same, or don't specify them ignored = {"init_checkpoint", "game_name"} # this one can change current_params = dict(game_params=game_params, model_params=model_params) for params_name, params in current_params.items(): for attr, val in asdict(params).items(): if command_history.last_command_contains( attr) and attr not in ignored: ckpt_val = getattr(checkpoint[params_name], attr) assert val == ckpt_val, f"When resuming, got '{val}' for {attr} but cannot override from past run with '{ckpt_val}'." 
specified_game_name = game_params.game_name game_params = checkpoint["game_params"] if specified_game_name is not None: game_params.game_name = specified_game_name model_params = checkpoint["model_params"] epoch = checkpoint["epoch"] print("reconstructing the model...") else: print("creating and saving the model...") train_device = execution_params.device[0] game_generation_devices = ([train_device] if len(execution_params.device) == 1 else execution_params.device[1:]) train_device = torch.device(train_device) model = create_model( game_params=game_params, model_params=model_params, resume_training=bool(checkpoint), model_state_dict=checkpoint["model_state_dict"] if checkpoint else None, ).to(train_device) model_path = execution_params.checkpoint_dir / "model.pt" model.save(str(model_path)) ddpmodel = None if execution_params.ddp: torch.distributed.init_process_group(backend="nccl") ddpmodel = nn.parallel.DistributedDataParallel( ModelWrapperForDDP(model)) print("creating optimizer...") optim = create_optimizer( model=model, optim_params=optim_params, optim_state_dict=checkpoint.get("optim_state_dict", None), ) print("creating training environment...") context, assembler, get_train_reward = create_training_environment( seed_generator=seed_generator, model_path=model_path, game_generation_devices=game_generation_devices, game_params=game_params, simulation_params=simulation_params, execution_params=execution_params) assembler.update_model(model.state_dict()) assembler.add_tournament_model("init", model.state_dict()) context.start() print("warming-up replay buffer...") warm_up_replay_buffer( assembler=assembler, replay_warmup=simulation_params.replay_warmup, replay_buffer=checkpoint.get("replay_buffer", None), ) print("training model...") train_model(command_history=command_history, start_time=start_time, train_device=train_device, model=model, ddpmodel=ddpmodel, model_path=model_path, optim=optim, context=context, assembler=assembler, get_train_reward=get_train_reward, game_params=game_params, model_params=model_params, optim_params=optim_params, simulation_params=simulation_params, execution_params=execution_params, epoch=epoch) elapsed_time = time.time() - start_time print(f"total time: {elapsed_time} s")
import tensorflow.keras.layers as tfkl
import tensorflow.keras.backend as K
from keras.layers import GlobalAveragePooling2D, Dense, Flatten
from sklearn.preprocessing import LabelBinarizer
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, InputLayer, Activation
from keras.models import Sequential
from keras import optimizers
from postprocess import Postprocess
import params
import requests
import pandas as pd
import pickle

# Import wandb libraries
import wandb

wandb.init(project="vgg_training_03")
from wandb.keras import WandbCallback


def telegram_bot_sendtext(bot_message):
    bot_token = '1153335989:AAE4v1w9FD_vCUaG2qcq-WmuPwh_MBYWWho'
    bot_chatID = '675791133'
    send_text = ('https://api.telegram.org/bot' + bot_token +
                 '/sendMessage?chat_id=' + bot_chatID +
                 '&parse_mode=Markdown&text=' + bot_message)
    response = requests.get(send_text)
    return response.json()


def VGGish(pump=None,
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: train_dataset = datasets.load_dataset("corpora/com_voice_sex_corpus", split="train", cache_dir=model_args.cache_dir) eval_dataset = datasets.load_dataset("corpora/com_voice_sex_corpus", split="test", cache_dir=model_args.cache_dir) feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True) processor = CustomWav2Vec2Processor(feature_extractor=feature_extractor) model = Wav2Vec2CommVoiceGenderModel.from_pretrained( "facebook/wav2vec2-large-xlsr-53", attention_dropout=0.01, hidden_dropout=0.01, feat_proj_dropout=0.0, mask_time_prob=0.05, layerdrop=0.01, gradient_checkpointing=True, ) if model_args.freeze_feature_extractor: model.freeze_feature_extractor() if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range( data_args.max_train_samples)) if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_val_samples)) # Preprocessing the datasets. # We need to read the aduio files as arrays and tokenize the targets. 
def speech_file_to_array_fn(batch): start = 0 stop = 10 srate = 16_000 speech_array, sampling_rate = torchaudio.load(batch["file"]) speech_array = speech_array[0].numpy()[:stop * sampling_rate] batch["speech"] = librosa.resample(np.asarray(speech_array), sampling_rate, srate) batch["sampling_rate"] = srate batch["parent"] = batch["label"] return batch train_dataset = train_dataset.map( speech_file_to_array_fn, remove_columns=train_dataset.column_names, num_proc=data_args.preprocessing_num_workers, ) eval_dataset = eval_dataset.map( speech_file_to_array_fn, remove_columns=eval_dataset.column_names, num_proc=data_args.preprocessing_num_workers, ) def prepare_dataset(batch): # check that all files have the correct sampling rate assert ( len(set(batch["sampling_rate"])) == 1 ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." batch["input_values"] = processor( batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values batch["labels"] = batch["parent"] return batch train_dataset = train_dataset.map( prepare_dataset, remove_columns=train_dataset.column_names, batch_size=training_args.per_device_train_batch_size, batched=True, num_proc=data_args.preprocessing_num_workers, ) eval_dataset = eval_dataset.map( prepare_dataset, remove_columns=eval_dataset.column_names, batch_size=training_args.per_device_train_batch_size, batched=True, num_proc=data_args.preprocessing_num_workers, ) from sklearn.metrics import classification_report, confusion_matrix def compute_metrics(pred): label_idx = [0, 1] label_names = ['female', 'male'] labels = pred.label_ids.argmax(-1) preds = pred.predictions.argmax(-1) acc = accuracy_score(labels, preds) f1 = f1_score(labels, preds, average='macro') report = classification_report(y_true=labels, y_pred=preds, labels=label_idx, target_names=label_names) matrix = confusion_matrix(y_true=labels, y_pred=preds) print(report) print(matrix) wandb.log({ "conf_mat": wandb.plot.confusion_matrix(probs=None, y_true=labels, preds=preds, class_names=label_names) }) wandb.log({ "precision_recall": wandb.plot.pr_curve(y_true=labels, y_probas=pred.predictions, labels=label_names) }) return {"accuracy": acc, "f1_score": f1} wandb.init(name=training_args.output_dir, config=training_args) # Data collator data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True) # Initialize our Trainer trainer = CTCTrainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=processor.feature_extractor, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # save the feature_extractor and the tokenizer if is_main_process(training_args.local_rank): processor.save_pretrained(training_args.output_dir) metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = 
trainer.evaluate()
        max_val_samples = (data_args.max_val_samples
                           if data_args.max_val_samples is not None
                           else len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    return results
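# --- Hedged example (not from the original file) ---
# The wandb plotting calls used inside compute_metrics above, shown standalone so they can
# be sanity-checked without running the Trainer; the toy labels/probabilities are made up.
import numpy as np
import wandb

run = wandb.init(project="wandb-plot-demo")  # illustrative project name
y_true = [0, 1, 1, 0, 1]
y_probas = np.array([[0.8, 0.2], [0.3, 0.7], [0.4, 0.6], [0.9, 0.1], [0.2, 0.8]])
preds = y_probas.argmax(-1).tolist()

wandb.log({"conf_mat": wandb.plot.confusion_matrix(probs=None, y_true=y_true, preds=preds,
                                                   class_names=["female", "male"])})
wandb.log({"precision_recall": wandb.plot.pr_curve(y_true, y_probas,
                                                   labels=["female", "male"])})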
# WGAN
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torchvision.utils import save_image

import wandb

wandb.init(job_type='train', project='WGAN', name='WGAN')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bs = 500

# transform = transforms.Compose([transforms.ToTensor(),
#                                 transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])
train_dataset = datasets.FashionMNIST(root='./fashion_mnist_data',
                                      train=True,
                                      transform=transforms.ToTensor(),
                                      download=True)
test_dataset = datasets.FashionMNIST(root='./fashion_mnist_data',
                                     train=False,
                                     transform=transforms.ToTensor(),
                                     download=False)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=bs,
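# --- Hedged example (not from the original file) ---
# The snippet above is truncated at the DataLoader; a minimal sketch of the classic WGAN
# update it presumably builds toward: the critic maximizes D(x) - D(G(z)) and its weights
# are clipped after each step. The tiny MLP generator/critic are placeholders, not the
# original models.
z_dim = 100
G = nn.Sequential(nn.Linear(z_dim, 256), nn.ReLU(), nn.Linear(256, 784), nn.Tanh()).to(device)
D = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 1)).to(device)
g_opt = optim.RMSprop(G.parameters(), lr=5e-5)
d_opt = optim.RMSprop(D.parameters(), lr=5e-5)
clip_value = 0.01

for i, (x, _) in enumerate(train_loader):
    x = x.view(x.size(0), -1).to(device)
    # critic step: raise D on real samples, lower it on generated ones
    z = torch.randn(x.size(0), z_dim, device=device)
    d_loss = -(D(x).mean() - D(G(z).detach()).mean())
    d_opt.zero_grad()
    d_loss.backward()
    d_opt.step()
    for p in D.parameters():
        p.data.clamp_(-clip_value, clip_value)
    # generator step every few critic updates
    if i % 5 == 0:
        z = torch.randn(x.size(0), z_dim, device=device)
        g_loss = -D(G(z)).mean()
        g_opt.zero_grad()
        g_loss.backward()
        g_opt.step()
        wandb.log({'d_loss': d_loss.item(), 'g_loss': g_loss.item()})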
        tf.concat([train_inputs['X'], train_inputs['X2']], axis=2))
    dT_train.append(train_inputs['target'])
    dX_test.append(test_inputs)
    dX_scaler.append(y_scaler)

global_inputs_X = tf.concat(dX_train, 0)
global_inputs_T = tf.concat(dT_train, 0)
print('done with data')

working = '.models/' + dset + '_models/global/trials'

# 1️⃣ Start a new run, tracking config metadata
run = wandb.init(project="3days_forcast",
                 config={
                     'layers': LAYERS,
                     'dropout': DROPOUT,
                     'neurons': NEURONS,
                     'learning rate': LR,
                     'batch_size': BATCHSIZE,
                     "architecture": "RNN with forward lags for temporal",
                     "dataset": "Columbia",
                     "epochs": MAX_EPOCHS,
                     'patience': PATIENCE
                 })
config = wandb.config

# full data LSTM MIMO compilation and fit
LSTMIMO = build_model(l=LAYERS, drop=DROPOUT, n=NEURONS, lr=LR)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=PATIENCE,
                                                  mode='min')
history = LSTMIMO.fit(global_inputs_X,
def train( self, train_dataloader, output_dir, show_running_loss=True, eval_dataloader=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) t_total = len(train_dataloader) // args[ "gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"], }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args[ "warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"]) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 if args["evaluate_during_training"]: training_progress_scores = self._create_training_progress_scores( **kwargs) if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch (lm_loss), (mc_loss), *_ = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, lm_labels=lm_labels, ) # model outputs are always tuple in pytorch-transformers (see doc) loss = lm_loss * args["lm_coef"] + mc_loss * args["mc_coef"] if args["n_gpu"] > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % current_loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_( # amp.master_params(optimizer), args["max_grad_norm"] # ) else: loss.backward() # torch.nn.utils.clip_grad_norm_( # model.parameters(), args["max_grad_norm"] # ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args["max_grad_norm"]) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args[ "logging_steps"] 
== 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args["wandb_project"]: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, }) if args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, model=model) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False, ) if args["wandb_project"]: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_metric: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) if best_eval_metric and args[ "early_stopping_metric_minimize"]: if (results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]): best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if (results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]): best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, 
epoch_number)) if args["save_model_every_epoch"] or args[ "evaluate_during_training"]: os.makedirs(output_dir_current, exist_ok=True) if args["save_model_every_epoch"]: self._save_model(output_dir_current, model=model) if args["evaluate_during_training"]: results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs, ) self._save_model(output_dir_current, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False) if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) if best_eval_metric and args["early_stopping_metric_minimize"]: if results[args[ "early_stopping_metric"]] - best_eval_metric < args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if results[args[ "early_stopping_metric"]] - best_eval_metric > args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 return global_step, tr_loss / global_step
    )

    if args.mirror_augment:
        transform = transforms.Compose(
            [
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True),
            ]
        )
    else:
        transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True),
            ]
        )

    dataset = MultiResolutionDataset(args.path, transform, args.size, args.use_label,
                                     metadata, categories)
    loader = data.DataLoader(
        dataset,
        batch_size=args.batch,
        sampler=data_sampler(dataset, shuffle=True, distributed=args.distributed),
        drop_last=True,
    )

    if get_rank() == 0 and wandb is not None and args.wandb:
        wandb.init(project='stylegan 2')

    train(args, loader, generator, discriminator, g_optim, d_optim, g_ema, device)
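# --- Hedged example (not from the original file) ---
# data_sampler() is defined elsewhere in this script; a minimal implementation consistent
# with how it is called above (it must return a sampler and honor shuffle/distributed)
# could look like this.
from torch.utils import data


def data_sampler(dataset, shuffle, distributed):
    if distributed:
        return data.distributed.DistributedSampler(dataset, shuffle=shuffle)
    if shuffle:
        return data.RandomSampler(dataset)
    return data.SequentialSampler(dataset)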
def __init__(self, gpu, world_size, dataset, batch_size, lr, mom, lambd, model,
             max_epoch, client_epoch, seed, exp_id, early_stop_round, early_stop_metric):
    super().__init__()
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

    self.iid = False
    logger.add(f"logs/asyncfed/{world_size}_{dataset}_{batch_size}_{lr}"
               f"_{mom}_{lambd}_{model}_{max_epoch}_{client_epoch}_PS{exp_id}.log")
    if wandb_enable:
        wandb.init(project="Async_FedAvg",
                   name=f"async_{world_size}_{batch_size}_{max_epoch}_{client_epoch}_{lr}_{lambd}"
                        f"_{mom}_{dataset}_{model}_{'iid' if self.iid else 'noniid'}",
                   config={
                       "method": "async",
                       "world size": world_size,
                       "dataset": dataset,
                       "iid": self.iid,
                       "model": model,
                       "batch size": batch_size,
                       "learning rate": lr,
                       "momentum": mom,
                       "lambda": lambd,
                       "global epoch": max_epoch,
                       "client epoch": client_epoch,
                       "seed": seed,
                       "mom_metho": "normal",
                   })

    self.max_epoch = max_epoch * client_epoch
    self.client_epoch = client_epoch
    self.world_size = world_size
    self.mom = mom
    self.device = f"cuda:{gpu}" if torch.cuda.is_available() else "cpu"

    if dataset == "cifar100":
        class_num = 100
    elif dataset == "emnist":
        class_num = 62
    else:
        class_num = 10

    self.model_name = model
    self.model = load_model(model, class_num=class_num).to(self.device)
    self.lr = lr
    self.lambd = lambd
    self.aggregation = [DataAggregation(r) for r in range(1, world_size)]
    self.embedding_list = []
    self.dyn_task = np.array([0. for _ in range(self.world_size - 1)])
    self.dyn_timer = np.array([0. for _ in range(self.world_size - 1)])
    self.client_counter = 0
    self.wtminus1 = {}
    self.mom_buffer = {}
    self.gminus1 = {}
    self.broadcast_fut_all = None
    self.cluster_is_ready = True
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr, momentum=0.)
    _, self.test_loader = partition_dataset(dataset, world_size - 1, 0, batch_size,
                                            seed, iid=self.iid)
    self.acc_list = []
    self.early_stop_round = early_stop_round
    self.early_stop_metric = early_stop_metric
def train( run_name: str, # Data train_filepath: str, eval_filepath: str, type_vocab_filepath: str, spm_filepath: str, num_workers=1, max_seq_len=1024, max_eval_seq_len=1024, # Model resume_path: str = "", pretrain_resume_path: str = "", pretrain_resume_encoder_name: str = "encoder_q", # encoder_q, encoder_k, encoder pretrain_resume_project: bool = False, no_output_attention: bool = False, encoder_type: str = "transformer", n_encoder_layers: int = 6, d_model: int = 512, # Optimization num_epochs: int = 100, save_every: int = 2, batch_size: int = 256, lr: float = 8e-4, adam_beta1: float = 0.9, adam_beta2: float = 0.98, adam_eps: float = 1e-6, weight_decay: float = 0, warmup_steps: int = 5000, num_steps: int = 200000, # Loss subword_regularization_alpha: float = 0, ignore_any_loss: bool = False, # Computational use_cuda: bool = True, seed: int = 1, ): """Train model""" torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) run_dir = RUN_DIR / run_name run_dir.mkdir(exist_ok=True, parents=True) logger.add(str((run_dir / "train.log").resolve())) logger.info(f"Saving logs, model checkpoints to {run_dir}") config = locals() logger.info(f"Config: {config}") wandb.init(name=run_name, config=config, job_type="training", project="type_prediction", entity="ml4code") if use_cuda: assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False" sp = spm.SentencePieceProcessor() sp.Load(spm_filepath) pad_id = sp.PieceToId("[PAD]") id_to_target, target_to_id = load_type_vocab(type_vocab_filepath) no_type_id = target_to_id["O"] assert no_type_id == 0 # Just a sense check since O is the first line in the vocab file any_id = target_to_id["$any$"] collate_fn = get_collate_fn(pad_id, no_type_id) # Create training dataset and dataloader logger.info(f"Training data path {train_filepath}") train_dataset = DeepTyperDataset( train_filepath, type_vocab_filepath, spm_filepath, max_length=max_seq_len, subword_regularization_alpha=subword_regularization_alpha ) logger.info(f"Training dataset size: {len(train_dataset)}") train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True, collate_fn=collate_fn ) # Create eval dataset and dataloader logger.info(f"Eval data path {eval_filepath}") eval_dataset = DeepTyperDataset( eval_filepath, type_vocab_filepath, spm_filepath, max_length=max_eval_seq_len, subword_regularization_alpha=subword_regularization_alpha, split_source_targets_by_tab=eval_filepath.endswith(".json") ) logger.info(f"Eval dataset size: {len(eval_dataset)}") eval_loader = torch.utils.data.DataLoader( eval_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn ) # Create model model = TypeTransformer(n_tokens=sp.GetPieceSize(), n_output_tokens=len(id_to_target), pad_id=pad_id, encoder_type=encoder_type, n_encoder_layers=n_encoder_layers, d_model=d_model) logger.info(f"Created TypeTransformer {encoder_type} with {count_parameters(model)} params") # Load pretrained checkpoint if pretrain_resume_path: assert not resume_path logger.info(f"Resuming training from pretraining checkpoint {pretrain_resume_path}, pretrain_resume_encoder_name={pretrain_resume_encoder_name}") checkpoint = torch.load(pretrain_resume_path) pretrained_state_dict = checkpoint["model_state_dict"] encoder_state_dict = {} output_state_dict = {} assert pretrain_resume_encoder_name in ["encoder_k", "encoder_q", "encoder"] for key, value in pretrained_state_dict.items(): if 
key.startswith(pretrain_resume_encoder_name + ".") and "project_layer" not in key: remapped_key = key[len(pretrain_resume_encoder_name + ".") :] logger.debug(f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}") encoder_state_dict[remapped_key] = value if key.startswith(pretrain_resume_encoder_name + ".") and "project_layer.0." in key and pretrain_resume_project: remapped_key = key[len(pretrain_resume_encoder_name + ".project_layer.") :] logger.debug(f"Remapping checkpoint project key {key} to output key {remapped_key}. Value mean: {value.mean().item()}") output_state_dict[remapped_key] = value model.encoder.load_state_dict(encoder_state_dict) # TODO: check for head key rather than output for MLM model.output.load_state_dict(output_state_dict, strict=False) logger.info(f"Loaded state dict from {pretrain_resume_path}") # Set up optimizer model = nn.DataParallel(model) model = model.cuda() if use_cuda else model wandb.watch(model, log="all") optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(adam_beta1, adam_beta2), eps=adam_eps, weight_decay=weight_decay) scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, num_steps) epoch = 0 global_step = 0 min_eval_metric = float("inf") if resume_path: assert not pretrain_resume_path logger.info(f"Resuming training from checkpoint {resume_path}") checkpoint = torch.load(resume_path) model.module.load_state_dict(checkpoint["model_state_dict"]) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) epoch = checkpoint["epoch"] global_step = checkpoint["global_step"] min_eval_metric = checkpoint["min_eval_metric"] # Evaluate initial metrics logger.info(f"Evaluating model after epoch {epoch} ({global_step} steps)...") eval_metric, eval_metrics = _evaluate(model, eval_loader, sp, target_to_id=target_to_id, use_cuda=use_cuda, no_output_attention=no_output_attention) for metric, value in eval_metrics.items(): logger.info(f"Evaluation {metric} after epoch {epoch} ({global_step} steps): {value:.4f}") eval_metrics["epoch"] = epoch wandb.log(eval_metrics, step=global_step) for epoch in tqdm.trange(epoch + 1, num_epochs + 1, desc="training", unit="epoch", leave=False): logger.info(f"Starting epoch {epoch}\n") model.train() pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}") for X, lengths, output_attn, labels in pbar: if use_cuda: X, lengths, output_attn, labels = X.cuda(), lengths.cuda(), output_attn.cuda(), labels.cuda() optimizer.zero_grad() if no_output_attention: logits = model(X, lengths, None) # BxLxVocab else: logits = model(X, lengths, output_attn) # BxLxVocab if ignore_any_loss: # Don't train with $any$ type labels_ignore_any = labels.clone() labels_ignore_any[labels_ignore_any == any_id] = no_type_id loss = F.cross_entropy(logits.transpose(1, 2), labels_ignore_any, ignore_index=no_type_id) else: loss = F.cross_entropy(logits.transpose(1, 2), labels, ignore_index=no_type_id) loss.backward() optimizer.step() scheduler.step() # Compute accuracy in training batch (corr1_any, corr5_any), num_labels_any = accuracy(logits, labels, topk=(1, 5), ignore_idx=(no_type_id,)) acc1_any, acc5_any = corr1_any / num_labels_any * 100, corr5_any / num_labels_any * 100 (corr1, corr5), num_labels = accuracy(logits, labels, topk=(1, 5), ignore_idx=(no_type_id, any_id)) acc1, acc5 = corr1 / num_labels * 100, corr5 / num_labels * 100 # Log loss global_step += 1 wandb.log( { "epoch": epoch, "train/loss": loss.item(), "train/acc@1": acc1, "train/acc@5": acc5, "train/acc@1_any": acc1_any, "train/acc@5_any": 
acc5_any, "lr": scheduler.get_last_lr()[0], }, step=global_step, ) pbar.set_description(f"epoch {epoch} loss {loss.item():.4f}") # Evaluate logger.info(f"Evaluating model after epoch {epoch} ({global_step} steps)...") eval_metric, eval_metrics = _evaluate( model, eval_loader, sp, target_to_id=target_to_id, use_cuda=use_cuda, no_output_attention=no_output_attention) for metric, value in eval_metrics.items(): logger.info(f"Evaluation {metric} after epoch {epoch} ({global_step} steps): {value:.4f}") eval_metrics["epoch"] = epoch wandb.log(eval_metrics, step=global_step) # Save checkpoint if save_every and epoch % save_every == 0 or eval_metric < min_eval_metric: checkpoint = { "model_state_dict": model.module.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "epoch": epoch, "global_step": global_step, "config": config, "eval_metric": eval_metric, "min_eval_metric": min_eval_metric } if eval_metric < min_eval_metric: logger.info(f"New best evaluation metric: prev {min_eval_metric:.4f} > new {eval_metric:.4f}") min_eval_metric = eval_metric model_file = run_dir / f"ckpt_best.pth" else: model_file = run_dir / f"ckpt_ep{epoch:04d}.pth" logger.info(f"Saving checkpoint to {model_file}...") torch.save(checkpoint, str(model_file.resolve())) logger.info("Done.")
parser.add_argument('--policy-frequency', type=int, default=2, help="the frequency of training policy (delayed)") parser.add_argument('--noise-clip', type=float, default=0.5, help='noise clip parameter of the Target Policy Smoothing Regularization') args = parser.parse_args() if not args.seed: args.seed = int(time.time()) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, sync_tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True, save_code=True) writer = SummaryWriter(f"/tmp/{experiment_name}") # TRY NOT TO MODIFY: seeding device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') env = gym.make(args.gym_id) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic env.seed(args.seed) env.action_space.seed(args.seed) env.observation_space.seed(args.seed) # respect the default timelimit assert isinstance(env.action_space, Box), "only continuous action space is supported"
if n_gpu == 1: data_path = "./data" elif n_gpu == 4: data_path = "./data" save_im_path = "./g_z/" + run_name if n_gpu == 1: save_checkpoints_path = "./checkpoints/" + run_name elif n_gpu == 4: save_checkpoints_path = "/hpf/largeprojects/agoldenb/lechang/" + run_name # load_checkpoint = "/hpf/largeprojects/agoldenb/lechang/trained-1600.pth" load_checkpoint = "no" # restart wandb.init(project="mri_gan_cancer", name=run_name) parser = argparse.ArgumentParser() parser.add_argument('--batch-size', type=str, default=str(batch_size), metavar='N', help='') parser.add_argument('--lr', type=str, default=str(learning_rate), metavar='N', help='') parser.add_argument('--data_path', type=str, default=data_path,
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(f'Hyperparameters {hyp}') save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes names = ['item'] if opt.single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: # with torch_distributed_zero_first(rank): # attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round( hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [ ] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # 
https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in [-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) loggers = {'wandb': wandb} # loggers dict # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(model.stride.max()) # grid size (max stride) nl = model.model[ -1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, total_batch_size, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5)[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['cls'] *= nc / 80. # scale hyp['cls'] to class count hyp['obj'] *= imgsz**2 / 640.**2 * 3. 
/ nl # scale hyp['obj'] to image size and output layers model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test\n' 'Using %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs)) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward # loss, loss_items = compute_loss(pred, targets.to(device), model) # loss scaled by batch_size loss, loss_items = compute_loss_eiou( pred, targets.to(device), model) # 
loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 3 and wandb: wandb.log({ "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') ] }) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights' ]) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0) # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = [ 'train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers final = best if best.exists() else last # final model for f in [last, best]: if f.exists(): strip_optimizer(f) # strip optimizers if opt.bucket: 
os.system(f'gsutil cp {final} gs://{opt.bucket}/weights') # upload # Plots if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = [ 'results.png', 'precision_recall_curve.png', 'confusion_matrix.png' ] wandb.log({ "Results": [ wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists() ] }) if opt.log_artifacts: wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem) # Test best.pt logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) if opt.data.endswith('coco.yaml') and nc == 80: # if COCO for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]): # speed, mAP tests results, _, _ = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, conf_thres=conf, iou_thres=iou, model=attempt_load(final, device).half(), single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, save_json=save_json, plots=False) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
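# Two helpers referenced in the training script above (`one_cycle` for the LambdaLR schedule
# and `fitness`, the "weighted combination of [P, R, mAP@.5, mAP@.5:.95]") are not shown in
# this excerpt. A hedged sketch of both, under the assumption that they follow the shapes
# implied by their call sites:
import math
import numpy as np

def one_cycle(y1=0.0, y2=1.0, steps=100):
    # lambda for lr_scheduler.LambdaLR: cosine ramp from y1 to y2 over `steps` epochs
    return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1

def fitness(x):
    # x has rows of [P, R, mAP@0.5, mAP@0.5:0.95, ...]; the weights here are an assumption
    w = np.array([0.0, 0.0, 0.1, 0.9])
    return (x[:, :4] * w).sum(1)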
print('cropSize: ' + str(params['cropSize'])) params['imgSize'] = params['cropSize'] # Use CUDA os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id use_cuda = torch.cuda.is_available() # args.gpu_id = os.getenv('CUDA_VISIBLE_DEVICES') print(args.gpu_id) import visdom vis = None if args.visdom: vis = visdom.Visdom(server=args.server, port=8095, env='main_davis_viz1') vis.close() import wandb wandb.init(project='palindromes') vis.close() # Random seed if args.manualSeed is None: args.manualSeed = random.randint(1, 10000) random.seed(args.manualSeed) torch.manual_seed(args.manualSeed) if use_cuda: torch.cuda.manual_seed_all(args.manualSeed) class Wrap(nn.Module): def __init__(self, model): super(Wrap, self).__init__() self.model = model
name = run_name artifact = wandb.Artifact(f'{name}-model', 'model') for f in os.listdir(path): if f.startswith('wandb-'): continue # noqa: 701 if f == 'output.log': continue # noqa: 701 if f == 'requirements.txt': continue # noqa: 701 if f.startswith('events.'): continue # noqa: 701 if os.path.isdir(os.path.join(path, f)): continue # noqa: 701 artifact.add_file(os.path.join(path, f), f) wandb.run.log_artifact(artifact, aliases=['latest', run_name]) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('run') parser.add_argument('--name', default=None, help='artifact name') args = parser.parse_args() run = args.run root = pull_model(run) setup_logging() logger = logging.getLogger() logger.info('publishing artifact') wandb.init(resume=run) publish_model(root, args.name) logger.info('model published')
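# Counterpart to the publishing script above: a sketch of fetching the logged model artifact
# back in a later run. The project and artifact names here are placeholders, not values taken
# from this script.
import wandb

run = wandb.init(project="my-project", job_type="download-model")
artifact = run.use_artifact("my-run-model:latest")  # '<name>-model' as constructed above
model_dir = artifact.download()                     # local directory containing the saved files
run.finish()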
def test_resume_auto_success(live_mock_server, test_settings):
    run = wandb.init(reinit=True, resume=True, settings=test_settings)
    run.join()
    assert not os.path.exists(test_settings.resume_fname)
import tensorflow as tf # pylint: disable=no-name-in-module from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, Lambda, MaxPooling2D, Reshape, Input, CuDNNGRU, TimeDistributed from tensorflow.keras.models import Sequential, Model from tensorflow.keras.callbacks import ModelCheckpoint import tensorflow.keras.backend as K from datasets import LinesDataset, Generator from util import ctc_decode, format_batch_ctc, slide_window, ExampleLogger import wandb wandb.init() wandb.config.model = "cnn" wandb.config.window_width = 14 wandb.config.window_stride = 7 # Load our dataset dataset = LinesDataset(subsample_fraction=1) dataset.load_or_generate_data() image_height, image_width = dataset.input_shape output_length, num_classes = dataset.output_shape model = Sequential() model.add( Reshape((image_height, image_width, 1), input_shape=dataset.input_shape)) model.add(Conv2D(32, kernel_size=(3, 3), activation='relu')) model.add(Conv2D(64, (3, 3), activation='relu')) model.add(MaxPooling2D()) model.add(Dropout(0.3)) # We are going to use a Conv2D to slide over these outputs with window_width and window_stride, # and output softmax activations of shape (output_length, num_classes)./ # In your calculation of the necessary filter size,
def test_resume_must_failure(live_mock_server, test_settings):
    with pytest.raises(wandb.Error) as e:
        wandb.init(reinit=True, resume="must", settings=test_settings)
    assert "resume='must' but run" in e.value.message
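# The two tests above exercise wandb's resume behaviour: resume=True picks up a previous run
# automatically when local resume state exists, while resume="must" insists on a matching run
# id and errors out otherwise. A sketch of explicitly resuming a known run (the id is hypothetical):
import wandb

run = wandb.init(project="my-project", id="3q1xw5za", resume="must")
run.log({"resumed": True})
run.finish()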
import gym.wrappers from keras.models import Sequential from keras.layers import Dense, Activation, Flatten from keras.optimizers import Adam from rl.agents.dqn import DQNAgent from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy from rl.memory import SequentialMemory from rl.callbacks import Callback import random import wandb ENV_NAME = 'LunarLander-v2' wandb.init(project="KerasDQN", name="Performance") env = gym.make(ENV_NAME) # To get repeatable results. sd = 16 np.random.seed(sd) random.seed(sd) env.seed(sd) nb_actions = env.action_space.n env = gym.wrappers.Monitor(env, './monitor', force=True) model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) model.add(Dense(40)) model.add(Activation('relu'))
os.makedirs('./results/' + dset + '/global/3days') if not os.path.exists('./models/' + dset + '_models'): os.makedirs('./models/' + dset + '_models') if HORIZON == 24: proj_name = 'dayahead' if not os.path.exists('./results/' + dset + '/global/dayahead'): os.makedirs('./results/' + dset + '/global/dayahead') if not os.path.exists('./models/' + dset + '_models'): os.makedirs('./models/' + dset + '_models') # 1️⃣ Start a new run, tracking config metadata run = wandb.init(project=proj_name, config={ 'layers': LAYERS, 'dropout': DROPOUT, 'neurons': NEURONS, 'learning rate': LR, 'batch_size': BATCHSIZE, "architecture": "global", "dataset": dset, "epochs": MAX_EPOCHS, 'patience': PATIENCE }) config = wandb.config # full data LSTM MIMO compilation and fit LSTMIMO = build_model(l=LAYERS, drop=DROPOUT, n=NEURONS, lr=LR) early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=PATIENCE, mode='min') history = LSTMIMO.fit(global_inputs_X,
def __init__( self, up_model: nn.Module, down_layer: nn.Module = None, train_dataset=None, dev_dataset=None, dev_evaluator=None, epochs: int = 1, visiable_device: str = "0", scheduler: str = 'warmuplinear', warmup_ratio: float = 0.1, optimizer_class: Type[Optimizer] = transformers.AdamW, optimizer_params: Dict[str, object] = { 'lr': 5e-5, 'eps': 1e-6, 'correct_bias': False }, weight_decay: float = 0.01, early_stop: int = 20, # 20 evaluation steps without improving on the early_stop_on metric as specified in dev_evaluator evaluation_steps: int = 500, output_path: str = None, save_best_model: bool = True, max_grad_norm: float = 1, fp16: bool = False, accumulation_steps=1, fp16_opt_level: str = 'O1', seed: int = 122, data_loader_shuffle=True, device: str = None, dev_batch_size: int = -1, # the same as train_batch_size n_gpu: int = None, report_model: bool = True, per_gpu_train_batch_size: int = 8, restore_training: bool = False, local_rank: int = -1, wandb_config=None): """ this trainer is written for training a sequential model that contains an upstream_layer (usually transformers) and a downstream_layer (usually task-specific heads like FF, RNN, CNN for encoding the output of upstram_layer) :param up_model: transformers like transformers.GPT2LMHeadModel or transformers.BERTModel :param down_layer: None if up_model already wraps up with an output encoder such as LMHead in GPT2LMHeadModel, else nn.Module for encoding the output of up_model :param train_dataset: train_dataset, it can be either instance of torch.data.Dataset or IterableDataset (defined in data.py) :param dev_dataset: dev_dataset, it can be either instance of torch.data.Dataset or IterableDataset :param dev_evaluator: dev_evaluator, evaluator on dev_dataset for early stop and performance tracking during training (defined in evaluate.py) :param epochs: number of epoches for training :param visiable_device: devices chosen to perform training :param scheduler: scheduler specially from transformers: see options in self._get_scheduler :param warmup_ratio: warmup_ratio ratio for learning rate over total training steps :param optimizer_class: transformers.AdamW de byfault :param optimizer_params: optimizer params :param weight_decay:weight decay :param early_stop:early stop steps :param evaluation_steps:logging steps :param output_path: path to save the checkpoint with the best performance as specified in early_stop_on in dev_evaluator instance :param save_best_model:save best checkpoint or the latest checkpoint :param max_grad_norm:max grad norm :param fp16: fp16 training :param accumulation_steps:accumulation steps :param fp16_opt_level:fp16 opt level :param seed:random seed for reproducibility :param data_loader_shuffle:Whether to shuffle data_loader of training dataset and dev dataset after epoch ends :param device: device for training, None or gpu for gpu training, cpu for gpu training :param dev_batch_size: development batch size, usually larger than training batch size due to no grads calculation and hence less burden on memory :param n_gpu: number of gpus for training :param report_model:if report model's structure and number of trainable params in logging :param per_gpu_train_batch_size: what it means literally :param restore_training: if restore training if the training process is interupped due to some accidents :param local_rank:for distributed training :param wandb_config: wandb logging if not none, else without wandb logging """ self.up_model = up_model if down_layer == None: # In this example, the upstream_layer already 
integrate the downstream head (namely, simple LM head as in transformers.GPT2LMHeadModel) # EmptyHeads is created here only for placeholder purpose down_layer = EmptyHeads() self.down_layer = down_layer assert output_path != None output_path = os.path.join("tmp", output_path) # os.makedirs(output_path,exist_ok=True) if restore_training: if not os.listdir(output_path): raise ValueError(f"no checkpoint found in {output_path}") else: logger.info( " loading embedding weights from saved checkpoint") self.up_model = self.up_model.reload( output_path ) # for other transformers (apart from bert), the load_saved function has not been added logger.info( " loading downstream weights from saved checkpoint") self.down_layer.load_saved(output_path) with open(output_path + "/ck_report.json") as f: self.ck_report = json.load(f) self.model = torch.nn.Sequential(self.up_model, self.down_layer) if is_wandb_available() and wandb_config != None: # keep track of model topology and gradients if is_wandb_available and args!=None wandb.init(project=wandb_config.wandb_project_name, config=wandb_config, name=wandb_config.wandb_run_name) wandb.watch((self.up_model, self.down_layer), log_freq=max(100, evaluation_steps)) self.wandb_config = wandb_config self._restore_training = restore_training self.early_stop = early_stop self._dev_evaluator = dev_evaluator self._evaluation_steps = evaluation_steps self._save_best_model = save_best_model self._max_grad_norm = max_grad_norm os.makedirs(output_path, exist_ok=True) if os.listdir(output_path) and not restore_training: out = input( "Output directory ({}) already exists and is not empty, you wanna remove it before start? (y/n)" .format(output_path)) if out == "y": shutil.rmtree(output_path) os.makedirs(output_path, exist_ok=True) else: raise ValueError( "Output directory ({}) already exists and is not empty". 
format(output_path)) logFormatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s") fileHandler = logging.FileHandler(os.path.join(output_path, "log.out"), mode="a") fileHandler.setFormatter(logFormatter) logger.addHandler(fileHandler) self._dev_evaluator.reset_logger(output_path) self.output_path = output_path if device is None or device == "cuda": if torch.cuda.is_available(): device = torch.device("cuda") n_gpu = 1 if n_gpu == 1 else torch.cuda.device_count() else: logger.warning("no cuda is found in your machine, now use cpu") device = torch.device("cpu") n_gpu = 0 elif device == "cpu": device = torch.device("cpu") n_gpu = 0 else: raise ValueError("set device to be None, cuda or cpu") assert n_gpu <= torch.cuda.device_count() logger.info("Use pytorch device: {}, with gpu_number={}".format( device, n_gpu)) self._train_batch_size = per_gpu_train_batch_size * max(1, n_gpu) self._dev_batch_size = dev_batch_size if dev_batch_size != -1 else self._train_batch_size if isinstance(train_dataset, data.IterableDataset): self._train_dataloader = DataLoader(train_dataset, batch_size=None) self._steps_per_epoch = len(self._train_dataloader.dataset) else: self._train_dataloader = DataLoader( train_dataset, shuffle=data_loader_shuffle, batch_size=self._train_batch_size) self._steps_per_epoch = len(self._train_dataloader) if isinstance(dev_dataset, data.IterableDataset): dev_dataloader = DataLoader(dev_dataset, batch_size=None) else: dev_dataloader = DataLoader(dev_dataset, shuffle=data_loader_shuffle, batch_size=self._dev_batch_size) if accumulation_steps > 1: self._steps_per_epoch = self._steps_per_epoch // accumulation_steps self._dev_data = dev_dataset self._dev_evaluator.reset_dataloader(dev_dataloader) self.collate_fn = CollateFunction(self.up_model) # Use customize batching self._train_dataloader.collate_fn = self.collate_fn self._train_data = train_dataset self._per_gpu_train_batch_size = per_gpu_train_batch_size set_seed(seed, n_gpu) if n_gpu > 1: self.model = torch.nn.DataParallel( self.model, device_ids=[int(i) for i in visiable_device.split(',')]) self.model = self.model.to(f'cuda:{self.model.device_ids[0]}') elif n_gpu == 1: self.model = self.model.to(device) self._device = device self._n_gpu = n_gpu self._total_train_steps = int(self._steps_per_epoch * epochs) self._epochs = epochs if report_model: count_params(self.model, print_details=True) param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if local_rank != -1: self._total_train_steps = self._total_train_steps // torch.distributed.get_world_size( ) self._optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) warmup_steps = math.ceil( self._total_train_steps * warmup_ratio) # by default 20% of train data for warm-up logger.info(f" Warmup-steps: {warmup_steps}") self._scheduler = self._get_scheduler( self._optimizer, scheduler=scheduler, warmup_steps=warmup_steps, num_total=self._total_train_steps) if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(self.model, self._optimizer, opt_level=fp16_opt_level) self.model = model self._optimizer = optimizer self._fp16 = fp16 tb_writer = None if local_rank in [-1, 0]: tb_writer = SummaryWriter() self._tb_writer = tb_writer self._local_rank = local_rank self._best_score = -float("inf") self._early_stop_count = 0 self.last_time = datetime.now() self.accumulation_steps = accumulation_steps
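# The constructor above defers to a `_get_scheduler` method that is not part of this excerpt.
# A hedged sketch of how such a method could map the string options (e.g. 'warmuplinear') onto
# the schedule helpers shipped with transformers; the exact set of names is an assumption:
import transformers

def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, num_total: int):
    scheduler = scheduler.lower()
    if scheduler == "constantlr":
        return transformers.get_constant_schedule(optimizer)
    if scheduler == "warmupconstant":
        return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
    if scheduler == "warmuplinear":
        return transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_total)
    if scheduler == "warmupcosine":
        return transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_total)
    raise ValueError(f"Unknown scheduler: {scheduler}")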
fps=29) optimize('/tmp/current_gif.gif') return visited_pos, visited_vel, acts, means, stds, vals def net_layers(hidden): if env_type == 'DISCRETE': act_space = env.action_space.n else: act_space = env.action_space.shape[0] obs_space = env.observation_space.shape[0] return [obs_space] + hidden + [act_space] wandb.init(entity="agkhalil", project="pytorch-ac-mountaincarcont") wandb.watch_called = False config = wandb.config config.batch_size = 50 config.episodes = 10000 config.lr_ac = 0.005 config.lr_cr = 0.00005 config.seed = 42 config.gamma = 0.99 eps = np.finfo(np.float32).eps.item() device = torch.device('cpu') torch.manual_seed(config.seed) lr_ac = config.lr_ac lr_cr = config.lr_cr
def __init__(self, **kwargs):
    """Run `wandb init` in the terminal before using this reporter."""
    super(WandbReporting, self).__init__(**kwargs)
    wandb.init()
def _init(self):
    self._config = None
    wandb.init(**self.config.get("env_config", {}).get("wandb", {}))
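# Illustration of the nested config this `_init` expects: anything under env_config.wandb is
# forwarded verbatim to wandb.init. The keys and values below are hypothetical.
config = {
    "env_config": {
        "wandb": {
            "project": "rl-experiments",  # assumed project name
            "group": "baseline",
            "reinit": True,
        }
    }
}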