def main():
    tracking.init()

    for i in range(args.steps):
        logger.info('Step %s', i)

        # Scalars
        loss = get_loss(i)
        accuracy = get_accuracy(loss)

        # Training metrics, but don't commit the step.
        tracking.log_metrics(step=i, loss=loss, accuracy=accuracy)

        # Validation metrics, which could be reported in another part of the code.
        if i % args.validate_every == 0:
            tracking.log_metric(name='val_acc', value=accuracy - 0.05, step=i)

        # Distribution
        tracking.log_histogram('distribution', get_dist(i), 'auto', step=i)

        # Text
        tracking.log_text('text-ex', text=get_text(i), step=i)

        # Images
        log_images(i)

        # HTML
        tracking.log_html('html-ex', html=get_html(i), step=i)

        # Generate a sine wave as audio
        tracking.log_audio(data=get_audio(i), name='audio', step=i)

        time.sleep(0.25)

    plot_scatter(100)
    get_sin_plot(100)
    plot_mpl_figure(100)
    log_bokeh(100)
    log_altair(100)
    log_curves(100)
    log_plotly(100)
    train_network()
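# A minimal sketch of the scalar helpers the demo loop above assumes; the bodies
# below are hypothetical placeholders, not the demo's actual implementations.
import numpy as np

def get_loss(step):
    # Hypothetical: a decaying loss curve with a little noise.
    return 1.0 / (step + 1) + np.random.uniform(0, 0.05)

def get_accuracy(loss):
    # Hypothetical: accuracy rises as loss falls, clipped to [0, 1].
    return max(0.0, min(1.0, 1.0 - loss))

def get_dist(step):
    # Hypothetical: samples whose spread tightens over the run.
    return np.random.normal(0, 1.0 / (step + 1), size=1000)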
def experiment(self) -> tracking.Run:
    if self._experiment:
        return self._experiment
    tracking.init(
        owner=self._owner,
        project=self._project,
        run_uuid=self._run_uuid,
        client=self._client,
        track_code=self._track_code,
        track_env=self._track_env,
        refresh_data=self._refresh_data,
        artifacts_path=self._artifacts_path,
        collect_artifacts=self._collect_artifacts,
        collect_resources=self._collect_resources,
        is_offline=self._is_offline,
        is_new=self._is_new,
        name=self._name,
        description=self._description,
        tags=self._tags,
    )
    self._experiment = tracking.TRACKING_RUN
    return self._experiment
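# Usage sketch for the cached accessor above, assuming it lives on a hypothetical
# wrapper class (here `PolyaxonWrapper`) and is exposed as a plain method: the
# first call initializes tracking, and later calls return the same Run instance.
wrapper = PolyaxonWrapper(project='demo')  # hypothetical constructor
run = wrapper.experiment()                 # triggers tracking.init() once
assert run is wrapper.experiment()         # cached on subsequent calls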
    default='adam'
)
parser.add_argument(
    '--log_learning_rate',
    type=int,
    default=-3
)
parser.add_argument(
    '--epochs',
    type=int,
    default=1
)
args = parser.parse_args()

# Polyaxon
tracking.init()

logger.info('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=args.max_features, skip_top=args.skip_top)
logger.info('train sequences %s', len(x_train))
logger.info('test sequences %s', len(x_test))

# Polyaxon
tracking.log_data_ref(content=x_train, name='x_train')
tracking.log_data_ref(content=y_train, name='y_train')
tracking.log_data_ref(content=x_test, name='x_test')
tracking.log_data_ref(content=y_test, name='y_test')

logger.info('Transforming data...')
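# The excerpt above joins an argparse definition already in progress; a minimal
# sketch of the assumed preamble. The flag name '--optimizer' is hypothetical
# (inferred from default='adam'), and the default values here are illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--max_features', type=int, default=20000)
parser.add_argument('--skip_top', type=int, default=30)
parser.add_argument('--optimizer', type=str, default='adam')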
def main(): """ Runs dataLayer processing scripts to turn raw dataLayer from (../raw) into cleaned dataLayer ready to be analyzed (saved in ../processed). """ ## Talk to Rune about how dataLayer is handle. config = TrainingConfig() # config = update_config(args,config) ## For polyaxon config.epochs = 501 config.run_polyaxon = True config.batch_size = 8 config.lr = 0.0002 config.save_model_step = 100 config.n_critic = 2 config.model_name = 'PartialConvolutionsWgan' # Test parametre vi kører med, som normalt sættes i experiments if config.run_polyaxon: # The POLYAXON_NO_OP env variable had to be set before any Polyaxon imports were allowed to happen from polyaxon import tracking tracking.init() input_root_path = Path( r'/data/inpainting/data_landset8/Test_dataset/Betaset') cache_path = Path('/cache') output_root_path = Path(tracking.get_outputs_path()) pathToData = input_root_path ## Delete later HACK inpainting_data_path = input_root_path / 'inpainting' # Set PyTorch to use the data directory for caching pre-trained models. If this is not done, each experiment # will download the pre-trained model and store it in each individual experiment container, thereby wasting # large amounts of disk space. # Code is from here: https://stackoverflow.com/a/52784628 os.environ['TORCH_HOME'] = str( cache_path / 'pytorch_cache') # setting the environment variable config.output_path = Path(os.getcwd()).joinpath('outputs') config.data_path = Path(r'/data/inpainting/') config.polyaxon_tracking = tracking if not config.run_polyaxon: os.environ['POLYAXON_NO_OP'] = 'true' # Setup Polyaxon (import must be done here as the POLYAXON_NO_OP variable was set inside Python) beta_test_path_list = glob(str(pathToData) + "/*/") # S1A_20201005_034656_DSC_109_RGBsar_cog.tif # S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B02_cog # S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B03_cog.tif # S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B04_cog.tif logger = logging.getLogger(__name__) logger.info('making final dataLayer set from raw dataLayer') logger.info(pathToData) ImageDict = get_dataset(beta_test_path_list, batch_size=config.batch_size) train = ImageDict['train_dataloader'] test = ImageDict['test_dataloader'] # Kører på WGAN GP if config.model_name == 'PartialConvolutions': curtraingModel = trainInpaintingWgan(train, test, generator, criticWgan, config) local_model_path = curtraingModel.trainGAN() elif config.model_name == 'PartialConvolutionsWgan': curtraingModel = trainInpaintingWgan(train, test, generator, criticWgan, config) local_model_path = curtraingModel.trainGAN() # local_model_path = Path(r"C:\Users\panda\PycharmProjects\Image_Inpainting_Sat\Master_Satelite_Image_Inpainting\OutputModels\PartialConvolutionsWgan_200.pt") if config.run_polyaxon: model_path = inpainting_data_path / 'models' modelOutputPath = Path.joinpath(model_path, 'OutputModels') stores_output_path = config.output_path / 'data' / 'storedData' else: localdir = Path().absolute().parent modelOutputPath = Path.joinpath(localdir, 'OutputModels') stores_output_path = localdir / 'data' / 'storedData' curevalModel = eval_model(config) curevalModel.run_eval(modelOutputPath, stores_output_path, model_path=local_model_path, test_dataloader=test)
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race condition among
    # the workers that share the same filesystem. If the directory already
    # exists by the time this worker gets around to creating it, ignore the
    # resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, we need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing the batch size.
    # If used with NCCL, scale lr by local_size.
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(0.001 * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = None
    if hvd.rank() == 0:
        tracking.init()
        checkpoint_dir = tracking.get_outputs_path()

    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
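# One possible implementation of the batch generator the session loop consumes,
# modeled on the upstream Horovod MNIST example; treat it as a sketch rather
# than the exact source.
import numpy as np

def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle at the start of every epoch so each pass sees a new order.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield (x_train[index:index + batch_size],
                   y_train[index:index + batch_size])
            index += batch_size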