def main(argv):
    """Train the basic DMoE CNN on MNIST.

    Reads all hyper-parameters from the module-level ``FLAGS`` object,
    builds train/validation/test generators from the Keras MNIST data,
    and delegates the training loop to ``ModelTrainer``.
    """
    # SLURM array jobs publish their index via the environment;
    # fall back to 0 when running outside an array job.
    try:
        task_id = int(os.environ['SLURM_ARRAY_TASK_ID'])
    except KeyError:
        task_id = 0

    model_save_dir = FLAGS.model_dir
    data_dir = FLAGS.data_dir
    print("Saving model to : " + str(model_save_dir))
    print("Loading data from : " + str(data_dir))
    test_data_dir = data_dir
    train_data_dir = data_dir

    # Hyper-parameters from the command line.
    epochs = FLAGS.epochs
    dropout_rate = FLAGS.dropout_rate
    weight_decay = FLAGS.weight_decay
    lr = FLAGS.learning_rate
    load_model = FLAGS.load_model

    # Encode the hyper-parameters into the save directory name so runs
    # with different settings do not overwrite each other.
    model_save_dir += "_dropout_rate_" + str(dropout_rate) \
        + "_learning_rate_" + str(lr) \
        + "_weight_decay_" + str(weight_decay)

    model = basic_dmoe_cnn.basic_dmoe_cnn_mnist()

    train_data, test_data = tf.keras.datasets.mnist.load_data()
    # Validation split is carved out of the training set by skipping the
    # first NUM_IMAGES["train"] samples.
    train_data_gen = data_generator(train_data, BATCH_SIZE, is_training=True,
                                    take_n=NUM_IMAGES["train"])
    val_data_gen = data_generator(train_data, 100, is_training=False,
                                  is_validation=True,
                                  skip_n=NUM_IMAGES["train"],
                                  take_n=NUM_IMAGES["validation"])
    test_data_gen = data_generator(test_data, 100, is_training=False)

    trainer = ModelTrainer(model, train_data_gen, val_data_gen, test_data_gen,
                           epochs,
                           learning_rate_fn=learning_rate_fn,
                           optimizer=tf.keras.optimizers.Adam,
                           num_train_batches=int(NUM_IMAGES["train"] / BATCH_SIZE),
                           base_learning_rate=lr,
                           load_model=load_model,
                           save_dir=model_save_dir,
                           # Dummy batch used to build the model's variables.
                           init_data=tf.random.normal([BATCH_SIZE, 28, 28, 1]),
                           start_epoch=0)
    trainer.train()
# NOTE(review): whitespace-mangled one-line fragment. It contains an `else:`
# whose matching `if` lies before this chunk (presumably something like
# `if previous_training_path:` — TODO confirm against the full file), so the
# code is left byte-identical.
# What it does: lists TensorFlow checkpoint `.meta` files under
# `<previous_training_path>/snapshots`, parses the trailing step number from
# each name ("snap-<step>.meta"), sorts the steps and picks index `step_ind`
# (defined upstream — TODO confirm), builds the matching `snap-<step>` path to
# restore, then constructs a ModelTrainer with that snapshot (or None in the
# `else:` branch) and starts training.
snap_path = os.path.join(previous_training_path, 'snapshots') snap_steps = [ int(f[:-5].split('-')[-1]) for f in os.listdir(snap_path) if f[-5:] == '.meta' ] # Find which snapshot to restore chosen_step = np.sort(snap_steps)[step_ind] chosen_snap = os.path.join(previous_training_path, 'snapshots', 'snap-{:d}'.format(chosen_step)) else: chosen_snap = None # Create a trainer class trainer = ModelTrainer(model, restore_snap=chosen_snap) t2 = time.time() print('\n----------------') print('Done in {:.1f} s'.format(t2 - t1)) print('----------------\n') ################ # Start training ################ print('Start Training') print('**************\n') trainer.train(model, dataset)
# Build the KPFCNN segmentation network and hand it to a trainer.
t1 = time.time()
net = KPFCNN(config, training_dataset.label_values, training_dataset.ignored_labels)

# Optional inspection of the network layout and trainable parameters;
# disabled by default.
debug = False
if debug:
    print('\n*************************************\n')
    print(net)
    print('\n*************************************\n')
    for param in net.parameters():
        if param.requires_grad:
            print(param.shape)
    print('\n*************************************\n')
    print("Model size %i" % sum(param.numel() for param in net.parameters() if param.requires_grad))
    print('\n*************************************\n')

# Trainer, optionally resuming from a checkpoint chosen upstream.
trainer = ModelTrainer(net, config, chkp_path=chosen_chkp)
print('Done in {:.1f}s\n'.format(time.time() - t1))

print('\nStart training')
print('**************')

# Run the training loop.
trainer.train(net, training_loader, test_loader, config)

# Hard shutdown: SIGINT ourselves so worker processes are torn down.
print('Forcing exit now')
os.kill(os.getpid(), signal.SIGINT)
# Test the input pipeline alone with this debug function
# dataset.check_input_pipeline_timing(config)

##############
# Define Model
##############

print('Creating Model')
print('**************\n')
t1 = time.time()

# Instantiate the network from the dataset's flat input tensors.
model = KernelPointFCNN(dataset.flat_inputs, config)

# Trainer; pass restore_snap=... to resume from a saved snapshot instead.
trainer = ModelTrainer(model)
# trainer = ModelTrainer(model, restore_snap='results_kitti/Log_/snapshots/snap-')

t2 = time.time()
print('\n----------------')
print('Done in {:.1f} s'.format(t2 - t1))
print('----------------\n')

################
# Start training
################

print('Start Training')
print('**************\n')
print('*****************')

# Define network model
t1 = time.time()
net = KPFCNN(config, training_dataset.label_values, training_dataset.ignored_labels)

# Set to True to dump the module tree and every trainable tensor shape.
debug = False
if debug:
    print('\n*************************************\n')
    print(net)
    print('\n*************************************\n')
    for param in net.parameters():
        if param.requires_grad:
            print(param.shape)
    print('\n*************************************\n')
    print("Model size %i" % sum(param.numel() for param in net.parameters() if param.requires_grad))
    print('\n*************************************\n')

# Trainer, optionally resuming from the checkpoint chosen upstream.
trainer = ModelTrainer(net, config, chkp_path=chosen_chkp)
print('Done in {:.1f}s\n'.format(time.time() - t1))

print('\nStart training')
print('**************')

# Train with a wall-clock budget of 36000 s (10 h).
trainer.train(net, training_loader, test_loader, config, time_limit=36000)

# Hard shutdown: SIGINT ourselves so worker processes are torn down.
print('Forcing exit now')
os.kill(os.getpid(), signal.SIGINT)
# Find all snapshot in the chosen training folder snap_path = os.path.join(args.saving_path, 'snapshots') snap_steps = [ int(f[:-5].split('-')[-1]) for f in os.listdir(snap_path) if f[-5:] == '.meta' ] # Find which snapshot to restore if args.snap == -1: chosen_step = np.sort(snap_steps)[args.snap] else: chosen_step = args.snap + 1 chosen_snap = os.path.join(args.saving_path, 'snapshots', 'snap-{:d}'.format(chosen_step)) trainer = ModelTrainer(model, chosen_snap) else: trainer = ModelTrainer(model) t2 = time.time() print('\n----------------') print('Done in {:.1f} s'.format(t2 - t1)) print('----------------\n') ################ # Start training ################ print('Start Training') print('**************\n')
def main(argv):
    """Build the classifier/generator/discriminator trio, train via
    ``ModelTrainer``, and dump the training-set predictions to disk.

    All hyper-parameters come from the module-level ``FLAGS`` object.
    """
    # SLURM array jobs publish their index via the environment;
    # fall back to 0 when running outside an array job.
    try:
        task_id = int(os.environ['SLURM_ARRAY_TASK_ID'])
    except KeyError:
        task_id = 0

    model_save_dir = FLAGS.model_dir
    data_dir = FLAGS.data_dir
    print("Saving model to : " + str(model_save_dir))
    print("Loading data from : " + str(data_dir))
    test_data_dir = data_dir
    train_data_dir = data_dir

    # Hyper-parameters from the command line.
    epochs = FLAGS.epochs
    batch_size = FLAGS.batch_size
    dropout_rate = FLAGS.dropout_rate
    weight_decay = FLAGS.weight_decay
    lr = FLAGS.learning_rate
    load_model = FLAGS.load_model
    training_percentage = FLAGS.training_percentage
    preload_samples = FLAGS.preload_samples

    ds = Dataset(data_dir, is_training_set=True)
    n_total = ds.n_samples

    def augment_fn(sample, training):
        # Closure so the generator can augment with the dataset's class count.
        return augment_input(sample, ds.n_classes, training)

    dg = DataGenerator(ds, augment_fn,
                       training_percentage=training_percentage,
                       preload_samples=preload_samples,
                       save_created_features=False,
                       max_samples_per_audio=99,
                       is_training=True)

    # Train/validation split sizes.
    n_train = int(n_total * training_percentage / 100)
    n_val = n_total - n_train

    # ResNet 18
    classifier_model = Classifier(ResBlockBasicLayer,
                                  n_blocks=4,
                                  n_layers=[2, 2, 2, 2],
                                  strides=[2, 2, 2, 2],
                                  channel_base=[64, 128, 256, 512],
                                  # +1 class, presumably for the "fake" label
                                  # produced by the generator — TODO confirm.
                                  n_classes=ds.n_classes + 1,
                                  init_ch=64,
                                  init_ksize=7,
                                  init_stride=2,
                                  use_max_pool=True,
                                  kernel_regularizer=tf.keras.regularizers.l2(2e-4),
                                  kernel_initializer=tf.keras.initializers.he_normal(),
                                  name="classifier",
                                  dropout=dropout_rate)

    # Generator model used to augment to false samples
    generator_model = Generator(8, [8, 8, 16, 16, 32, 32, 64, 64],
                                kernel_regularizer=tf.keras.regularizers.l2(2e-4),
                                kernel_initializer=tf.keras.initializers.he_normal(),
                                name="generator")

    # Discriminator for estimating the Wasserstein distance
    discriminator_model = Discriminator(3, [32, 64, 128], [4, 4, 4],
                                        name="discriminator")

    data_gen = data_generator(dg.generate_all_samples_from_scratch, batch_size,
                              is_training=True,
                              n_classes=ds.n_classes)

    # One settings dict per model; note the generator/discriminator use Adam
    # with scaled-down learning rates while the classifier uses SGD.
    dummy_batch = [batch_size, BINS, N_FRAMES, N_CHANNELS]
    trainer = ModelTrainer(data_gen, None, None, epochs, EvalFunctions,
                           model_settings=[
                               {'model': classifier_model,
                                'optimizer_type': tf.keras.optimizers.SGD,
                                'base_learning_rate': lr,
                                'learning_rate_fn': learning_rate_fn,
                                'init_data': tf.random.normal(dummy_batch)},
                               {'model': generator_model,
                                'optimizer_type': tf.keras.optimizers.Adam,
                                'base_learning_rate': lr * 0.0001,
                                'learning_rate_fn': learning_rate_fn,
                                'init_data': tf.random.normal(dummy_batch)},
                               {'model': discriminator_model,
                                'optimizer_type': tf.keras.optimizers.Adam,
                                'base_learning_rate': lr * 0.002,
                                'learning_rate_fn': learning_rate_fn,
                                'init_data': tf.random.normal(dummy_batch)}],
                           summaries=None,
                           num_train_batches=int(n_train / batch_size),
                           load_model=load_model,
                           save_dir=model_save_dir,
                           input_keys=["input_features", "false_sample"],
                           label_keys=["labels"],
                           start_epoch=0)

    all_predictions = trainer.predict_dataset(data_gen)
    np.save(os.path.join(data_dir, "train_set_predictions.npy"), all_predictions)
# Calibrate samplers
test_sampler.calibration(test_loader)

# debug_timing(test_dataset, test_sampler, test_loader)
# debug_show_clouds(training_dataset, training_sampler, training_loader)

print('\nModel Preparation')
print('*****************')

# Define network model, wrapped for multi-GPU data parallelism.
t1 = time.time()
net = KPCNN(config)
net = torch.nn.DataParallel(net)

# Define a trainer class (chkp_path chosen upstream; may be None).
trainer = ModelTrainer(net, config, chkp_path=chosen_chkp)
print('Done in {:.1f}s\n'.format(time.time() - t1))

print('\nStart training')
print('**************')

# Testing
try:
    # trainer.train(net, training_loader, test_loader, config)
    trainer.validation(net, val_loader=test_loader, config=config)
except Exception:
    # FIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt. Narrowed to Exception; behavior on ordinary
    # errors is unchanged (report, then SIGINT ourselves so any data
    # loader worker processes are torn down).
    print('Caught an error')
    os.kill(os.getpid(), signal.SIGINT)

print('Forcing exit now')
os.kill(os.getpid(), signal.SIGINT)