def train_multidec(args): print("Training multidec") device = torch.device(args.gpu) print("Loading dataset...") full_dataset = load_multi_csv_data(args, CONFIG) print("Loading dataset completed") # full_loader = DataLoader(full_dataset, batch_size=args.batch_size, shuffle=False) image_encoder = MDEC_encoder(input_dim=args.input_dim, z_dim=args.latent_dim, n_clusters=args.n_clusters, encodeLayer=[500, 500, 2000], activation="relu", dropout=0) image_encoder.load_model(os.path.join(CONFIG.CHECKPOINT_PATH, "image_sdae_" + str(args.latent_dim)) + ".pt") text_encoder = MDEC_encoder(input_dim=args.input_dim, z_dim=args.latent_dim, n_clusters=args.n_clusters, encodeLayer=[500, 500, 2000], activation="relu", dropout=0) text_encoder.load_model(os.path.join(CONFIG.CHECKPOINT_PATH, "text_sdae_" + str(args.latent_dim)) + ".pt") mdec = MultiDEC(device=device, image_encoder=image_encoder, text_encoder=text_encoder, n_clusters=args.n_clusters) exp = Experiment("MDEC " + str(args.latent_dim) + '_' + str(args.n_clusters), capture_io=True) print(mdec) for arg, value in vars(args).items(): exp.param(arg, value) try: mdec.fit(full_dataset, lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, save_path=CONFIG.CHECKPOINT_PATH) print("Finish!!!") finally: exp.end()
def pretrain_ddec(args): print("Pretraining...") print("Loading dataset...") with open(os.path.join(args.text_embedding_dir, 'word_embedding.p'), "rb") as f: embedding_model = cPickle.load(f) with open(os.path.join(args.text_embedding_dir, 'word_idx.json'), "r", encoding='utf-8') as f: word_idx = json.load(f) train_dataset, test_dataset = load_pretrain_data(args.image_dir, word_idx[1], args, CONFIG) print("Loading dataset completed") dualnet = DualNet(pretrained_embedding=embedding_model, text_features=args.text_features, z_dim=args.z_dim, n_classes=args.n_classes) if args.resume: print("loading model...") dualnet.load_model("/4TBSSD/CHECKPOINT/pretrain_" + str(args.z_dim) + "_0.pt") exp = Experiment("Dualnet_pretrain_" + str(args.z_dim), capture_io=True) print(dualnet) for arg, value in vars(args).items(): exp.param(arg, value) try: dualnet.fit(train_dataset, test_dataset, args=args, save_path="/4TBSSD/CHECKPOINT/pretrain_" + str(args.z_dim) + "_0.pt") print("Finish!!!") finally: exp.end()
def train_reconstruction_all(args):
    device = torch.device(args.gpu)
    df_input_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, args.prefix + "_" + args.target_csv),
                                index_col=0, encoding='utf-8-sig')

    exp = Experiment(args.target_modal + " SDAE " + str(args.latent_dim), capture_io=True)
    try:
        for arg, value in vars(args).items():
            exp.param(arg, value)

        print("Loading dataset...")
        train_dataset, val_dataset = load_autoencoder_data(df_input_data, CONFIG)
        print("Loading dataset completed")
        train_loader, val_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=args.shuffle), \
                                   DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

        sdae = StackedDAE(input_dim=args.input_dim, z_dim=args.latent_dim, binary=False,
                          encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu",
                          dropout=args.dropout, device=device)
        if args.resume:
            print("resume from checkpoint")
            sdae.load_model(os.path.join(CONFIG.CHECKPOINT_PATH,
                                         args.prefix + "_" + args.target_modal + "_" + args.target_dataset +
                                         "_sdae_" + str(args.latent_dim) + "_all.pt"))
        else:
            sdae.pretrain(train_loader, val_loader, lr=args.lr, batch_size=args.batch_size,
                          num_epochs=args.pretrain_epochs, corrupt=0.2, loss_type="mse")
        sdae.fit(train_loader, val_loader, lr=args.lr, num_epochs=args.epochs, corrupt=0.2, loss_type="mse",
                 save_path=os.path.join(CONFIG.CHECKPOINT_PATH,
                                        args.prefix + "_" + args.target_modal + "_" + args.target_dataset +
                                        "_sdae_" + str(args.latent_dim) + "_all.pt"))
    finally:
        exp.end()
def train_bayes(params):
    """
    Wrapper around train function to serve as objective function for
    Gaussian optimization in scikit-optimize routine gp_minimize.

    Arguments:
    ----------
    params: list, shape=[nb_layers + 2,]
        List of search space dimensions. Entries have to be tuples
        (lower_bound, upper_bound) for Reals or Integers.

    Returns:
    --------
    tbd
    """
    # Create Hyperdash hd_experiment
    hd_exp = Experiment(project_name)

    # Translate params into format understood by train function
    # n_layer = 4
    # layer_sizes = hd_exp.param('layer_sizes', (2**np.array(params[:n_layer])).tolist())
    # learning_rate = hd_exp.param('learning rate', 10**params[n_layer])
    # mini_batch_size = hd_exp.param('mini batch size', int(2**params[n_layer + 1]))
    # pkeep = hd_exp.param('dropout prob', 1)
    # hyper_params = [layer_sizes, learning_rate, mini_batch_size, pkeep]
    # hyper_param_str = make_hyper_param_str(hyper_params)
    layer_sizes = [4096] * 4
    learning_rate = hd_exp.param('learning rate', 10**params[0])
    mini_batch_size = hd_exp.param('mini batch size', int(2**params[1]))
    pkeep = hd_exp.param('dropout prob', 1)
    hyper_params = [layer_sizes, learning_rate, mini_batch_size, pkeep]
    hyper_param_str = make_hyper_param_str(hyper_params)

    # Call train function
    tic = time.time()
    logger.info('Start training for ' + hyper_param_str)
    log_df, best_error = train(train_tuple, validation_tuple, hyper_params,
                               nb_epochs, random_seed, hd_exp, project_dir)
    elapsed_time = time.time() - tic
    logger.info('Finished training in {} s.'.format(elapsed_time))

    # Writing Pandas log file to csv file on disk.
    logger.info('Writing pandas DF log to disk.')
    log_df.to_csv(project_dir + '/' + hyper_param_str + '/data_df.csv')

    # Finish Hyperdash Experiment
    hd_exp.end()
    return best_error
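# The docstring above names scikit-optimize's gp_minimize as the intended driver,
# but no call site is shown. The sketch below is one hypothetical way to wire it up:
# the two search dimensions (log10 learning rate, log2 mini-batch size) and the
# n_calls budget are assumptions chosen to match the 10**params[0] / 2**params[1]
# transforms inside train_bayes, not values from the original project.
from skopt import gp_minimize
from skopt.space import Integer, Real

search_space = [
    Real(-5.0, -2.0, name="log10_learning_rate"),   # train_bayes uses 10**params[0]
    Integer(5, 9, name="log2_mini_batch_size"),     # train_bayes uses int(2**params[1])
]

result = gp_minimize(
    func=train_bayes,        # returns the best validation error for one configuration
    dimensions=search_space,
    n_calls=20,              # number of Bayesian-optimization evaluations (assumed)
    random_state=0,
)
print("best error:", result.fun, "at", result.x)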
def objective(self, params):
    """
    objective function to optimize

    :param params: hyperparameters for optimizer
    :return: maximum validation accuracy
    :rtype: float
    """
    # get instances
    dataset = Datasets.get(self.dataset_name)
    model = Models.get(self.model_name, dataset=dataset)
    optimizer = Optimizers.get(self.optimizer_name, params=params)

    # configure hyperdash experiment
    hd_exp = HyperdashExperiment(
        f'{self.dataset_name}',
        api_key_getter=lambda: self.config['hyperdash']['api_key'])
    hd_exp.param('dataset_name', self.dataset_name)
    hd_exp.param('model_name', self.model_name)
    hd_exp.param('optimizer_name', self.optimizer_name)
    for k, v in params.items():
        hd_exp.param(k, v)

    # set callbacks
    callbacks = [
        Hyperdash(['accuracy', 'loss', 'val_accuracy', 'val_loss'], hd_exp),
        EarlyStopping('val_accuracy', patience=10, min_delta=0.01, verbose=1),
        TerminateOnNaN()
    ]

    # get data
    (x_train, y_train), *_ = dataset.get_batch()

    # start learning
    model.compile(loss=self.loss, optimizer=optimizer, metrics=['accuracy'])
    history = model.fit(x_train,
                        y_train,
                        batch_size=self.batch_size,
                        epochs=self.epochs,
                        callbacks=callbacks,
                        validation_split=0.2,
                        verbose=2)

    # stop hyperdash experiment
    hd_exp.end()

    # return maximum validation accuracy, negated so a minimizer can consume it
    val_accuracy = np.array(history.history['val_accuracy'])
    return max(val_accuracy) * (-1)
def main(): """Start training.""" exp = Experiment("diffrend test") # Parse args opt = Parameters().parse() for key, val in opt.__dict__.items(): exp.param(key, val) # Create dataset loader dataset_load = Dataset_load(opt) # Create GAN gan = GAN(opt, dataset_load, exp) # Train gan gan.train()
def demo(args=None):
    from_file = get_api_key_from_file()
    from_env = get_api_key_from_env()
    api_key = from_env or from_file

    if not api_key:
        print("""
        `hyperdash demo` requires a Hyperdash API key. Try setting your API key
        in the HYPERDASH_API_KEY environment variable, or in a hyperdash.json file
        in the local directory or your user's home directory with the following format:

        {
            "api_key": "<YOUR_API_KEY>"
        }
        """)
        return

    print("""
    Running the following program:

        from hyperdash import Experiment

        exp = Experiment("Dogs vs. Cats")

        # Parameters
        estimators = exp.param("Estimators", 500)
        epochs = exp.param("Epochs", 5)
        batch = exp.param("Batch Size", 64)

        for epoch in xrange(1, epochs + 1):
            accuracy = 1. - 1./epoch
            loss = float(epochs - epoch)/epochs
            print("Training model (epoch {})".format(epoch))
            time.sleep(1)

            # Metrics
            exp.metric("Accuracy", accuracy)
            exp.metric("Loss", loss)

        exp.end()
    """)

    from hyperdash import Experiment

    exp = Experiment("Dogs vs. Cats")

    # Parameters
    estimators = exp.param("Estimators", 500)
    epochs = exp.param("Epochs", 5)
    batch = exp.param("Batch Size", 64)

    for epoch in xrange(epochs):
        print("Training model (epoch {})".format(epoch))
        accuracy = 1. - 1. / (epoch + 1)
        loss = float(epochs - epoch) / (epochs + 1)

        # Metrics
        exp.metric("Accuracy", accuracy)
        exp.metric("Loss", loss)
        time.sleep(1)

    exp.end()
agent = DDPG(nb_states, nb_actions, args)
evaluate = Evaluator(args.validate_episodes,
                     args.validate_steps,
                     args.output,
                     max_episode_length=args.max_episode_length)

exp = None

if args.mode == 'train':
    exp = Experiment("sim2real-ddpg-real-cheetah")
    for arg in ["env", "rate", "prate", "hidden1", "hidden2", "warmup", "discount",
                "bsize", "rmsize", "window_length", "tau", "ou_theta", "ou_sigma",
                "ou_mu", "validate_episodes", "max_episode_length", "validate_steps",
                "init_w", "train_iter", "epsilon", "seed", "resume"]:
        arg_val = getattr(args, arg)
        exp.param(arg, arg_val)  # record each training argument on the experiment

    import socket
    exp.param("host", socket.gethostname())

    train(args, args.train_iter, agent, env, evaluate,
          args.validate_steps, args.output,
          max_episode_length=args.max_episode_length, debug=args.debug, exp=exp)

    # when done
    exp.end()

elif args.mode == 'test':
    test(args.validate_episodes, agent, env, evaluate, args.resume,
         visualize=True, debug=args.debug)

else:
    raise RuntimeError('undefined mode {}'.format(args.mode))
def train_multidec(args): print("Training weight calc") device = torch.device(args.gpu) df_image_data = pd.read_csv(os.path.join( CONFIG.CSV_PATH, args.prefix_csv + "_pca_normalized_image_encoded_" + args.target_dataset + ".csv"), index_col=0, encoding='utf-8-sig') df_text_data = pd.read_csv(os.path.join( CONFIG.CSV_PATH, args.prefix_csv + "_text_doc2vec_" + args.target_dataset + ".csv"), index_col=0, encoding='utf-8-sig') df_label = pd.read_csv(os.path.join(CONFIG.CSV_PATH, args.label_csv), index_col=0, encoding='utf-8-sig') label_array = np.array(df_label['category']) n_clusters = np.max(label_array) + 1 #n_clusters = args.n_clusters exp = Experiment(args.prefix_csv + "_ODEC", capture_io=True) for arg, value in vars(args).items(): exp.param(arg, value) try: acc_list = [] nmi_list = [] f_1_list = [] for fold_idx in range(args.start_fold, args.fold): print("Current fold: ", fold_idx) df_train = pd.read_csv(os.path.join( CONFIG.CSV_PATH, "train_" + str(fold_idx) + "_" + args.target_dataset + "_label.csv"), index_col=0, encoding='utf-8-sig') if args.sampled_n is not None: df_train = df_train.sample(n=args.sampled_n, random_state=42) df_test = pd.read_csv(os.path.join( CONFIG.CSV_PATH, "test_" + str(fold_idx) + "_" + args.target_dataset + "_label.csv"), index_col=0, encoding='utf-8-sig') print("Loading dataset...") full_dataset, train_dataset, val_dataset = load_semi_supervised_csv_data( df_image_data, df_text_data, df_train, df_test, CONFIG) print("\nLoading dataset completed") image_encoder = MDEC_encoder(input_dim=args.input_dim, z_dim=args.latent_dim, n_clusters=n_clusters, encodeLayer=[500, 500, 2000], activation="relu", dropout=0) image_encoder.load_model( os.path.join( CONFIG.CHECKPOINT_PATH, args.prefix_model + "_image" "_" + args.target_dataset + "_sdae_" + str(args.latent_dim) + '_' + str(fold_idx)) + ".pt") # image_encoder.load_model(os.path.join(CONFIG.CHECKPOINT_PATH, "sampled_plus_labeled_scaled_image_sdae_" + str(fold_idx)) + ".pt") text_encoder = MDEC_encoder(input_dim=args.input_dim, z_dim=args.latent_dim, n_clusters=n_clusters, encodeLayer=[500, 500, 2000], activation="relu", dropout=0) text_encoder.load_model( os.path.join( CONFIG.CHECKPOINT_PATH, args.prefix_model + "_text" "_" + args.target_dataset + "_sdae_" + str(args.latent_dim) + '_' + str(fold_idx)) + ".pt") # text_encoder.load_model(os.path.join(CONFIG.CHECKPOINT_PATH, "sampled_plus_labeled_scaled_text_sdae_" + str(fold_idx)) + ".pt") mdec = MultiDEC(device=device, image_encoder=image_encoder, text_encoder=text_encoder, ours=args.ours, use_prior=args.use_prior, fl=args.fl, n_clusters=n_clusters) mdec.load_model( os.path.join( CONFIG.CHECKPOINT_PATH, args.prefix_csv + "_odec_" + str(args.latent_dim) + '_' + str(fold_idx)) + ".pt") mdec.to(device) mdec.eval() wcalc = WeightCalc(device=device, ours=args.ours, use_prior=args.use_prior, input_dim=args.input_dim, n_clusters=n_clusters) wcalc.fit_predict( mdec, full_dataset, train_dataset, val_dataset, args, CONFIG, lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, save_path=os.path.join( CONFIG.CHECKPOINT_PATH, args.prefix_csv + "_wcalc_" + str(args.latent_dim) + '_' + str(fold_idx)) + ".pt", tol=args.tol, kappa=args.kappa) acc_list.append(wcalc.acc) nmi_list.append(wcalc.nmi) f_1_list.append(wcalc.f_1) print("#Average acc: %.4f, Average nmi: %.4f, Average f_1: %.4f" % (np.mean(acc_list), np.mean(nmi_list), np.mean(f_1_list))) finally: exp.end()
def train_reconstruction(args): device = torch.device(args.gpu) print("Loading dataset...") train_dataset, val_dataset = load_imgseq_data(args, CONFIG) print("Loading dataset completed") train_loader, val_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=args.shuffle),\ DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) #imgseq_encoder = imgseq_model.RNNEncoder(args.embedding_dim, args.num_layer, args.latent_size, bidirectional=True) #imgseq_decoder = imgseq_model.RNNDecoder(CONFIG.MAX_SEQUENCE_LEN, args.embedding_dim, args.num_layer, args.latent_size, bidirectional=True) t1 = CONFIG.MAX_SEQUENCE_LEN t2 = int(math.floor((t1 - 3) / 1) + 1) # "2" means stride size t3 = int(math.floor((t2 - 3) / 1) + 1) imgseq_encoder = imgseq_model.ConvolutionEncoder( embedding_dim=args.embedding_dim, t3=t3, filter_size=300, filter_shape=3, latent_size=1000) imgseq_decoder = imgseq_model.DeconvolutionDecoder( embedding_dim=args.embedding_dim, t3=t3, filter_size=300, filter_shape=3, latent_size=1000) if args.resume: print("Restart from checkpoint") checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH, args.resume), map_location=lambda storage, loc: storage) start_epoch = checkpoint['epoch'] imgseq_encoder.load_state_dict(checkpoint['imgseq_encoder']) imgseq_decoder.load_state_dict(checkpoint['imgseq_decoder']) else: print("Start from initial") start_epoch = 0 imgseq_autoencoder = imgseq_model.ImgseqAutoEncoder( imgseq_encoder, imgseq_decoder) criterion = nn.MSELoss().to(device) imgseq_autoencoder.to(device) optimizer = AdamW(imgseq_autoencoder.parameters(), lr=1., weight_decay=args.weight_decay, amsgrad=True) step_size = args.half_cycle_interval * len(train_loader) clr = cyclical_lr(step_size, min_lr=args.lr, max_lr=args.lr * args.lr_factor) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr]) if args.resume: optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) exp = Experiment("Image-sequence autoencoder " + str(args.latent_size), capture_io=False) for arg, value in vars(args).items(): exp.param(arg, value) try: imgseq_autoencoder.train() for epoch in range(start_epoch, args.epochs): print("Epoch: {}".format(epoch)) for steps, batch in enumerate(train_loader): torch.cuda.empty_cache() feature = Variable(batch).to(device) optimizer.zero_grad() feature_hat = imgseq_autoencoder(feature) loss = criterion(feature_hat, feature) loss.backward() optimizer.step() scheduler.step() if (steps * args.batch_size) % args.log_interval == 0: print("Epoch: {} at {} lr: {}".format( epoch, str(datetime.datetime.now()), str(scheduler.get_lr()))) print("Steps: {}".format(steps)) print("Loss: {}".format(loss.detach().item())) input_data = feature[0] del feature, feature_hat, loss exp.log("\nEpoch: {} at {} lr: {}".format( epoch, str(datetime.datetime.now()), str(scheduler.get_lr()))) _avg_loss = eval_reconstruction(imgseq_autoencoder, criterion, val_loader, device) exp.log("\nEvaluation - loss: {}".format(_avg_loss)) util.save_models( { 'epoch': epoch + 1, 'imgseq_encoder': imgseq_encoder.state_dict(), 'imgseq_decoder': imgseq_decoder.state_dict(), 'avg_loss': _avg_loss, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict() }, CONFIG.CHECKPOINT_PATH, "imgseq_autoencoder_" + str(args.latent_size)) print("Finish!!!") finally: exp.end()
import os

import numpy as np
import torch
from torch.utils.data import DataLoader
from hyperdash import Experiment

from inversegraphics_generator.img_dataset import IqImgDataset
from inversegraphics_generator.iqtest_objs import get_data_dir
from inversegraphics_generator.resnet50 import MultiResNet, ContrastiveLoss

EPOCHS = 40
BATCH = 64
LEARNING_RATE = 0.0001
SIZE = 1000
MARGIN = 2

exp = Experiment("[ig] cnn-siamese2")
exp.param("epoch", EPOCHS)
exp.param("size", SIZE)
exp.param("batch", BATCH)
exp.param("learning rate", LEARNING_RATE)

# ds = IqImgDataset("/data/lisa/data/iqtest/iqtest-dataset-ambient.h5", "train/labeled", max_size=SIZE)
ds = IqImgDataset(os.path.join(get_data_dir(), "test.h5"), "train/labeled", max_size=SIZE)
dl = DataLoader(ds, batch_size=BATCH, shuffle=True, num_workers=0)

model = MultiResNet(siamese=True)  # .cuda()

# Loss and optimizer
criterion_contrast = ContrastiveLoss(MARGIN)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
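# The snippet above stops after building the model, contrastive loss, and optimizer;
# no training loop is shown. The following is a minimal, hypothetical sketch of that
# loop. It assumes each batch from `dl` yields two image tensors plus a same/different
# label, which may not match the real IqImgDataset layout, and the metric name
# "train_loss" is likewise only illustrative.
for epoch in range(EPOCHS):
    epoch_losses = []
    for img_a, img_b, same in dl:           # assumed (anchor, other, label) batch layout
        emb_a = model(img_a)
        emb_b = model(img_b)
        loss = criterion_contrast(emb_a, emb_b, same)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    # one Hyperdash data point per epoch
    exp.metric("train_loss", float(np.mean(epoch_losses)))

exp.end()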
#         net.load_state_dict(checkpoint['state_dict'])
#         print("MODEL LOADED, CONTINUING TRAINING")
#         return "TRAINING AVG LOSS: {}\n" \
#                "TRAINING AVG DIFF: {}".format(
#             checkpoint["epoch_avg_loss"], checkpoint["epoch_avg_diff"])
#     else:
#         if optional:
#             pass  # model loading was optional, so nothing to do
#         else:
#             # shit, no model
#             raise Exception("model couldn't be found:", MODEL_PATH_BEST)

loss_function = nn.MSELoss()

if hyperdash_support:
    exp = Experiment("[sim2real] lstm - real v3")
    exp.param("exp", EXPERIMENT)
    exp.param("layers", LSTM_LAYERS)
    exp.param("nodes", HIDDEN_NODES)

# if TRAIN:
optimizer = optim.Adam(net.parameters())

# if CONTINUE:
#     old_model_string = loadModel(optional=True)
#     print(old_model_string)
# else:
#     old_model_string = loadModel(optional=False)

loss_history = [np.inf]  # very high loss because loss can't be empty for min()

for epoch in np.arange(EPOCHS):
        net.load_state_dict(checkpoint['state_dict'])
        return "TRAINING AVG LOSS: {}\n" \
               "TRAINING AVG DIFF: {}".format(
                   checkpoint["epoch_avg_loss"], checkpoint["epoch_avg_diff"])
    else:
        if optional:
            pass  # model loading was optional, so nothing to do
        else:
            # shit, no model
            raise Exception("model couldn't be found:", MODEL_PATH_BEST)


loss_function = nn.MSELoss()

if hyperdash_support:
    exp = Experiment("simple lstm - fl4")
    exp.param("layers", LSTM_LAYERS)
    exp.param("nodes", HIDDEN_NODES)

if TRAIN:
    optimizer = optim.Adam(net.parameters())

if CONTINUE:
    old_model_string = loadModel(optional=True)
    print(old_model_string)
else:
    old_model_string = loadModel(optional=False)

loss_history = [9999999]  # very high loss because loss can't be empty for min()

# h0 = Variable(torch.randn(, 3, 20))
# c0 = Variable(torch.randn(2, 3, 20))
def test_experiment_handles_numpy_numbers(self): nums_to_test = [ ("int_", np.int_()), ("intc", np.intc()), ("intp", np.intp()), ("int8", np.int8()), ("int16", np.int16()), ("int32", np.int32()), ("int64", np.int64()), ("uint8", np.uint8()), ("uint16", np.uint16()), ("uint32", np.uint32()), ("uint64", np.uint64()), ("float16", np.float16()), ("float32", np.float32()), ("float64", np.float64()), ] # Make sure the SDK doesn't choke and JSON serialization works exp = Experiment("MNIST") for name, num in nums_to_test: exp.metric("test_metric_{}".format(name), num) exp.param("test_param_{}".format(name), num) exp.end() # Test params match what is expected params_messages = [] for msg in server_sdk_messages: payload = msg["payload"] if "params" in payload: params_messages.append(payload) expected_params = [] for name, num in nums_to_test: obj = { "params": {}, "is_internal": False, } obj["params"]["test_param_{}".format(name)] = num obj["is_internal"] = False expected_params.append(obj) assert len(expected_params) == len(params_messages) for i, message in enumerate(params_messages): print(message) print(expected_params[i]) assert message == expected_params[i] # Test metrics match what is expected metrics_messages = [] for msg in server_sdk_messages: payload = msg["payload"] if "name" in payload: metrics_messages.append(payload) expected_metrics = [] for name, num in nums_to_test: expected_metrics.append({ "name": "test_metric_{}".format(name), "value": num, "is_internal": False, }) assert len(expected_metrics) == len(metrics_messages) for i, message in enumerate(metrics_messages): assert message == expected_metrics[i]
def gen_estimator(period=None): resnet_size = int(flags_obj.resnet_size) data_format = flags_obj.data_format batch_size = flags_obj.batch_size resnet_version = int(flags_obj.resnet_version) loss_scale = flags_core.get_loss_scale(flags_obj) dtype_tf = flags_core.get_tf_dtype(flags_obj) num_epochs_per_decay = flags_obj.num_epochs_per_decay learning_rate_decay_factor = flags_obj.learning_rate_decay_factor end_learning_rate = flags_obj.end_learning_rate learning_rate_decay_type = flags_obj.learning_rate_decay_type weight_decay = flags_obj.weight_decay zero_gamma = flags_obj.zero_gamma lr_warmup_epochs = flags_obj.lr_warmup_epochs base_learning_rate = flags_obj.base_learning_rate use_resnet_d = flags_obj.use_resnet_d use_dropblock = flags_obj.use_dropblock dropblock_kp = [float(be) for be in flags_obj.dropblock_kp] label_smoothing = flags_obj.label_smoothing momentum = flags_obj.momentum bn_momentum = flags_obj.bn_momentum train_epochs = flags_obj.train_epochs piecewise_lr_boundary_epochs = [ int(be) for be in flags_obj.piecewise_lr_boundary_epochs ] piecewise_lr_decay_rates = [ float(dr) for dr in flags_obj.piecewise_lr_decay_rates ] use_ranking_loss = flags_obj.use_ranking_loss use_se_block = flags_obj.use_se_block use_sk_block = flags_obj.use_sk_block mixup_type = flags_obj.mixup_type dataset_name = flags_obj.dataset_name kd_temp = flags_obj.kd_temp no_downsample = flags_obj.no_downsample anti_alias_filter_size = flags_obj.anti_alias_filter_size anti_alias_type = flags_obj.anti_alias_type cls_loss_type = flags_obj.cls_loss_type logit_type = flags_obj.logit_type embedding_size = flags_obj.embedding_size pool_type = flags_obj.pool_type arc_s = flags_obj.arc_s arc_m = flags_obj.arc_m bl_alpha = flags_obj.bl_alpha bl_beta = flags_obj.bl_beta exp = None if install_hyperdash and flags_obj.use_hyperdash: exp = Experiment(flags_obj.model_dir.split("/")[-1]) resnet_size = exp.param("resnet_size", int(flags_obj.resnet_size)) batch_size = exp.param("batch_size", flags_obj.batch_size) exp.param("dtype", flags_obj.dtype) learning_rate_decay_type = exp.param( "learning_rate_decay_type", flags_obj.learning_rate_decay_type) weight_decay = exp.param("weight_decay", flags_obj.weight_decay) zero_gamma = exp.param("zero_gamma", flags_obj.zero_gamma) lr_warmup_epochs = exp.param("lr_warmup_epochs", flags_obj.lr_warmup_epochs) base_learning_rate = exp.param("base_learning_rate", flags_obj.base_learning_rate) use_dropblock = exp.param("use_dropblock", flags_obj.use_dropblock) dropblock_kp = exp.param( "dropblock_kp", [float(be) for be in flags_obj.dropblock_kp]) piecewise_lr_boundary_epochs = exp.param( "piecewise_lr_boundary_epochs", [int(be) for be in flags_obj.piecewise_lr_boundary_epochs]) piecewise_lr_decay_rates = exp.param( "piecewise_lr_decay_rates", [float(dr) for dr in flags_obj.piecewise_lr_decay_rates]) mixup_type = exp.param("mixup_type", flags_obj.mixup_type) dataset_name = exp.param("dataset_name", flags_obj.dataset_name) exp.param("autoaugment_type", flags_obj.autoaugment_type) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'resnet_size': resnet_size, 'data_format': data_format, 'batch_size': batch_size, 'resnet_version': resnet_version, 'loss_scale': loss_scale, 'dtype': dtype_tf, 'num_epochs_per_decay': num_epochs_per_decay, 'learning_rate_decay_factor': learning_rate_decay_factor, 'end_learning_rate': end_learning_rate, 'learning_rate_decay_type': learning_rate_decay_type, 'weight_decay': weight_decay, 'zero_gamma': 
zero_gamma, 'lr_warmup_epochs': lr_warmup_epochs, 'base_learning_rate': base_learning_rate, 'use_resnet_d': use_resnet_d, 'use_dropblock': use_dropblock, 'dropblock_kp': dropblock_kp, 'label_smoothing': label_smoothing, 'momentum': momentum, 'bn_momentum': bn_momentum, 'embedding_size': embedding_size, 'train_epochs': train_epochs, 'piecewise_lr_boundary_epochs': piecewise_lr_boundary_epochs, 'piecewise_lr_decay_rates': piecewise_lr_decay_rates, 'with_drawing_bbox': flags_obj.with_drawing_bbox, 'use_ranking_loss': use_ranking_loss, 'use_se_block': use_se_block, 'use_sk_block': use_sk_block, 'mixup_type': mixup_type, 'kd_temp': kd_temp, 'no_downsample': no_downsample, 'dataset_name': dataset_name, 'anti_alias_filter_size': anti_alias_filter_size, 'anti_alias_type': anti_alias_type, 'cls_loss_type': cls_loss_type, 'logit_type': logit_type, 'arc_s': arc_s, 'arc_m': arc_m, 'pool_type': pool_type, 'bl_alpha': bl_alpha, 'bl_beta': bl_beta, 'train_steps': total_train_steps, }) return classifier, exp
sys.path.append("./") from models.burgers_train_separate import BurgersSeparate parser = argparse.ArgumentParser() parser.add_argument("--niter", default=10000, type=int) parser.add_argument("--scipyopt", default=False) parser.add_argument("--name", default="default") parser.add_argument("--traindata", nargs="+") parser.add_argument("--testdata", default="../MyData/burgers_polynominal.mat") args = parser.parse_args() logname = f"log/burgers_{args.name}.log" figurename = f"Burgers_{args.name}" # filen = "../MyData/burgers_cos.mat" exp = Experiment(args.name) exp.param("niter", args.niter) exp.param("scipyopt", args.scipyopt) exp.param("testdata", args.testdata) for i, n in enumerate(args.traindata): exp.param(f"traindata{i}", n) dataloader = DataLoader(args.traindata, args.testdata, 10.0, 8.0) sol_data = dataloader.get_solver_data(20000) idn_data = dataloader.get_train_batch() u_layers = [[2, 50, 50, 50, 50, 1] for _ in range(len(args.traindata))] pde_layers = [3, 100, 100, 1] layers = [2, 50, 50, 50, 50, 1] idn_lbs = idn_data["idn_lbs"] idn_ubs = idn_data["idn_ubs"]
class BaseTrainer(_BaseTrainer): """ Base trainer to make pytorch training be easier. Args: data-augmentation (bool): Crop randomly and add random noise for data augmentation. epoch (int): Number of epochs to train. opt (str): Optimization method. gpu (bool): Use GPU. seed (str): Random seed to train. train (str): Path to training image-pose list file. val (str): Path to validation image-pose list file. batchsize (int): Learning minibatch size. out (str): Output directory. resume (str): Initialize the trainer from given file. The file name is 'epoch-{epoch number}.iter'. resume_model (str): Load model definition file to use for resuming training (it\'s necessary when you resume a training). The file name is 'epoch-{epoch number}.model'. resume_opt (str): Load optimization states from this file (it\'s necessary when you resume a training). The file name is 'epoch-{epoch number}.state'. """ def __init__(self, **kwargs): self.data_augmentation = kwargs['data_augmentation'] self.epoch = kwargs['epoch'] self.gpu = (kwargs['gpu'] >= 0) self.opt = kwargs['opt'] self.seed = kwargs['seed'] self.train = kwargs['train'] self.val = kwargs['val'] self.batchsize = kwargs['batchsize'] self.out = kwargs['out'] self.resume = kwargs['resume'] self.resume_model = kwargs['resume_model'] self.resume_opt = kwargs['resume_opt'] self.hyperdash = kwargs['hyperdash'] if self.hyperdash: self.experiment = Experiment(self.hyperdash) for key, val in kwargs.items(): self.experiment.param(key, val) # validate arguments. self._validate_arguments() self.lowest_loss = 0 self.device = torch.device('cuda' if kwargs['gpu'] >= 0 else 'cpu') #self.experiment.log_multiple_params(kwargs) self.dataloader = torch.utils.data.DataLoader def _validate_arguments(self): if self.seed is not None and self.data_augmentation: raise NotSupportedError('It is not supported to fix random seed for data augmentation.') if self.gpu and not torch.cuda.is_available(): raise GPUNotFoundError('GPU is not found.') #for path in (self.train, self.val): # if not os.path.isfile(path): # raise FileNotFoundError('{0} is not found.'.format(path)) if self.opt not in ('MomentumSGD', 'Adam'): raise UnknownOptimizationMethodError( '{0} is unknown optimization method.'.format(self.opt)) if self.resume is not None: for path in (self.resume, self.resume_model, self.resume_opt): if not os.path.isfile(path): raise FileNotFoundError('{0} is not found.'.format(path)) # TODO: make it acceptable multiple optimizer, or define out of this trainer. def _get_optimizer(self, model, **kwargs): if self.opt == 'MomentumSGD': optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) elif self.opt == "Adam": optimizer = optim.Adam(model.parameters()) else: try: optimizer = getattr(optim, self.opt)(**kwargs) except OptimNotSupportedError: print("This optim is not available. 
See https://pytorch.org/docs/stable/optim.html") return optimizer def forward(self, batch, model, criterion): data, target = map(lambda d: d.to(self.device), batch) output = model(data) loss = criterion(output, target) return loss def _train(self, model, optimizer, criterion, train_iter, logger, start_time, log_interval=10): model.train() loss_sum = 0.0 for iteration, batch in enumerate(tqdm(train_iter, desc='this epoch'), 1): optimizer.zero_grad() loss = self.forward(batch, model, criterion, isTest=False) loss_sum += loss loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 500) optimizer.step() if self.hyperdash: self.experiment.metric("loss", int(loss.cpu().data.numpy()), log=False) if iteration % log_interval == 0: log = 'elapsed_time: {0}, loss: {1}'.format(time.time() - start_time, loss.data[0]) logger.write(log) return loss_sum / len(train_iter) def _test(self, model, test_iter, criterion, logger, start_time): model.eval() test_loss = 0 for batch in test_iter: loss = self.forward(batch, model, criterion, isTest=True) print('Test loss: {}'.format(loss.data)) test_loss += loss.item() test_loss /= len(test_iter) log = 'elapsed_time: {0}, validation/loss: {1}'.format(time.time() - start_time, test_loss) if self.hyperdash: self.experiment.metric('test_loss', int(test_loss.cpu().data.numpy())) logger.write(log) return test_loss def _checkpoint(self, epoch, model, optimizer, logger): filename = os.path.join(self.out, 'epoch-{0}'.format(epoch + 1)) torch.save({'epoch': epoch + 1, 'logger': logger.state_dict()}, filename + '.iter') torch.save(model.state_dict(), filename + '.model') torch.save(optimizer.state_dict(), filename + '.state') def _best_checkpoint(self, epoch, model, optimizer, logger): filename = os.path.join(self.out, 'best_model') torch.save({'epoch': epoch + 1, 'logger': logger.state_dict()}, filename + '.iter') torch.save(model.state_dict(), filename + '.model') torch.save(optimizer.state_dict(), filename + '.state') def fit(self, model, train_data, val_data, criterion): """ Execute training """ # set random seed. if self.seed is not None: random.seed(self.seed) torch.manual_seed(self.seed) if self.gpu: torch.cuda.manual_seed(self.seed) # initialize model to train. if self.resume_model: model.load_state_dict(torch.load(self.resume_model)) # prepare gpu. if self.gpu: model.cuda() # load the datasets. train_iter = self.dataloader(train_data, batch_size=self.batchsize, shuffle=True) val_iter = self.dataloader(val_data, batch_size=3, shuffle=False) # set up an optimizer. optimizer = self._get_optimizer(model) if self.resume_opt: optimizer.load_state_dict(torch.load(self.resume_opt)) # set intervals. val_interval = 3 resume_interval = self.epoch / 10 log_interval = 10 # set logger and start epoch. logger = TrainLogger(self.out) start_epoch = 0 if self.resume: resume = torch.load(self.resume) start_epoch = resume['epoch'] logger.load_state_dict(resume['logger']) # start training. start_time = time.time() loss = 0 for epoch in trange(start_epoch, self.epoch, initial=start_epoch, total=self.epoch, desc=' total'): self._train(model, optimizer, criterion, train_iter, log_interval, logger, start_time) if (epoch) % val_interval == 0: loss = self._test(model, val_iter, criterion, logger, start_time) if self.lowest_loss == 0 or self.lowest_loss > loss: logger.write('Best model updated. 
loss: {} => {}'.format(self.lowest_loss, loss)) self._best_checkpoint(epoch, model, optimizer, logger) self.lowest_loss = loss if (epoch + 1) % resume_interval == 0: self._checkpoint(epoch, model, optimizer, logger) if self.hyperdash: self.experiment.end() @staticmethod def get_args(): # arg definition parser = argparse.ArgumentParser( description='Training pose net for comparison \ between chainer and pytorch about implementing DeepPose.') parser.add_argument( '--data-augmentation', '-a', action='store_true', help='Crop randomly and add random noise for data augmentation.') parser.add_argument( '--epoch', '-e', type=int, default=100, help='Number of epochs to train.') parser.add_argument( '--opt', '-o', type=str, default='Adam', choices=['MomentumSGD', 'Adam'], help='Optimization method.') parser.add_argument( '--gpu', '-g', type=int, default=0, help='GPU ID (negative value indicates CPU).') parser.add_argument( '--seed', '-s', type=int, help='Random seed to train.') parser.add_argument( '--train', type=str, default='data/train', help='Path to training image-pose list file.') parser.add_argument( '--val', type=str, default='data/test', help='Path to validation image-pose list file.') parser.add_argument( '--batchsize', type=int, default=32, help='Learning minibatch size.') parser.add_argument( '--out', default='result', help='Output directory') parser.add_argument( '--resume', default=None, help='Initialize the trainer from given file. \ The file name is "epoch-{epoch number}.iter".') parser.add_argument( '--resume-model', type=str, default=None, help='Load model definition file to use for resuming training \ (it\'s necessary when you resume a training). \ The file name is "epoch-{epoch number}.mode"') parser.add_argument( '--resume-opt', type=str, default=None, help='Load optimization states from this file \ (it\'s necessary when you resume a training). \ The file name is "epoch-{epoch number}.state"') parser.add_argument( '--hyperdash', type=str, default=None, help='If you use hyperdash logging, enter here the name of experiment. Before using, you have to login to hyperdash with "hyperdash login --github". The default is None that means no logging with hyperdash') args = parser.parse_args() return args
def train_multidec(args): print("Training multidec") device = torch.device(args.gpu) df_image_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, args.image_csv), index_col=0, encoding='utf-8-sig') df_text_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, args.text_csv), index_col=0, encoding='utf-8-sig') df_label = pd.read_csv(os.path.join(CONFIG.CSV_PATH, args.label_csv), index_col=0, encoding='utf-8-sig') short_code_array = np.array(df_label.index) label_array = np.array(df_label['category']) n_clusters = np.max(label_array) + 1 short_code_train, short_code_val, label_train, label_val = train_test_split( short_code_array, label_array, test_size=0.2, random_state=42) df_train = pd.DataFrame(data=label_train, index=short_code_train, columns=df_label.columns) df_val = pd.DataFrame(data=label_val, index=short_code_val, columns=df_label.columns) print("Loading dataset...") train_dataset, val_dataset = load_multi_csv_data(df_image_data, df_text_data, df_train, df_val, CONFIG) print("Loading dataset completed") image_encoder = MDEC_encoder(input_dim=args.input_dim, z_dim=args.latent_dim, n_clusters=n_clusters, encodeLayer=[500, 500, 2000], activation="relu", dropout=0) image_encoder.load_model( os.path.join(CONFIG.CHECKPOINT_PATH, "image_sdae_" + str(args.latent_dim)) + ".pt") text_encoder = MDEC_encoder(input_dim=args.input_dim, z_dim=args.latent_dim, n_clusters=n_clusters, encodeLayer=[500, 500, 2000], activation="relu", dropout=0) text_encoder.load_model( os.path.join(CONFIG.CHECKPOINT_PATH, "text_sdae_" + str(args.latent_dim)) + ".pt") mdec = MultiDEC(device=device, image_encoder=image_encoder, text_encoder=text_encoder, n_clusters=n_clusters) exp = Experiment("MDEC " + str(args.latent_dim), capture_io=True) print(mdec) for arg, value in vars(args).items(): exp.param(arg, value) try: mdec.fit(train_dataset, val_dataset, lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, save_path=CONFIG.CHECKPOINT_PATH) print("Finish!!!") finally: exp.end()
nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]

agent = DDPG(nb_states, nb_actions, args)
evaluate = Evaluator(args.validate_episodes,
                     args.validate_steps,
                     args.output,
                     max_episode_length=args.max_episode_length)

exp = None

if args.mode == 'train':
    if hyperdash_support:
        exp = Experiment("sim2real-ddpg-simplus-cheetah")
        exp.param("model", MODEL_PATH)

        for arg in ["env", "rate", "prate", "hidden1", "hidden2", "warmup", "discount",
                    "bsize", "rmsize", "window_length", "tau", "ou_theta", "ou_sigma",
                    "ou_mu", "validate_episodes", "max_episode_length", "validate_steps",
                    "init_w", "train_iter", "epsilon", "seed", "resume"]:
            arg_val = getattr(args, arg)
            exp.param(arg, arg_val)

        import socket
        exp.param("host", socket.gethostname())

    train(args, args.train_iter,
def train(train_list, test_list, lr, epoch, batchsize, insize, outsize, save_interval=10, weight_decay=5e-4, lr_step=10, model_name='resnet34', loss_name='focal_loss', metric_name='arc_margin', optim_name='adam', num_workers=4, print_freq=1e+6, debug=False): device = torch.device("cuda") train_dataset = Dataset(train_list, mode='train', insize=insize, debug=debug) trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batchsize, shuffle=True, num_workers=num_workers) test_dataset = Dataset(test_list, mode='test', insize=insize, debug=debug) testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batchsize, shuffle=False, num_workers=num_workers) class_num = train_dataset.get_classnum() print('{} train iters per epoch:'.format(len(trainloader))) print('{} test iters per epoch:'.format(len(testloader))) if loss_name == 'focal_loss': criterion = FocalLoss(gamma=2) else: criterion = torch.nn.CrossEntropyLoss() if model_name == 'resnet18': model = resnet_face18(insize, outsize) elif model_name == 'resnet34': model = resnet34(insize, outsize) elif model_name == 'resnet50': model = resnet50(insize, outsize) elif model_name == 'resnet101': model = resnet101(insize, outsize) elif model_name == 'resnet152': model = resnet152(insize, outsize) elif model_name == 'shuffle': model = ShuffleFaceNet(outsize) elif model_name == 'simplev1': model = CNNv1(insize, outsize, activation='relu', kernel_pattern='v1') else: raise ValueError('Invalid model name: {}'.format(model_name)) if metric_name == 'add_margin': metric_fc = AddMarginProduct(outsize, class_num, s=30, m=0.35) elif metric_name == 'arc_margin': metric_fc = ArcMarginProduct(outsize, class_num, s=30, m=0.5, easy_margin=False) elif metric_name == 'sphere': metric_fc = SphereProduct(outsize, class_num, m=4) else: metric_fc = nn.Linear(outsize, class_num) # view_model(model, opt.input_shape) print(model) model.to(device) model = DataParallel(model) metric_fc.to(device) metric_fc = DataParallel(metric_fc) assert optim_name in ['sgd', 'adam'] if optim_name == 'sgd': optimizer = torch.optim.SGD([{ 'params': model.parameters() }, { 'params': metric_fc.parameters() }], lr=lr, weight_decay=weight_decay) elif optim_name == 'adam': optimizer = torch.optim.Adam([{ 'params': model.parameters() }, { 'params': metric_fc.parameters() }], lr=lr, weight_decay=weight_decay) scheduler = StepLR(optimizer, step_size=lr_step, gamma=0.1) start = time.time() training_id = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') hyperdash_exp = Experiment(training_id) checkpoints_dir = os.path.join('logs', training_id) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) logging_path = os.path.join(checkpoints_dir, 'history.csv') config = {} config['train_list'] = train_list config['test_list'] = test_list config['lr'] = lr config['epoch'] = epoch config['batchsize'] = batchsize config['insize'] = insize config['outsize'] = outsize config['save_interval'] = save_interval config['weight_decay'] = weight_decay config['lr_step'] = lr_step config['model_name'] = model_name config['loss_name'] = loss_name config['metric_name'] = metric_name config['optim_name'] = optim_name config['num_workers'] = num_workers config['debug'] = debug for k, v in config.items(): hyperdash_exp.param(k, v, log=False) with open(os.path.join(checkpoints_dir, 'train_config.json'), 'w') as f: json.dump(config, f, indent=4) with open(logging_path, 'w') as f: f.write('epoch,time_elapsed,train_loss,train_acc,test_loss,test_acc\n') prev_time = datetime.datetime.now() for i in 
range(epoch): model.train() for ii, data in enumerate(tqdm(trainloader, disable=True)): data_input, label = data data_input = data_input.to(device) label = label.to(device).long() feature = model(data_input) output = metric_fc(feature, label) loss = criterion(output, label) pred_classes = np.argmax(output.data.cpu().numpy(), axis=1) acc = np.mean( (pred_classes == label.data.cpu().numpy()).astype(int)) optimizer.zero_grad() loss.backward() #import pdb; pdb.set_trace() optimizer.step() #scheduler.step() iters = i * len(trainloader) + ii if iters % print_freq == 0 or debug: speed = print_freq / (time.time() - start) time_str = time.asctime(time.localtime(time.time())) print('{} train epoch {} iter {} {} iters/s loss {} acc {}'. format(time_str, i, ii, speed, loss.item(), acc)) start = time.time() model.eval() for ii, data in enumerate(tqdm(testloader, disable=True)): data_input, label = data data_input = data_input.to(device) label = label.to(device).long() feature = model(data_input) output = metric_fc(feature, label) test_loss = criterion(output, label) output = np.argmax(output.data.cpu().numpy(), axis=1) test_acc = np.mean( (output == label.data.cpu().numpy()).astype(int)) #test_acc = np.mean((torch.argmax(output, dim=1) == label).type(torch.int32)) if i % save_interval == 0 or i == epoch: save_model(model.module, checkpoints_dir, model_name, i) save_model(metric_fc.module, checkpoints_dir, metric_name, i) new_time = datetime.datetime.now() with open(logging_path, 'a') as f: f.write('{},{},{},{},{},{}\n'.format( i, (new_time - prev_time).total_seconds(), loss.item(), acc, test_loss.item(), test_acc)) prev_time = datetime.datetime.now() hyperdash_exp.metric('train_loss', loss.item(), log=False) hyperdash_exp.metric('train_acc', acc, log=False) hyperdash_exp.metric('test_loss', test_loss.item(), log=False) hyperdash_exp.metric('test_acc', test_acc, log=False) hyperdash_exp.end() print('Finished {}'.format(training_id))
def run_pusher3dof(args, sim=True, vanilla=False):
    try:
        from hyperdash import Experiment
        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [1.0] * 3  # if real
    colored = False

    if sim:
        torques = [args.t0, args.t1, args.t2]
        colored = True

    if not vanilla:
        env.env._init(
            torques=torques,
            colored=colored
        )

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(
        args.validate_episodes,
        args.validate_steps,
        args.output,
        max_episode_length=args.max_episode_length
    )

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "real"
            if sim:
                prefix = "sim"

            exp = Experiment("s2r-pusher3dof-ddpg-{}".format(prefix))
            import socket
            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("vanilla", vanilla)  # vanilla or not
            exp.param("torques", torques)
            exp.param("folder", args.output)

            for arg in ["env", "max_episode_length", "train_iter", "seed", "resume"]:
                arg_val = getattr(args, arg)
                exp.param(arg, arg_val)

        train(args, args.train_iter, agent, env, evaluate,
              args.validate_steps, args.output,
              max_episode_length=args.max_episode_length, debug=args.debug, exp=exp)

        # when done (exp stays None if hyperdash is unavailable)
        if exp is not None:
            exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes, agent, env, evaluate, args.resume,
             visualize=args.vis, debug=args.debug, load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
def test_experiment(self): # Run a test job via the Experiment API # Make sure log file is where is supposed to be # look at decorator # verify run start/stop is sent with patch("sys.stdout", new=StringIO()) as faked_out: exp = Experiment("MNIST") exp.log("test print") exp.param("batch size", 32) for i in exp.iter(2): time.sleep(1) exp.metric("accuracy", i * 0.2) time.sleep(0.1) exp.end() # Test params match what is expected params_messages = [] for msg in server_sdk_messages: payload = msg["payload"] if "params" in payload: params_messages.append(payload) expect_params = [ { "params": { "batch size": 32, }, "is_internal": False, }, { "params": { "hd_iter_0_epochs": 2, }, "is_internal": True, }, ] assert len(expect_params) == len(params_messages) for i, message in enumerate(params_messages): assert message == expect_params[i] # Test metrics match what is expected metrics_messages = [] for msg in server_sdk_messages: payload = msg["payload"] if "name" in payload: metrics_messages.append(payload) expect_metrics = [ { "is_internal": True, "name": "hd_iter_0", "value": 0 }, { "is_internal": False, "name": "accuracy", "value": 0 }, { "is_internal": True, "name": "hd_iter_0", "value": 1 }, { "is_internal": False, "name": "accuracy", "value": 0.2 }, ] assert len(expect_metrics) == len(metrics_messages) for i, message in enumerate(metrics_messages): assert message == expect_metrics[i] captured_out = faked_out.getvalue() assert "error" not in captured_out # Make sure correct API name / version headers are sent assert server_sdk_headers[0][API_KEY_NAME] == API_NAME_EXPERIMENT assert server_sdk_headers[0][ VERSION_KEY_NAME] == get_hyperdash_version() # Make sure logs were persisted expect_logs = [ "{ batch size: 32 }", "test print", "| Iteration 0 of 1 |", "| accuracy: 0.000000 |", ] log_dir = get_hyperdash_logs_home_path_for_job("MNIST") latest_log_file = max([ os.path.join(log_dir, filename) for filename in os.listdir(log_dir) ], key=os.path.getmtime) with open(latest_log_file, "r") as log_file: data = log_file.read() for log in expect_logs: assert_in(log, data) os.remove(latest_log_file)
# digits.py
from sklearn import svm, datasets
from hyperdash import Experiment

# Preprocess data
digits = datasets.load_digits(100)
test_cases = 50
X_train, y_train = digits.data[:-test_cases], digits.target[:-test_cases]
X_test, y_test = digits.data[-test_cases:], digits.target[-test_cases:]

# Create an experiment with a model name, then autostart
exp = Experiment("Digits Classifier")

# Record the value of hyperparameter gamma for this experiment
gamma = exp.param("gamma", 0.1)

# Param can record any basic type (Number, Boolean, String)
classifer = svm.SVC(gamma=gamma)
classifer.fit(X_train, y_train)

# Record a numerical performance metric
exp.metric("accuracy", classifer.score(X_test, y_test))

# Cleanup and mark that the experiment successfully completed
exp.end()
if args.memdebug:
    import gc
    import objgraph
    import ipdb

run_name = time.strftime("%y%m%d%H%M%S")

assert args.algo in ['a2c', 'ppo', 'acktr']
if args.recurrent_policy:
    assert args.algo in ['a2c', 'ppo'], \
        'Recurrent policy is not implemented for ACKTR'

exp = None
if has_hyperdash:
    exp = Experiment("{} - {}".format(args.env_name, args.algo))
    exp.param("NAME", run_name)
    for param, value in vars(args).items():
        exp.param(param, value)

num_updates = int(args.num_frames) // args.num_steps // args.num_processes
num_breaks = num_updates / 10

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

try:
    os.makedirs(args.log_dir)
except OSError:
    files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
class KerasModel: def __init__(self, name='deathbot', load_weights=False, training=False, batch_size=100, lr=1e-3, location=None): self.session = tf.Session() self.name = name if training: from hyperdash import Experiment self.exp = Experiment(name) if name in MODELS.keys(): self.model = MODELS[name]() if not training else MODELS[name]( self.exp) adam = Adam(lr=lr) nadam = Nadam(lr=lr) #rms = RMSprop(lr=lr) #sgd = SGD(lr=lr) self.optimizer = adam if name == "evo" else nadam loss = ["binary_crossentropy", "categorical_crossentropy", "poisson"] self.model.compile(optimizer=self.optimizer, loss=loss[1], metrics=["acc"]) self.callbacks = [] if training: self.exp.param("lr", lr) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-4, verbose=1) tb = TensorBoard('./models/logs/{}'.format(name), write_graph=True) cp = ModelCheckpoint( filepath='./models/weights-{}.hdf5'.format(name), monitor='val_acc', verbose=1, save_best_only=True) hd = Hyperdash(self.exp, self.model) es = EarlyStopping('val_acc', patience=5, verbose=1) self.callbacks = [cp, tb, hd, reduce_lr, es] if load_weights: #print(os.listdir(os.getcwd())) self.model.load_weights('./deathbot/weights-{}.hdf5'.format(name)) if training: print('Weights Loaded...') def save(self, path): self.model.save(path + self.name + ".h5") def fit(self, input_data, expected_output_data, batch_size=100, epochs=1): input_data = self.normalize_input(input_data) return self.model.fit(input_data, expected_output_data, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=self.callbacks, validation_split=0.2, shuffle=False) def predict(self, input_data, batch_size=1, p=False): return list( map( self.clean_pred, self.model.predict(self.normalize_input(np.array([input_data ])), batch_size=batch_size)[0])) def compute_loss(self, input_data, expected_output_data): return self.model.evaluate(self.normalize_input(input_data), expected_output_data, batch_size=1, verbose=1) @staticmethod def clean_pred(pred): return pred if pred > 0.01 else 0.0 @staticmethod def normalize_input(input_data): # Assert the shape is what we expect assert len(input_data.shape) == 3 and input_data.shape[ 1] == PLANET_MAX_NUM and input_data.shape[2] == PER_PLANET_FEATURES m = np.expand_dims(input_data.mean(axis=1), axis=1) s = np.expand_dims(input_data.std(axis=1), axis=1) return (input_data - m) / (s + 1e-6)
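# The Hyperdash(...) Keras callback used above (and in the earlier `objective`
# function) is referenced but never defined in this section. As a rough
# illustration only, a callback that forwards selected entries of Keras' `logs`
# dict to a Hyperdash Experiment could look like the sketch below; the class
# name and constructor arguments are assumptions, not the project's actual
# implementation.
from keras.callbacks import Callback  # or tensorflow.keras.callbacks, depending on the setup


class HyperdashMetricsCallback(Callback):
    """Illustrative callback: push chosen Keras metrics to a Hyperdash Experiment."""

    def __init__(self, metric_names, experiment):
        super().__init__()
        self.metric_names = metric_names  # e.g. ['loss', 'val_acc']
        self.experiment = experiment      # an already-started hyperdash Experiment

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        for name in self.metric_names:
            if name in logs:
                # one data point per epoch for each tracked metric
                self.experiment.metric(name, float(logs[name]))

# A callback like this would simply be appended to the `callbacks` list next to
# ModelCheckpoint and EarlyStopping, as the snippets above already do with their
# own Hyperdash(...) helper.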
class NeuralNet(object): LAYER1_SIZE = 522 # 12 LAYER2_SIZE = 256 # 6 LAYER3_SIZE = 128 LAYER4_SIZE = 64 LAYER5_SIZE = 32 OUTPUT_SIZE = 1 def __init__(self, name='nn-model', cached_model=None, seed=None, lr=1e-4, training=False): self.graph = tf.Graph() self.training = training if self.training: from hyperdash import Experiment self.exp = Experiment(name) with self.graph.as_default(): if seed is not None: tf.set_random_seed(seed) self.session = tf.Session() self.features = tf.placeholder(dtype=tf.float32, name="input_features", shape=(None, PLANET_MAX_NUM, PER_PLANET_FEATURES)) # target_distribution describes what the bot did in a real game. # For instance, if it sent 20% of the ships to the first planet and 15% of the ships to the second planet, # then expected_distribution = [0.2, 0.15 ...] self.target_distribution = tf.placeholder( dtype=tf.float32, name="target_distribution", shape=(None, PLANET_MAX_NUM)) # Combine all the planets from all the frames together, so it's easier to share # the weights and biases between them in the network. flattened_frames = tf.reshape(self.features, [-1, PER_PLANET_FEATURES]) layer1 = fully_connected(flattened_frames, 512) layer2 = fully_connected(layer1, 256) layer3 = fully_connected(layer2, 128) # Group back into frames layer4 = fully_connected(layer3, 64) layer5 = fully_connected(layer4, 32) layer6 = fully_connected(layer5, 1, activation_fn=None) logits = tf.reshape(layer6, [-1, PLANET_MAX_NUM]) self.prediction_normalized = tf.nn.softmax(logits) self.loss_op = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=self.target_distribution)) self.optimizer = tf.train.AdamOptimizer( learning_rate=lr) # returns Op self.train_op = self.optimizer.minimize(self.loss_op) # self.acc_op = tf.reduce_mean(tf.reduce_min(tf.cast(self.prediction_normalized, tf.float32), 1)) # self.acc, self.update_acc_op = tf.metrics.mean_per_class_accuracy(self.target_distribution, self.prediction_normalized, 28) # multilabel_accuracy(self.prediction_normalized, self.target_distribution) self.saver = tf.train.Saver() if self.training: self.exp.param("lr", lr) if cached_model is None: self.session.run([ tf.global_variables_initializer(), tf.local_variables_initializer() ]) else: self.session.run(tf.local_variables_initializer()) self.saver.restore(self.session, cached_model) def fit(self, input_data, expected_output_data): loss, _ = self.session.run( [self.loss_op, self.train_op], feed_dict={ self.features: normalize_input(input_data), self.target_distribution: expected_output_data }) if self.training: self.exp.metric("training_loss", loss) return loss def predict(self, input_data): """ Given data from 1 frame, predict where the ships should be sent. :param input_data: numpy array of shape (PLANET_MAX_NUM, PER_PLANET_FEATURES) :return: 1-D numpy array of length (PLANET_MAX_NUM) describing percentage of ships that should be sent to each planet """ return self.session.run( self.prediction_normalized, feed_dict={self.features: normalize_input(np.array([input_data]))})[0] def compute_loss(self, input_data, expected_output_data): """ Compute loss on the input data without running any training. 
:param input_data: numpy array of shape (number of frames, PLANET_MAX_NUM, PER_PLANET_FEATURES) :param expected_output_data: numpy array of shape (number of frames, PLANET_MAX_NUM) :return: training loss on the input data """ loss = self.session.run(self.loss_op, feed_dict={ self.features: normalize_input(input_data), self.target_distribution: expected_output_data }) if self.training: self.exp.metric("val_loss", loss) return loss def save(self, path): """ Serializes this neural net to given path. :param path: """ self.saver.save(self.session, path)
def train_reconstruction(args):
    device = torch.device(args.gpu)
    print("Loading embedding model...")
    with open(os.path.join(CONFIG.DATASET_PATH, args.target_dataset, 'word_embedding.p'), "rb") as f:
        embedding_model = cPickle.load(f)
    with open(os.path.join(CONFIG.DATASET_PATH, args.target_dataset, 'word_idx.json'), "r", encoding='utf-8') as f:
        word_idx = json.load(f)
    print("Loading embedding model completed")
    print("Loading dataset...")
    train_dataset, val_dataset = load_text_data(args, CONFIG, word2idx=word_idx[1])
    print("Loading dataset completed")
    train_loader, val_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=args.shuffle), \
        DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    # t1 = max_sentence_len + 2 * (args.filter_shape - 1)
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor((t1 - args.filter_shape) / 2) + 1)  # "2" means stride size
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3

    embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_model))
    text_encoder = text_model.ConvolutionEncoder(embedding, t3, args.filter_size, args.filter_shape, args.latent_size)
    text_decoder = text_model.DeconvolutionDecoder(embedding, args.tau, t3, args.filter_size, args.filter_shape,
                                                   args.latent_size, device)
    if args.resume:
        print("Restart from checkpoint")
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH, args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        text_encoder.load_state_dict(checkpoint['text_encoder'])
        text_decoder.load_state_dict(checkpoint['text_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    text_autoencoder = text_model.TextAutoencoder(text_encoder, text_decoder)
    criterion = nn.NLLLoss().to(device)
    text_autoencoder.to(device)

    # Base lr is 1.0 so that LambdaLR's multiplicative factor becomes the actual learning rate.
    optimizer = AdamW(text_autoencoder.parameters(), lr=1., weight_decay=args.weight_decay, amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size, min_lr=args.lr, max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    exp = Experiment("Text autoencoder " + str(args.latent_size), capture_io=False)
    for arg, value in vars(args).items():
        exp.param(arg, value)

    try:
        text_autoencoder.train()
        for epoch in range(start_epoch, args.epochs):
            print("Epoch: {}".format(epoch))
            for steps, batch in enumerate(train_loader):
                torch.cuda.empty_cache()
                feature = Variable(batch).to(device)
                optimizer.zero_grad()
                prob = text_autoencoder(feature)
                loss = criterion(prob.transpose(1, 2), feature)
                loss.backward()
                optimizer.step()
                scheduler.step()
                if (steps * args.batch_size) % args.log_interval == 0:
                    input_data = feature[0]
                    single_data = prob[0]
                    _, predict_index = torch.max(single_data, 1)
                    input_sentence = util.transform_idx2word(input_data.detach().cpu().numpy(),
                                                             idx2word=word_idx[0])
                    predict_sentence = util.transform_idx2word(predict_index.detach().cpu().numpy(),
                                                               idx2word=word_idx[0])
                    print("Epoch: {} at {} lr: {}".format(epoch, str(datetime.datetime.now()),
                                                          str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                    print("Input Sentence:")
                    print(input_sentence)
                    print("Output Sentence:")
                    print(predict_sentence)
                    del input_data, single_data, _, predict_index
                del feature, prob, loss

            exp.log("\nEpoch: {} at {} lr: {}".format(epoch, str(datetime.datetime.now()), str(scheduler.get_lr())))
            _avg_loss, _rouge_1, _rouge_2 = eval_reconstruction_with_rouge(text_autoencoder, word_idx[0], criterion,
                                                                           val_loader, device)
            exp.log("\nEvaluation - loss: {} Rouge1: {} Rouge2: {}".format(_avg_loss, _rouge_1, _rouge_2))
            util.save_models({
                'epoch': epoch + 1,
                'text_encoder': text_encoder.state_dict(),
                'text_decoder': text_decoder.state_dict(),
                'avg_loss': _avg_loss,
                'Rouge1': _rouge_1,
                'Rouge2': _rouge_2,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }, CONFIG.CHECKPOINT_PATH, "text_autoencoder_" + str(args.latent_size))

        print("Finish!!!")
    finally:
        exp.end()
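# The schedule above relies on a project helper, cyclical_lr(step_size, min_lr, max_lr),
# that is defined elsewhere. Because the optimizer is built with lr=1.0, LambdaLR's
# multiplicative factor effectively is the learning rate, so the lambda can return the
# absolute value directly. Below is a minimal sketch of such a helper, assuming a
# triangular cycle between min_lr and max_lr; it is illustrative, not the project's code.
import math

def cyclical_lr_sketch(step_size, min_lr, max_lr):
    def lr_lambda(iteration):
        cycle = math.floor(1 + iteration / (2 * step_size))
        x = abs(iteration / step_size - 2 * cycle + 1)
        return min_lr + (max_lr - min_lr) * max(0.0, 1.0 - x)
    return lr_lambda

# Hypothetical usage, mirroring the call above:
# clr = cyclical_lr_sketch(step_size, min_lr=args.lr, max_lr=args.lr * args.lr_factor)
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])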
def main(job_dir, data_path, model_id, weights_path, loss, train_csv, val_csv, batch_size,
         train_epocs, optimizer, is_tpu, lr, hyperdash_key, **args):
    logging.getLogger().setLevel(logging.INFO)
    if not os.path.exists("output"):
        os.makedirs("output")

    batch_size *= 3
    is_full_data = False
    hyperdash_capture_io = True

    # Setting up Hyperdash
    def get_api_key():
        return hyperdash_key

    if hyperdash_key:
        exp = Experiment(model_id, get_api_key, capture_io=hyperdash_capture_io)
        exp.param("model_name", job_dir.split("/")[-1])
        exp.param("data_path", data_path)
        exp.param("batch_size", batch_size)
        exp.param("train_epocs", train_epocs)
        exp.param("optimizer", optimizer)
        exp.param("lr", lr)
        if weights_path:
            exp.param("weights_path", weights_path)
        exp.param("loss", loss)
        exp.param("train_csv", train_csv)
        exp.param("val_csv", val_csv)

    logging.info("Downloading Training Image from path {}".format(data_path))
    downloads_training_images(data_path, is_cropped=("_cropped" in job_dir))

    logging.info("Building Model: {}".format(model_id))
    if model_id in globals():
        model_getter = globals()[model_id]
        model = model_getter()
    else:
        raise RuntimeError("Failed. Model function {} not found".format(model_id))

    if loss + "_fn" in globals():
        _loss_tensor = globals()[loss + "_fn"](batch_size)
    else:
        raise RuntimeError("Failed. Loss function {} not found".format(loss + "_fn"))

    accuracy = accuracy_fn(batch_size)
    img_width, img_height = [int(v) for v in model.input[0].shape[1:3]]

    trainable_count, non_trainable_count = print_trainable_counts(model)
    if hyperdash_key:
        exp.param("trainable_count", trainable_count)
        exp.param("non_trainable_count", non_trainable_count)

    dg = DataGenerator({
        "rescale": 1. / 255,
        "horizontal_flip": True,
        "vertical_flip": True,
        "zoom_range": 0.2,
        "shear_range": 0.2,
        "rotation_range": 30,
        "fill_mode": 'nearest'
    }, data_path, train_csv, val_csv, target_size=(img_width, img_height))
    train_generator = dg.get_train_generator(batch_size, is_full_data)
    test_generator = dg.get_test_generator(batch_size)

    if weights_path:
        # Copy the HDF5 weights locally in binary mode before loading them.
        with file_io.FileIO(weights_path, mode='rb') as input_f:
            with file_io.FileIO("weights.h5", mode='wb+') as output_f:
                output_f.write(input_f.read())
        model.load_weights("weights.h5")

    # model = multi_gpu_model(model, gpus=4)
    if optimizer == "mo":
        model.compile(loss=_loss_tensor,
                      optimizer=tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9, use_nesterov=True),
                      metrics=[accuracy])
    elif optimizer == "rms":
        model.compile(loss=_loss_tensor, optimizer=tf.train.RMSPropOptimizer(lr), metrics=[accuracy])
    else:
        logging.error("Optimizer not supported")
        return

    csv_logger = CSVLogger(job_dir, "output/training.log")
    model_checkpoint_path = "weights-improvement-{epoch:02d}-{val_loss:.2f}.h5"
    model_checkpointer = ModelCheckpoint(job_dir, model_checkpoint_path, save_best_only=True,
                                         save_weights_only=True, monitor="val_loss", verbose=1)
    tensorboard = TensorBoard(log_dir=job_dir + '/logs/', histogram_freq=0, write_graph=True, write_images=True)
    # test_accuracy = TestAccuracy(data_path)  # Not using test data as of now
    callbacks = [csv_logger, model_checkpointer, tensorboard]
    if hyperdash_key:
        callbacks.append(HyperdashCallback(exp))

    model_json = model.to_json()
    write_file_and_backup(model_json, job_dir, "output/model.def")
    with open("output/model_code.pkl", 'wb') as f:
        dill.dump(model_getter, f)
    backup_file(job_dir, "output/model_code.pkl")
    model_code = inspect.getsource(model_getter)
    write_file_and_backup(model_code, job_dir, "output/model_code.txt")

    if is_tpu:
        model = tf.contrib.tpu.keras_to_tpu_model(
            model,
            strategy=tf.contrib.tpu.TPUDistributionStrategy(
                tf.contrib.cluster_resolver.TPUClusterResolver(os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'])
            )
        )

    history = model.fit_generator(train_generator,
                                  steps_per_epoch=(train_generator.n // train_generator.batch_size),
                                  validation_data=test_generator,
                                  epochs=train_epocs,
                                  validation_steps=(test_generator.n // test_generator.batch_size),
                                  callbacks=callbacks)
    pd.DataFrame(history.history).to_csv("output/history.csv")
    backup_file(job_dir, "output/history.csv")
    model.save_weights('output/model.h5')
    backup_file(job_dir, 'output/model.h5')
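# `HyperdashCallback` above comes from elsewhere in this project. A minimal sketch of what
# such a Keras callback could look like, assuming it only forwards the metrics Keras
# collects at the end of each epoch (loss, val_loss, ...) to the Hyperdash experiment:
from keras.callbacks import Callback

class HyperdashCallbackSketch(Callback):
    def __init__(self, exp):
        super(HyperdashCallbackSketch, self).__init__()
        self.exp = exp  # a hyperdash Experiment instance

    def on_epoch_end(self, epoch, logs=None):
        # Forward every numeric metric Keras logged for this epoch to Hyperdash.
        for name, value in (logs or {}).items():
            self.exp.metric(name, float(value))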
class Experiment:
    @logger.read
    def __init__(self, dataset_name, model_name, optimizer_name, trial_num):
        """
        :param dataset_name: name of the dataset
        :type dataset_name: str
        :param model_name: name of the model
        :type model_name: str
        :param optimizer_name: name of the optimizer
        :type optimizer_name: str
        :param trial_num: current number of repeated trials
        :type trial_num: int
        """
        # get optimized hyperparameters
        with open(f'../params/{dataset_name}_{model_name}_{optimizer_name}/result.json') as f:
            params = json.load(f)

        # get instances
        self.dataset = Datasets.get(dataset_name)
        self.model = Models.get(model_name, dataset=self.dataset)
        self.optimizer = Optimizers.get(optimizer_name, params=params)

        # get config
        with open('./config.json') as f:
            config = json.load(f)

        # get constants
        c = config['constants'][dataset_name][model_name]
        self.loss = c['loss']
        self.batch_size = c['batch_size']
        self.epochs = c['epochs']

        # configure and initialize directory
        d = self.main_dir = f'../data/{dataset_name}_{model_name}_{optimizer_name}/trial{trial_num}'
        if os.path.exists(d):
            shutil.rmtree(d)
        os.makedirs(d)

        # configure hyperdash experiment
        self.hd_exp = HyperdashExperiment(
            f'{dataset_name}',
            api_key_getter=lambda: config['hyperdash']['api_key'])
        self.hd_exp.param('dataset_name', dataset_name)
        self.hd_exp.param('model_name', model_name)
        self.hd_exp.param('optimizer_name', optimizer_name)
        self.hd_exp.param('trial_num', trial_num)
        for k, v in params.items():
            self.hd_exp.param(k, v)

        # set callbacks
        self.callbacks = [
            Hyperdash(['accuracy', 'loss', 'val_accuracy', 'val_loss'], self.hd_exp),
            TensorBoard(log_dir=f'{self.main_dir}/tensorboard'),
            TimeLogger(filename=f'{self.main_dir}/time.csv'),
            CSVLogger(filename=f'{self.main_dir}/result.csv', append=True)
        ]

    @logger.write
    def begin(self):
        # get data
        (x_train, y_train), (x_test, y_test) = self.dataset.get_batch()

        # start learning
        self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
        self.model.fit(x_train,
                       y_train,
                       batch_size=self.batch_size,
                       epochs=self.epochs,
                       callbacks=self.callbacks,
                       validation_split=0.2,
                       verbose=2)

        # save final scores
        score = self.model.evaluate(x_test, y_test, verbose=1)
        with open(f'{self.main_dir}/test.json', 'w') as f:
            json.dump({'test loss': score[0], 'test accuracy': score[1]}, f, indent=4)

        # stop hyperdash experiment
        self.hd_exp.end()
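# A hedged usage sketch for the Experiment class above. The dataset, model and optimizer
# names and the trial count are hypothetical; the corresponding entries must exist in the
# Datasets/Models/Optimizers registries, in ../params and in ./config.json for this to run.
if __name__ == '__main__':
    for trial in range(3):
        Experiment('mnist', 'cnn', 'adam', trial).begin()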
print("MODEL LOADED, CONTINUING TRAINING") return "TRAINING AVG LOSS: {}\n" \ "TRAINING AVG DIFF: {}".format( checkpoint["epoch_avg_loss"], checkpoint["epoch_avg_diff"]) else: if optional: pass # model loading was optional, so nothing to do else: # shit, no model raise Exception("model couldn't be found:", MODEL_PATH_BEST) loss_function = nn.MSELoss() if hyperdash_support: exp = Experiment("simple lstm - pusher simple") exp.param("layers", LSTM_LAYERS) exp.param("nodes", HIDDEN_NODES) exp.param("action steps", ACTION_STEPS) if TRAIN: optimizer = optim.Adam(net.parameters()) if CONTINUE: old_model_string = loadModel(optional=True) print(old_model_string) else: old_model_string = loadModel(optional=False) loss_min = [float('inf')] for epoch in np.arange(EPOCHS):