def __init__(self, flags, word_index_to_embeddings_map, train_or_others):
    """Configure hyperparameters from `flags` and assemble the TF graph.

    The graph is built in stages: placeholders, embedding lookup, sentence
    encoding (LSTM, then concat and max pooling), attention over w0/w1, and
    the prediction/loss ops.  The Saver is created last so it covers every
    variable added by the preceding stages.
    """
    # Copy the scalar hyperparameters straight off the flags object.
    for attr in ('lstm_size', 'max_grad_norm', 'batch_size', 'learning_rate',
                 'max_len', 'embedding_size', 'num_layers', 'rich_context',
                 'l2_strength'):
        setattr(self, attr, getattr(flags, attr))
    self.word_index_to_embeddings_map = word_index_to_embeddings_map
    self.train_or_others = train_or_others
    # Dropout keep-probability is fed at run time.
    self.keep_prob_placeholder = tf.placeholder(tf.float32, shape=[], name='keep_prob')
    self.global_step = tf.Variable(0, trainable=False)
    print_params(flags)
    self.sentences_placeholder()
    self.get_embedding()
    # Sentence -> LSTM, then concat and max pooling.
    self.build_sentences()
    # Attention for both w0 and w1.
    self.attention()
    self.pred_loss()
    self.saver = tf.train.Saver()
def __init__(self, cfg, try_load_best=False):
    """Set up the VQ-VAE model, optimizer and TensorBoard writer, then resume
    from a checkpoint if one exists.

    When `try_load_best` is true and the best-checkpoint file is present, it
    is preferred over the regular checkpoint path `cfg.ckp`.
    """
    self.cfg = cfg
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = VectorQuantizedVAE(cfg.input_dim, cfg.dim, K=cfg.K).to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
    utils.print_params(self.model)
    self.writer = SummaryWriter(cfg.tbp)
    self.last_epoch = 0
    self.best_loss = None

    # Decide which checkpoint file (if any) to resume from.
    path = cfg.ckp
    if try_load_best and os.path.isfile(cfg.ckp_best):
        path = cfg.ckp_best
    if os.path.isfile(path):
        state = torch.load(path)
        self.model.load_state_dict(state['model_state_dict'])
        self.optimizer.load_state_dict(state['optimizer_state_dict'])
        self.last_epoch = state['epoch']
        self.best_loss = state['best_loss']
        print('Load {}! Epoch: {} Best loss: {:.4f}'.format(
            path, self.last_epoch, self.best_loss))
def restore_model(session, folder, train_data, validation_data, test_data, override_params=None):
    """Rebuild the computation graph and restore the latest checkpoint from `folder`.

    Fix: `override_params` previously used a mutable default argument (`{}`),
    which is shared across calls in Python; it is now `None` with an in-body
    default, which is backward-compatible for all callers.

    Args:
        session: TF session to restore variables into.
        folder: directory containing hyperparams.json and checkpoints.
        train_data: used only to compute steps-per-epoch for the saved params.
        validation_data, test_data: passed through to graph construction.
        override_params: optional dict; keys already present in the saved
            hyperparams are overridden with these values.

    Returns:
        [graph, hyperparams, last_epoch] on success, or [] if `folder` is missing.
    """
    if override_params is None:
        override_params = {}
    print('Restoring model from %s' % folder)
    if not os.path.exists(folder):
        print('Error: Folder does not exist', folder)
        return []
    # Load saved parameters and override if needed.
    params_path = folder + '/hyperparams.json'
    hyperparams = load_hyperparams(params_path)
    last_num_steps_per_epoch = utils.calc_num_steps_per_epoch(train_data, hyperparams)
    for param in override_params:
        if param in hyperparams:
            hyperparams[param] = override_params[param]
    utils.print_params(hyperparams)
    # Rebuild the graph so the saver has variables to restore into.
    graph = computation_graph.build_graph(hyperparams, validation_data, test_data)
    # Restore variables from the most recent checkpoint on disk.
    model_path = tf.train.latest_checkpoint(folder)
    print('Restoring model %s' % model_path)
    saver = graph.saver
    saver.restore(session, model_path)
    print("Model restored.")
    global_step = graph.global_step
    last_step = global_step.eval()
    last_epoch = last_step // last_num_steps_per_epoch
    print('Restored global_step %d last_epoch %d' % (last_step, last_epoch))
    return [graph, hyperparams, last_epoch]
def run_model(hyperparams, data, save_to_folder):
    """Build the graph, train the model, and print/return final accuracies.

    data: [train_dataset, train_labels, valid_dataset, valid_labels,
           test_dataset, test_labels]

    Returns the accuracy tuple produced by model_training.train_model:
    (train, validation, test).
    """
    if save_to_folder != '':
        hyperparams['save_folder'] = save_to_folder
    utils.print_params(hyperparams)

    with tf.Graph().as_default(), tf.Session() as session:
        # Unpack the six dataset components.
        (train_dataset, train_labels, valid_dataset, valid_labels,
         test_dataset, test_labels) = data
        # Build and run the graph-flow model.
        graph = computation_graph.build_graph(hyperparams, valid_dataset, test_dataset)
        accuracy = model_training.train_model(session, graph, hyperparams,
                                              train_dataset, train_labels,
                                              valid_labels, test_labels)

    # Report final results.
    print('------Final results-------')
    print('Final train accuracy %1.2f, validation accuracy %1.2f %%' % (accuracy[0], accuracy[1]))
    print("Test accuracy: %1.2f%%" % accuracy[2])
    return accuracy
def _two_label_performance(target_names, params):
    """Fit a Hopfield net on the given pair of labels and measure recall quality.

    Mutates `params` by storing `target_names` in it, then returns
    (mean similarity, mean accuracy) over the recalled patterns.
    """
    noise_amount = params['noise_amount']
    params['target_names'] = target_names
    print_params(**params)

    # Fit the Hopfield network.
    print('\n.. fitting hopfield\n')
    hf, X, y, target_names, params = fit_hopfield(params)
    print_params(**params)

    # Recall from noisy versions of the stored patterns.
    print('\n.. recalling\n')
    X, X_noise, X_recall = recall_with_noise(clf=hf, X=X, noise_amount=noise_amount)

    print_header('result')
    similarities, accurate = get_recalling_performance(X, X_recall)
    similarity = np.mean(similarities)
    accuracy = np.mean(accurate)
    print('similarity:', similarity)
    print('accuracy:', accuracy)
    return similarity, accuracy
def main_process(dtrain, dtest, params, epsilon, stop_value=None):
    """Greedy neighborhood search over xgboost hyperparameters, scored by CV MAE.

    Fix: the function tracked and printed `best_params` as "best solution" but
    returned the untouched starting `params`; it now returns `best_params`
    (matching the sibling `random_process`, which returns its best params).

    Args:
        dtrain: xgboost DMatrix used for cross-validation.
        dtest: unused here (kept for interface parity).
        params: starting hyperparameter dict.
        epsilon: convergence threshold on the MAE improvement between rounds.
        stop_value: optional MAE value at which to stop the inner sweep early.

    Returns:
        (best_params, min_mae, iterations)
    """
    print("Starting hyperparameter tuning with start params:")
    print(utils.print_params(params))
    print("With epsilon (stop) value: {}".format(epsilon))

    gradients = utils.get_gradient_list(params, global_constraint.STEP)
    steps = utils.get_possible_steps(params, gradients, [])
    min_mae = float("Inf")
    step_mae = float("Inf")
    iterations = 0
    best_params = params.copy()
    last_steps = []

    while True:
        last_steps = steps.copy()
        for step_params in steps:
            print(utils.print_params(step_params))
            cv_results = xgb.cv(step_params, dtrain, num_boost_round=10, seed=42,
                                nfold=5, metrics={'mae'}, early_stopping_rounds=10)
            mean_mae = cv_results['test-mae-mean'].min()
            boost_rounds = cv_results['test-mae-mean'].argmin()
            print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
            iterations = iterations + 1
            print(iterations)
            if mean_mae < min_mae:
                min_mae = mean_mae
                best_params = step_params.copy()
            # NOTE(review): this only exits the inner sweep, not the outer
            # while-loop — presumably intentional best-effort stop; confirm.
            if stop_value is not None and min_mae < stop_value:
                break
        if abs(step_mae - min_mae) < epsilon:
            # Improvement below epsilon: shrink the step size and retry,
            # unless we have already spent the iteration budget.
            if iterations < 500:
                utils.reduce_steps()
                step_mae = min_mae
                steps = utils.get_possible_steps(best_params, gradients, last_steps)
            else:
                break
        else:
            step_mae = min_mae
            steps = utils.get_possible_steps(best_params, gradients, last_steps)
            print(len(steps))
        print(len(steps))

    print("Found best solution:")
    print(utils.print_params(best_params))
    print("MAE:")
    print(min_mae)
    return (best_params, min_mae, iterations)
def main():
    """Run the full pipeline: input conversion, elemental-image generation,
    and sub-aperture image extraction, reporting elapsed time at the end."""
    # --- Input stage ---
    print('\nInput Stage...')
    start = time.time()
    inputs = get_input_params()
    # Convert mm-based inputs to pixel units.
    cvt_inputs = cvt_mm2pixel(inputs, pitch_of_pixel=inputs['P_D'])
    # Convert the depth data.
    inputstage = InputStage(inputs['name'], int(args.f), int(args.g), args.is_prediction)
    d, P_I, delta_d, color, L = inputstage.convert_depth(
        inputs['color'], cvt_inputs['depth'], cvt_inputs['f'],
        cvt_inputs['g'], cvt_inputs['P_D'], cvt_inputs['P_L'])
    print('Input Stage Done.')
    # Print parameters.
    utils.print_params(inputs, cvt_inputs, d, P_I, delta_d, color, L)

    # --- Calculation stage: generate elemental images ---
    print('\nCalculation Stage...')
    calculationstage = CalculationStage(inputs['name'], int(args.f), int(args.g),
                                        args.is_prediction)
    if args.is_gpu:
        elem_plane = calculationstage.generate_elemental_imgs_GPU(
            color, L, int(cvt_inputs['P_L']), P_I, cvt_inputs['g'],
            inputs['num_of_lenses'])
    else:
        elem_plane = calculationstage.generate_elemental_imgs_CPU(
            color, L, int(cvt_inputs['P_L']), P_I, cvt_inputs['g'],
            inputs['num_of_lenses'])
    print('Elemental Image Array generated.')

    # --- Sub-aperture images ---
    print('\nGenerate sub aperture images...')
    aperture = SubAperture(inputs['name'], int(args.f), int(args.g), args.is_prediction)
    sub_apertures = aperture.generate_sub_apertures(elem_plane,
                                                    int(cvt_inputs['P_L']),
                                                    inputs['num_of_lenses'])
    print('Sub-Aperture Images generated.')

    print('\nElapsed time : {}s'.format(time.time() - start))
    print('Done.')
def train(self):
    """Train the agent-act prediction model one Keras epoch at a time,
    evaluating on the dev set and saving weights after every epoch."""
    print('Training model ...')
    # Pull vocabulary/shape parameters from the training data and persist
    # them so inference can rebuild the same configuration.
    self.window_size = self.train_data.window_size
    self.userTagIntent_vocab_size = self.train_data.userTagIntent_vocab_size
    self.agentAct_vocab_size = self.train_data.agentAct_vocab_size
    self.id2agentAct = self.train_data.id2agentAct
    other_npz = '{}/other_vars.npz'.format(self.model_folder)
    train_vars = {'window_size': self.window_size,
                  'userTagIntent_vocab_size': self.userTagIntent_vocab_size,
                  'agentAct_vocab_size': self.agentAct_vocab_size,
                  'id2agentAct': self.id2agentAct}
    np.savez_compressed(other_npz, **train_vars)
    self.params['window_size'] = self.window_size
    self.params['userTagIntent_vocab_size'] = self.userTagIntent_vocab_size
    self.params['agentAct_vocab_size'] = self.agentAct_vocab_size
    print_params(self.params)

    # Build the model graph, then save and plot the architecture.
    self._build()
    self._plot_graph()
    graph_yaml = '{}/graph-arch.yaml'.format(self.model_folder)
    with open(graph_yaml, 'w') as fyaml:
        fyaml.write(self.model.to_yaml())

    # Training data plus the reference target file.
    X_train = self.train_data.userTagIntent_vecBin
    y_train = self.train_data.agentAct_vecBin
    train_utter_txt = self.train_data.userUtter_txt
    train_act_txt = self.train_data.agentAct_txt
    train_fname = '{}/train.target'.format(self.model_folder)
    writeUtterActTxt(train_utter_txt, train_act_txt, train_fname)

    # Dev data plus the reference target file.
    X_dev = self.dev_data.userTagIntent_vecBin
    y_dev = self.dev_data.agentAct_vecBin
    dev_utter_txt = self.dev_data.userUtter_txt
    dev_act_txt = self.dev_data.agentAct_txt
    dev_fname = '{}/dev.target'.format(self.model_folder)
    writeUtterActTxt(dev_utter_txt, dev_act_txt, dev_fname)

    # nb_epoch=1 inside the loop so each epoch can be scored and checkpointed.
    for ep in xrange(self.epoch_nb):
        print('<Epoch {}>'.format(ep))
        self.model.fit(x=X_train, y=y_train, batch_size=self.batch_size,
                       nb_epoch=1, verbose=2)
        act_probs = self.model.predict(X_dev)
        precision, recall, fscore, accuracy_frame, threshold = \
            eval_intentPredict(act_probs, y_dev)
        print('ep={}, precision={:.4f}, recall={:.4f}, fscore={:.4f}, '
              'accuracy_frame={:.4f}, threshold={:.4f}'.format(
                  ep, precision, recall, fscore, accuracy_frame, threshold))
        dev_pred_txt = getActPred(act_probs, threshold, self.id2agentAct)
        dev_results_fname = '{}/dev_results/dev_ep={}.pred'.format(self.model_folder, ep)
        writeUtterActTxt(dev_utter_txt, dev_pred_txt, dev_results_fname)
        print('Write dev results: {}'.format(dev_results_fname))
        weights_fname = '{}/weights/ep={}_f1={:.4f}_frameAcc={:.4f}_th={:.4f}.h5'.format(
            self.model_folder, ep, fscore, accuracy_frame, threshold)
        print('Saving Model: {}'.format(weights_fname))
        self.model.save_weights(weights_fname, overwrite=True)
def __init__(self):
    """Create the VAE trainer and resume from 'results/vae.pt' when it exists."""
    self.ckp = 'results/vae.pt'
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = VAE().to(self.device)
    self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
    utils.print_params(self.model)
    self.writer = SummaryWriter('runs/vae')
    self.last_epoch = 0
    if os.path.isfile(self.ckp):
        # Resume model/optimizer state from the saved checkpoint.
        state = torch.load(self.ckp)
        self.model.load_state_dict(state['model_state_dict'])
        self.optimizer.load_state_dict(state['optimizer_state_dict'])
        self.last_epoch = state['epoch']
        loss = state['loss']
        print(
            'Load checkpoint! Last Epoch: {} Average loss: {:.4f}'.format(
                self.last_epoch, loss))
def hopfield_single_performance(
    n_sample,
    n_label,
    noise_amount,
    fit_mode,
    save_fig,
):
    """Fit one Hopfield network, recall with noise, and report performance.

    Optionally saves a figure comparing original / noisy / recalled images.
    Returns (mean similarity, mean accuracy).
    """
    params = {
        'n_sample': n_sample,
        'n_label': n_label,
        'noise_amount': noise_amount,
        'fit_mode': fit_mode,
    }
    print_params(**params)

    # Fit the Hopfield network.
    print('\n.. fitting hopfield\n')
    hf, X, y, target_names, params = fit_hopfield(params)
    print_params(**params)

    # Recall from noisy inputs.
    print('\n.. recalling\n')
    X, X_noise, X_recall = recall_with_noise(clf=hf, X=X, noise_amount=noise_amount)

    print_header('result')
    similarities, accurate = get_recalling_performance(X, X_recall)
    print('similarity:', np.mean(similarities))
    print('accuracy:', np.mean(accurate))

    # Compare the three image sets and save the figure if requested.
    if save_fig:
        print('\n.. view recalling result\n')
        view_recalling_result(X, X_noise, X_recall, accurate=accurate, **params)

    similarity = np.mean(similarities)
    accuracy = np.mean(accurate)
    return similarity, accuracy
def hopfield_two_label_performance(
    n_sample,
    noise_amount,
    fit_mode,
    save_fig,
):
    """Score recall performance for every 2-label combination of 'chiltx'
    and save a grouped bar chart of similarities vs. accuracies."""
    params = {
        'n_sample': n_sample,
        'noise_amount': noise_amount,
        'fit_mode': fit_mode,
    }
    print_params(**params)

    labels, similarities, accuracies = [], [], []
    for target_names in itertools.combinations('chiltx', 2):
        similarity, accuracy = _two_label_performance(target_names, params)
        labels.append(','.join(target_names))
        similarities.append(similarity)
        accuracies.append(accuracy)

    print('labels:', labels)
    print('similarities:', similarities)
    print('accuracies:', accuracies)

    # Grouped bar chart: one pair of bars per label combination.
    fig, ax = plt.subplots()
    ind = np.arange(len(labels))
    width = 0.35
    ax.bar(ind, similarities, width, label='similarities', color='r')
    ax.bar(ind + width, accuracies, width, label='accuracies', color='b')
    ax.set_xlabel('two labels')
    ax.set_ylabel('performance')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(labels)
    ax.set_ylim(0, 1)
    plt.legend(loc='lower right')
    # plt.show()
    plt.savefig('two_label_performance.png')
def random_process(dtrain, dtest, iterations):
    """Random-search xgboost hyperparameters, scoring by cross-validated MAE.

    Fix: `best_params` was referenced at the end without being initialized,
    so `iterations == 0` raised UnboundLocalError; it is now initialized to
    an empty dict so the function always returns a (dict, float) pair.

    Args:
        dtrain: xgboost DMatrix used for cross-validation.
        dtest: unused here (kept for interface parity).
        iterations: number of random configurations to evaluate.

    Returns:
        (best_params, min_mae)
    """
    print("Starting hyperparameter tuning with start params:")
    random.seed(a=42)  # deterministic search for reproducibility
    min_mae = float("Inf")
    best_params = {}
    l = 0
    for i in range(0, iterations):
        step_params = {
            'max_depth': random.randint(0, 10),
            'min_child_weight': random.randint(0, 10),
            'eta': random.uniform(LOWER_BOUND, 1),
            'subsample': random.uniform(LOWER_BOUND, 1),
            'colsample_bytree': random.uniform(LOWER_BOUND, 1),
            'objective': 'reg:linear'
        }
        print(utils.print_params(step_params))
        cv_results = xgb.cv(step_params, dtrain, num_boost_round=10, seed=42,
                            nfold=5, metrics={'mae'}, early_stopping_rounds=10)
        mean_mae = cv_results['test-mae-mean'].min()
        boost_rounds = cv_results['test-mae-mean'].argmin()
        print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
        if mean_mae < min_mae:
            min_mae = mean_mae
            best_params = step_params.copy()
        l = l + 1
        print(l)
    print("\t")
    print(l)
    print("Found best solution:")
    print(utils.print_params(best_params))
    print("MAE:")
    print(min_mae)
    return (best_params, min_mae)
def __init__(self, cfg, try_load_best=False):
    """Build the GatedPixelCNN trainer and resume from a checkpoint when available.

    When `try_load_best` is true and the best-checkpoint file exists, it is
    preferred over the regular checkpoint path `cfg.ckp`.
    """
    self.cfg = cfg
    self.net = GatedPixelCNN(input_dim=cfg.input_dim, dim=cfg.dim,
                             n_layers=cfg.n_layers,
                             n_classes=cfg.n_classes).cuda()
    utils.print_params(self.net)
    self.optimizer = optim.Adam(self.net.parameters())
    self.last_epoch = 0
    self.best_loss = None
    self.writer = SummaryWriter(cfg.tbp)

    # Decide which checkpoint file (if any) to resume from.
    path = cfg.ckp
    if try_load_best and os.path.isfile(cfg.ckp_best):
        path = cfg.ckp_best
    if os.path.isfile(path):
        state = torch.load(path)
        self.net.load_state_dict(state['net'])
        self.optimizer.load_state_dict(state['optimizer'])
        self.last_epoch = state['epoch']
        self.best_loss = state['best_loss']
        print('Load {}! Epoch: {} Best loss: {:.4f}'.format(
            path, self.last_epoch, self.best_loss))
def random_process_class(dtrain, dtest, iterations, y_test):
    """Random-search xgboost hyperparameters for binary classification,
    scoring by raw count of correct test predictions.

    Fix: `best_params` was unbound when no iteration ran (or none beat the
    initial accuracy of 0); it is now initialized up front. The unused
    `err = 0` local was removed.

    Args:
        dtrain: xgboost DMatrix for training.
        dtest: xgboost DMatrix for prediction.
        iterations: number of random configurations to evaluate.
        y_test: ground-truth labels matching `dtest` rows.

    Returns:
        (best_params, maxacc) where maxacc is the best correct-prediction count.
    """
    print("Starting hyperparameter tuning with start params:")
    random.seed(a=42)  # deterministic search for reproducibility
    maxacc = 0
    best_params = {}
    l = 0
    for i in range(0, iterations):
        step_params = {
            'max_depth': random.randint(0, 10),
            'min_child_weight': random.randint(0, 10),
            'eta': random.uniform(LOWER_BOUND, 1),
            'subsample': random.uniform(LOWER_BOUND, 1),
            'colsample_bytree': random.uniform(LOWER_BOUND, 1),
            'objective': 'binary:logistic'
        }
        booster = xgb.train(
            step_params,
            dtrain,
            num_boost_round=10,
        )
        preds = booster.predict(dtest)
        # Threshold probabilities at 0.5 to get hard labels.
        preds = [1 if z > 0.5 else 0 for z in preds]
        # Count predictions that match the ground truth.
        res = [i for i, j in zip(preds, y_test) if i == j]
        print(len(res))
        print(100 * len(res) / len(preds))
        if len(res) > maxacc:
            maxacc = len(res)
            best_params = step_params.copy()
    # NOTE(review): 'l' is never incremented here (unlike random_process),
    # so this prints 0 — preserved as-is; confirm whether a counter was intended.
    print("\t")
    print(l)
    print("Found best solution:")
    print(utils.print_params(best_params))
    print("Random result:")
    print(maxacc)
    print(maxacc / len(y_test))
    return (best_params, maxacc)
def train(opt):
    """Train GAIN (BERT or BiLSTM backbone) for document-level relation
    extraction, periodically evaluating and checkpointing the best Ign-F1 model."""
    if opt.use_model == 'bert':
        # datasets
        train_set = BERTDGLREDataset(opt.train_set, opt.train_set_save, word2id, ner2id,
                                     rel2id, dataset_type='train', opt=opt)
        # dev_set = BERTDGLREDataset(opt.dev_set, opt.dev_set_save, word2id, ner2id, rel2id, dataset_type='dev',
        #                            instance_in_train=train_set.instance_in_train, opt=opt)
        # dataloaders
        train_loader = DGLREDataloader(train_set, batch_size=opt.batch_size, shuffle=True,
                                       negativa_alpha=opt.negativa_alpha)
        # dev_loader = DGLREDataloader(dev_set, batch_size=opt.test_batch_size, dataset_type='dev')
        model = GAIN_BERT(opt)
    elif opt.use_model == 'bilstm':
        # datasets
        train_set = DGLREDataset(opt.train_set, opt.train_set_save, word2id, ner2id,
                                 rel2id, dataset_type='train', opt=opt)
        # dev_set = DGLREDataset(opt.dev_set, opt.dev_set_save, word2id, ner2id, rel2id, dataset_type='dev',
        #                        instance_in_train=train_set.instance_in_train, opt=opt)
        # dataloaders
        train_loader = DGLREDataloader(train_set, batch_size=opt.batch_size, shuffle=True,
                                       negativa_alpha=opt.negativa_alpha)
        # dev_loader = DGLREDataloader(dev_set, batch_size=opt.test_batch_size, dataset_type='dev')
        model = GAIN_GloVe(opt)
    else:
        assert 1 == 2, 'please choose a model from [bert, bilstm].'

    print(model.parameters)
    print_params(model)

    start_epoch = 1
    pretrain_model = opt.pretrain_model
    lr = opt.lr
    model_name = opt.model_name

    # Resume from a pretrained checkpoint when one is given.
    if pretrain_model != '':
        chkpt = torch.load(pretrain_model, map_location=torch.device('cpu'))
        model.load_state_dict(chkpt['checkpoint'])
        logging('load model from {}'.format(pretrain_model))
        start_epoch = chkpt['epoch'] + 1
        lr = chkpt['lr']
        logging('resume from epoch {} with lr {}'.format(start_epoch, lr))
    else:
        logging('training from scratch with lr {}'.format(lr))

    model = get_cuda(model)

    if opt.use_model == 'bert':
        # BERT parameters get a 100x smaller learning rate than the rest.
        bert_param_ids = list(map(id, model.bert.parameters()))
        base_params = filter(lambda p: p.requires_grad and id(p) not in bert_param_ids,
                             model.parameters())
        optimizer = optim.AdamW([
            {'params': model.bert.parameters(), 'lr': lr * 0.01},
            {'params': base_params, 'weight_decay': opt.weight_decay}
        ], lr=lr)
    else:
        optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                                lr=lr, weight_decay=opt.weight_decay)

    BCE = nn.BCEWithLogitsLoss(reduction='none')

    if opt.coslr:
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=(opt.epoch // 4) + 1)

    checkpoint_dir = opt.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    fig_result_dir = opt.fig_result_dir
    if not os.path.exists(fig_result_dir):
        os.mkdir(fig_result_dir)

    best_ign_auc = 0.0
    best_ign_f1 = 0.0
    best_epoch = 0

    model.train()

    global_step = 0
    total_loss = 0

    # Precision-recall figure that accumulates one curve per improving epoch.
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim(0.0, 1.0)
    plt.xlim(0.0, 1.0)
    plt.title('Precision-Recall')
    plt.grid(True)

    acc_NA, acc_not_NA, acc_total = Accuracy(), Accuracy(), Accuracy()
    logging('begin..')

    for epoch in range(start_epoch, opt.epoch + 1):
        start_time = time.time()
        for acc in [acc_NA, acc_not_NA, acc_total]:
            acc.clear()

        for ii, d in enumerate(train_loader):
            relation_multi_label = d['relation_multi_label']
            relation_mask = d['relation_mask']
            relation_label = d['relation_label']

            predictions = model(words=d['context_idxs'],
                                src_lengths=d['context_word_length'],
                                mask=d['context_word_mask'],
                                entity_type=d['context_ner'],
                                entity_id=d['context_pos'],
                                mention_id=d['context_mention'],
                                distance=None,
                                entity2mention_table=d['entity2mention_table'],
                                graphs=d['graphs'],
                                h_t_pairs=d['h_t_pairs'],
                                relation_mask=relation_mask,
                                path_table=d['path_table'],
                                entity_graphs=d['entity_graphs'],
                                ht_pair_distance=d['ht_pair_distance']
                                )
            # Masked BCE averaged over valid relation slots.
            loss = torch.sum(BCE(predictions, relation_multi_label) * relation_mask.unsqueeze(2)) / (
                    opt.relation_nums * torch.sum(relation_mask))

            optimizer.zero_grad()
            loss.backward()
            if opt.clip != -1:
                nn.utils.clip_grad_value_(model.parameters(), opt.clip)
            optimizer.step()
            if opt.coslr:
                scheduler.step(epoch)

            # Track per-class accuracy (NA vs. non-NA) on the argmax prediction.
            output = torch.argmax(predictions, dim=-1)
            output = output.data.cpu().numpy()
            relation_label = relation_label.data.cpu().numpy()
            for i in range(output.shape[0]):
                for j in range(output.shape[1]):
                    label = relation_label[i][j]
                    if label < 0:
                        break
                    is_correct = (output[i][j] == label)
                    if label == 0:
                        acc_NA.add(is_correct)
                    else:
                        acc_not_NA.add(is_correct)
                    acc_total.add(is_correct)

            global_step += 1
            total_loss += loss.item()

            log_step = opt.log_step
            if global_step % log_step == 0:
                cur_loss = total_loss / log_step
                elapsed = time.time() - start_time
                logging(
                    '| epoch {:2d} | step {:4d} | ms/b {:5.2f} | train loss {:5.3f} | NA acc: {:4.2f} | not NA acc: {:4.2f}  | tot acc: {:4.2f} '.format(
                        epoch, global_step, elapsed * 1000 / log_step, cur_loss * 1000,
                        acc_NA.get(), acc_not_NA.get(), acc_total.get()))
                total_loss = 0
                start_time = time.time()

        if epoch % opt.test_epoch == 0:
            logging('-' * 89)
            eval_start_time = time.time()
            model.eval()
            # NOTE(review): dev_loader's construction is commented out above,
            # so this call would raise NameError as written — confirm upstream.
            ign_f1, ign_auc, pr_x, pr_y = test(model, dev_loader, model_name, id2rel=id2rel)
            model.train()
            logging('| epoch {:3d} | time: {:5.2f}s'.format(epoch, time.time() - eval_start_time))
            logging('-' * 89)

            if ign_f1 > best_ign_f1:
                best_ign_f1 = ign_f1
                best_ign_auc = ign_auc
                best_epoch = epoch
                path = os.path.join(checkpoint_dir, model_name + '_best.pt')
                torch.save({
                    'epoch': epoch,
                    'checkpoint': model.state_dict(),
                    'lr': lr,
                    'best_ign_f1': ign_f1,
                    'best_ign_auc': ign_auc,
                    'best_epoch': epoch
                }, path)
                plt.plot(pr_x, pr_y, lw=2, label=str(epoch))
                plt.legend(loc="upper right")
                plt.savefig(os.path.join(fig_result_dir, model_name))

        if epoch % opt.save_model_freq == 0:
            path = os.path.join(checkpoint_dir, model_name + '_{}.pt'.format(epoch))
            torch.save({
                'epoch': epoch,
                'lr': lr,
                'checkpoint': model.state_dict()
            }, path)

    print("Finish training")
    print("Best epoch = %d | Best Ign F1 = %f" % (best_epoch, best_ign_f1))
    print("Storing best result...")
    print("Finish storing")
def train(opt, isbody=False):
    """Train the medical extraction model with early stopping, then evaluate
    the best checkpoint on the test set.

    When `isbody` is true, only the body-extraction head is trained; otherwise
    subject/decorate/frequency heads are trained jointly.
    """
    train_ds = MedicalExtractionDataset(opt.train_data)
    dev_ds = MedicalExtractionDataset(opt.dev_data)
    test_ds = MedicalExtractionDataset(opt.test_data)

    dev_dl = DataLoader(dev_ds,
                        batch_size=opt.dev_batch_size,
                        shuffle=False,
                        num_workers=opt.num_worker)
    test_dl = DataLoader(test_ds,
                         batch_size=opt.dev_batch_size,
                         shuffle=False,
                         num_workers=opt.num_worker)

    if isbody:
        logging('training for body')
        model = MedicalExtractionModelForBody(opt)
    else:
        logging('training for subject, decorate and body')
        model = MedicalExtractionModel(opt)
    # print(model.parameters)
    print_params(model)

    start_epoch = 1
    learning_rate = opt.lr
    total_epochs = opt.epochs
    pretrain_model = opt.pretrain_model
    model_name = opt.model_name  # name under which checkpoints are saved

    # Load a pretrained model (not applicable to the body-only head).
    if pretrain_model != '' and not isbody:
        chkpt = torch.load(pretrain_model, map_location=torch.device('cpu'))
        model.load_state_dict(chkpt['checkpoints'])
        logging('load model from {}'.format(pretrain_model))
        start_epoch = chkpt['epoch'] + 1
        learning_rate = chkpt['learning_rate']
        logging('resume from epoch {} with learning_rate {}'.format(
            start_epoch, learning_rate))
    else:
        logging('training from scratch with learning_rate {}'.format(
            learning_rate))

    model = get_cuda(model)

    num_train_steps = int(len(train_ds) / opt.batch_size * opt.epochs)
    # No weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = optim.AdamW(optimizer_parameters, lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=opt.num_warmup_steps,
        num_training_steps=num_train_steps)

    threshold = opt.threshold
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    checkpoint_dir = opt.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    es = EarlyStopping(patience=opt.patience, mode="min", criterion='val loss')

    for epoch in range(start_epoch, total_epochs + 1):
        train_loss = 0.0
        model.train()
        train_dl = DataLoader(train_ds,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_worker)
        tk_train = tqdm(train_dl, total=len(train_dl))
        for batch in tk_train:
            optimizer.zero_grad()
            subject_target_ids = batch['subject_target_ids']
            decorate_target_ids = batch['decorate_target_ids']
            freq_target_ids = batch['freq_target_ids']
            body_target_ids = batch['body_target_ids']
            mask = batch['mask'].float().unsqueeze(-1)
            body_mask = batch['body_mask'].unsqueeze(-1)

            loss = None
            if isbody:
                body_logits = model(
                    input_ids=batch['body_input_ids'],
                    attention_mask=batch['body_mask'],
                    token_type_ids=batch['body_token_type_ids'])
                # Masked mean BCE over body positions.
                loss = torch.sum(
                    criterion(body_logits, body_target_ids) *
                    body_mask) / torch.sum(body_mask)
            else:
                subject_logits, decorate_logits, freq_logits = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['mask'],
                    token_type_ids=batch['token_type_ids'])
                # Sum of the three head losses, masked and normalized.
                loss = torch.sum(
                    (criterion(subject_logits, subject_target_ids) +
                     criterion(decorate_logits, decorate_target_ids) +
                     criterion(freq_logits, freq_target_ids)) *
                    mask) / torch.sum(mask)

            loss.backward()
            optimizer.step()
            scheduler.step()
            tk_train.set_postfix(train_loss='{:5.3f} / 1000'.format(
                1000 * loss.item()), epoch='{:2d}'.format(epoch))
            train_loss += loss.item() * subject_target_ids.shape[0]

        avg_train_loss = train_loss * 1000 / len(train_ds)
        print('train loss per example: {:5.3f} / 1000'.format(avg_train_loss))

        avg_val_loss = test(model, dev_ds, dev_dl, criterion, threshold,
                            'val', isbody=isbody)

        # Keep the best model (by validation loss) for later evaluation.
        if isbody:
            save_model_path = os.path.join(checkpoint_dir,
                                           model_name + '_body_best.pt')
        else:
            save_model_path = os.path.join(checkpoint_dir,
                                           model_name + '_best.pt')
        es(avg_val_loss, model, model_path=save_model_path, epoch=epoch,
           learning_rate=learning_rate)
        if es.early_stop:
            print("Early stopping")
            break

        # Periodic checkpoint so training can resume after interruption.
        if epoch % opt.save_model_freq == 0:
            if isbody:
                save_model_path = os.path.join(
                    checkpoint_dir, model_name + '_body_{}.pt'.format(epoch))
            else:
                save_model_path = os.path.join(
                    checkpoint_dir, model_name + '_{}.pt'.format(epoch))
            torch.save(
                {
                    'epoch': epoch,
                    'learning_rate': learning_rate,
                    'checkpoints': model.state_dict()
                }, save_model_path)

    # Load the best checkpoint and run the final test pass.
    if isbody:
        best_model_path = os.path.join(checkpoint_dir,
                                       model_name + '_body_best.pt')
    else:
        best_model_path = os.path.join(checkpoint_dir,
                                       model_name + '_best.pt')
    chkpt = torch.load(best_model_path, map_location=torch.device('cpu'))
    model.load_state_dict(chkpt['checkpoints'])
    if isbody:
        logging('load best body model from {} and test ...'.format(
            best_model_path))
    else:
        logging('load best model from {} and test ...'.format(best_model_path))
    test(model, test_ds, test_dl, criterion, threshold, 'test', isbody)
    model.cpu()
optimizer = torch.optim.SGD(params=net.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay) criterion = nn.CrossEntropyLoss().cuda() net = torch.nn.DataParallel(net).to(device) best_model, best_acc = engine.train_reg(args, net, criterion, optimizer, trainloader, testloader, n_epochs) torch.save({'state_dict':best_model.state_dict()}, os.path.join(model_path, "{}_{}_{:.2f}.tar").format(model_name, mode, best_acc)) if __name__ == '__main__': file = "./config/" + dataset_name + ".json" args = utils.load_json(json_file=file) log_file = "{}_{}.txt".format(model_name, mode) utils.Tee(os.path.join(log_path, log_file), 'w') print(log_file) print("---------------------Training [%s]---------------------" % model_name) utils.print_params(args["dataset"], args[model_name], dataset=args['dataset']['name']) train_file = args['dataset']['train_file'] test_file = args['dataset']['test_file'] trainloader = utils.init_dataloader(args, train_file, mode="train") testloader = utils.init_dataloader(args, test_file, mode="test") main(args, model_name, trainloader, testloader)
def train(opt):
    """Skeleton training loop for the medical extraction model.

    Data feeding, the loss function, and dev-set evaluation are still TODO;
    the checkpointing / logging scaffolding is in place.
    """
    train_ds = MedicalExtractionDataset(opt.train_data)
    dev_ds = MedicalExtractionDataset(opt.dev_data)

    dev_dl = DataLoader(dev_ds,
                        batch_size=opt.dev_batch_size,
                        shuffle=False,
                        num_workers=1
                        )

    model = MedicalExtractionModel(opt)
    print(model.parameters)
    print_params(model)

    start_epoch = 1
    learning_rate = opt.lr
    total_epochs = opt.epochs
    log_step = opt.log_step
    pretrain_model = opt.pretrain_model
    model_name = opt.model_name  # name under which checkpoints are saved

    # Load a pretrained model when one is given.
    if pretrain_model != '':
        chkpt = torch.load(pretrain_model, map_location=torch.device('cpu'))
        model.load_state_dict(chkpt['checkpoints'])
        logging('load model from {}'.format(pretrain_model))
        start_epoch = chkpt['epoch'] + 1
        learning_rate = chkpt['learning_rate']
        logging('resume from epoch {} with learning_rate {}'.format(
            start_epoch, learning_rate))
    else:
        logging('training from scratch with learning_rate {}'.format(learning_rate))

    model = get_cuda(model)

    # TODO: switch to AdamW if a BERT encoder is used.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # TODO: loss function
    # criterion =

    checkpoint_dir = opt.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    global_step = 0
    total_loss = 0
    for epoch in range(1, total_epochs + 1):
        start_time = time.time()
        train_dl = DataLoader(train_ds,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=8
                              )
        model.train()
        for batch in train_dl:
            optimizer.zero_grad()
            # TODO: feed the batch through the model.
            # TODO: compute the loss.
            # NOTE(review): loss is still None here, so loss.backward() will
            # raise until the TODOs above are filled in — preserved as-is.
            loss = None
            loss.backward()
            optimizer.step()

            global_step += 1
            total_loss += loss.item()
            if global_step % log_step == 0:
                cur_loss = total_loss / log_step
                elapsed = time.time() - start_time
                logging(
                    '| epoch {:2d} | step {:4d} | ms/b {:5.2f} | train loss {:5.3f} '.format(
                        epoch, global_step, elapsed * 1000 / log_step, cur_loss * 1000))
                total_loss = 0
                start_time = time.time()

        if epoch % opt.test_epoch == 0:
            model.eval()
            with torch.no_grad():
                for batch in dev_dl:
                    # TODO: evaluate on the dev set.
                    pass

        # Save a checkpoint periodically.
        # TODO: only save the model that performs best on dev.
        if epoch % opt.save_model_freq == 0:
            path = os.path.join(checkpoint_dir, model_name + '_{}.pt'.format(epoch))
            torch.save({
                'epoch': epoch,
                'learning_rate': learning_rate,
                'checkpoint': model.state_dict()
            }, path)
print("The best test l2 is {:.3f}".format(test_l2)) print( "==============================================================================" ) if __name__ == '__main__': file = dataset_name + ".json" args = utils.load_params(file) if w > 0: log_file = "attack" + '_' + target_name + '_{}_{}.txt'.format( target_mode, w) else: log_file = "attack" + '_' + target_name + '_{}.txt'.format(target_mode) logger = utils.Tee(os.path.join(save_log_path, log_file), 'w') utils.print_params(args) train_file = args['dataset']['test_file'] test_file = args['dataset']['train_file'] trainloader = utils.init_dataloader(args, train_file, mode="train") testloader = utils.init_dataloader(args, test_file, mode="test") eval_model = utils.get_model(args, "VGG16", "reg") eval_model = torch.nn.DataParallel(eval_model).to(device) utils.load_state_dict(eval_model, eval_path) save_img_path = os.path.join( save_img_path, "attack_{}_{}".format(target_name, target_mode)) os.makedirs(save_img_path, exist_ok=True) main(args, trainloader, testloader, eval_model)
def train(self):
    """Jointly train the slot-tagging and intent-prediction heads.

    Saves vocab/config vars to an .npz, dumps the model architecture to YAML,
    then runs one Keras ``fit`` call per epoch so that dev-set metrics and a
    weights checkpoint can be produced after every epoch.

    NOTE(review): this is legacy Python 2 / Keras 1 code (``xrange``,
    ``print`` statements, ``nb_epoch``) — do not port piecemeal.
    """
    print('Training model ...')
    # load params from the training data container
    self.maxlen_userUtter = self.train_data.maxlen_userUtter
    self.word_vocab_size = self.train_data.word_vocab_size
    self.userIntent_vocab_size = self.train_data.userIntent_vocab_size
    self.userTag_vocab_size = self.train_data.userTag_vocab_size
    self.id2word = self.train_data.id2word
    self.id2userTag = self.train_data.id2userTag
    self.id2userIntent = self.train_data.id2userIntent
    self.userTag2id = self.train_data.userTag2id
    # Persist vocabularies/sizes so decoding can run without the raw data.
    other_npz = '{}/other_vars.npz'.format(self.model_folder)
    train_vars = {
        'id2userTag': self.id2userTag,
        'id2word': self.id2word,
        'id2userIntent': self.id2userIntent,
        'userTag2id': self.userTag2id,
        'userTag_vocab_size': self.userTag_vocab_size,
        'userIntent_vocab_size': self.userIntent_vocab_size,
        'word_vocab_size': self.word_vocab_size,
        'maxlen_userUtter': self.maxlen_userUtter
    }
    np.savez_compressed(other_npz, **train_vars)
    self.params['maxlen_userUtter'] = self.maxlen_userUtter
    self.params['word_vocab_size'] = self.word_vocab_size
    self.params['userTag_vocab_size'] = self.userTag_vocab_size
    self.params['userIntent_vocab_size'] = self.userIntent_vocab_size
    print_params(self.params)
    # build model graph, save graph and plot graph
    self._build()
    self._plot_graph()
    graph_yaml = '{}/graph-arch.yaml'.format(self.model_folder)
    with open(graph_yaml, 'w') as fyaml:
        fyaml.write(self.model.to_yaml())
    # load train data
    X_train = self.train_data.userUtter_encodePad
    tag_train = self.train_data.userTag_1hotPad
    intent_train = self.train_data.userIntent_vecBin
    train_utter_txt = self.train_data.userUtter_txt
    train_intent_txt = self.train_data.userIntent_txt
    train_tag_txt = self.train_data.userTag_txt
    train_target_fname = '{}/train.target'.format(self.model_folder)
    writeUtterTagIntentTxt(train_utter_txt, train_tag_txt, train_intent_txt,
                           train_target_fname)
    # load dev data
    X_dev = self.dev_data.userUtter_encodePad
    tag_dev = self.dev_data.userTag_1hotPad
    intent_dev = self.dev_data.userIntent_vecBin
    dev_utter_txt = self.dev_data.userUtter_txt
    dev_intent_txt = self.dev_data.userIntent_txt
    dev_tag_txt = self.dev_data.userTag_txt
    dev_target_fname = '{}/dev.target'.format(self.model_folder)
    writeUtterTagIntentTxt(dev_utter_txt, dev_tag_txt, dev_intent_txt,
                           dev_target_fname)
    # get mask matrix for train and dev set: 1 on real tokens, 0 on padding
    # (padding id is 0, see the != 0 test)
    mask_array_train = np.zeros_like(X_train)
    mask_array_train[X_train != 0] = 1
    mask_array_dev = np.zeros_like(X_dev)
    mask_array_dev[X_dev != 0] = 1
    # jointly training: one fit() call per epoch so we can evaluate + checkpoint
    for ep in xrange(self.epoch_nb):
        print('<Epoch {}>'.format(ep))
        print '------------------------------------------------------------'
        # NOTE(review): printing the whole X_train matrix looks like leftover debug output.
        print X_train
        self.model.fit(x=X_train,
                       y={
                           'slot_output': tag_train,
                           'intent_output': intent_train
                       },
                       # mask padding positions out of the slot loss
                       sample_weight={
                           'slot_output': mask_array_train,
                           'intent_output': None
                       },
                       batch_size=self.batch_size,
                       nb_epoch=1,
                       verbose=2)
        tag_probs, intent_probs = self.model.predict(X_dev)
        # calculate token-level scores on the dev set
        precision_tag, recall_tag, fscore_tag, accuracy_frame_tag = eval_slotTagging(
            tag_probs, mask_array_dev, tag_dev, self.userTag2id['tag-O'])
        print(
            'SlotTagging: ep={}, precision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}'
            .format(ep, precision_tag, recall_tag, fscore_tag,
                    accuracy_frame_tag))
        precision_intent, recall_intent, fscore_intent, accuracy_frame_intent, threshold = eval_intentPredict(
            intent_probs, intent_dev)
        print(
            'Intent Prediction: ep={}, precision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}, threshold={:.4f}'
            .format(ep, precision_intent, recall_intent, fscore_intent,
                    accuracy_frame_intent, threshold))
        accuracy_frame_both = getNLUframeAccuracy(tag_probs, mask_array_dev,
                                                  tag_dev, intent_probs,
                                                  intent_dev, threshold)
        print('NLU Frame: ep={}, accuracy={:.4f}'.format(
            ep, accuracy_frame_both))
        # Decode predictions to text and write per-epoch dev results.
        dev_tag_pred_txt, dev_intent_pred_txt = getNLUpred(
            tag_probs, mask_array_dev, self.id2userTag, intent_probs,
            threshold, self.id2userIntent)
        dev_results_fname = '{}/dev_results/dev_ep={}.pred'.format(
            self.model_folder, ep)
        writeUtterTagIntentTxt(dev_utter_txt, dev_tag_pred_txt,
                               dev_intent_pred_txt, dev_results_fname)
        print('Write dev results: {}'.format(dev_results_fname))
        # Checkpoint file name encodes the epoch's dev metrics.
        weights_fname = '{}/weights/ep={}_tagF1={:.4f}frameAcc={:.4f}_intentF1={:.4f}frameAcc={:.4f}th={:.4f}.h5'.format(
            self.model_folder, ep, fscore_tag, accuracy_frame_tag,
            fscore_intent, accuracy_frame_intent, threshold)
        print('Saving Model: {}'.format(weights_fname))
        self.model.save_weights(weights_fname, overwrite=True)
def main_process_class(dtrain, dtest, params, epsilon, y_test, stop_value=None):
    """Greedy hill-climbing hyperparameter search for an XGBoost classifier.

    Starting from ``params``, repeatedly evaluates neighbouring parameter
    settings (``utils.get_possible_steps``) on ``dtest`` accuracy and moves to
    the best one, shrinking the step size (``utils.reduce_steps``) when the
    improvement falls below ``epsilon``, until either convergence or 500
    iterations.

    Args:
        dtrain, dtest: xgboost DMatrix objects for training / evaluation.
        params: starting hyperparameter dict.
        epsilon: minimum accuracy improvement required to keep full-size steps.
        y_test: ground-truth labels matching ``dtest`` rows.
        stop_value: unused; kept for interface compatibility.

    Returns:
        Tuple ``(best_params, best_correct_count, iterations)``.
    """
    print("Starting hyperparameter tuning with start params:")
    print(utils.print_params(params))
    print("With epsilon (stop) value: {}".format(epsilon))
    gradients = utils.get_gradient_list(params, global_constraint.STEP)
    steps = utils.get_possible_steps(params, gradients, [])
    maxacc = 0          # best number of correctly classified test rows so far
    step_mae = 0        # best score at the previous outer iteration
    iterations = 0
    best_params = params.copy()
    last_steps = []
    while True:
        last_steps = steps.copy()
        for step_params in steps:
            print(utils.print_params(step_params))
            cv_results = xgb.train(
                step_params,
                dtrain,
                num_boost_round=10,
            )
            print(step_mae)
            # Binarize probabilities at 0.5 and count correct predictions.
            preds = cv_results.predict(dtest)
            preds = [1 if z > 0.5 else 0 for z in preds]
            res = [i for i, j in zip(preds, y_test) if i == j]
            print(len(res))
            print(100 * len(res) / len(preds))
            if len(res) > maxacc:
                maxacc = len(res)
                best_params = step_params.copy()
        iterations = iterations + 1
        print(iterations)
        if abs(step_mae - maxacc) < epsilon:
            # No meaningful improvement: shrink the step size and retry,
            # unless we have already exhausted the iteration budget.
            if iterations < 500:
                utils.reduce_steps()
                step_mae = maxacc
                steps = utils.get_possible_steps(best_params, gradients,
                                                 last_steps)
            else:
                break
        else:
            step_mae = maxacc
            steps = utils.get_possible_steps(best_params, gradients,
                                             last_steps)
    print("Found best solution:")
    print(utils.print_params(best_params))
    print("MAE:")
    print(maxacc)
    # Fix: return the tuned parameters, not the untouched starting ones.
    return (best_params, maxacc, iterations)
def main():
    """Train EmotionX_Model on the Friends dialogs and periodically evaluate.

    Requires CUDA. Evaluation runs on the test set every ``print_per`` training
    steps and per-class precision/recall/F1 are printed; micro-F1 is tracked in
    TensorBoard.
    """
    if not torch.cuda.is_available():
        raise NotImplementedError()
    hparams = type('', (object, ), EMOTIONX_MODEL_HPARAMS)()  # dict to class
    # data
    fr_train_dialogs, fr_train_labels = load_data(hparams,
                                                  hparams.fr_train_path)
    train_dialogs = fr_train_dialogs
    train_labels = fr_train_labels
    test_dialogs, test_labels = load_data(hparams, hparams.fr_test_path)
    assert len(train_dialogs) == len(train_labels)
    assert len(test_dialogs) == len(test_labels)
    # hyper-parameter: per-class label counts over the flattened label lists
    hparams.n_appear = [sum(train_labels, []).count(i) for i in range(5)]
    max_i = len(train_dialogs) // hparams.batch_size  # steps per epoch
    total_step = 0
    print_per = len(train_dialogs) // 4  # evaluate ~4 times per epoch
    highest_micro_f1 = 0.
    # model
    model = EmotionX_Model(hparams)
    model.cuda()
    model.train()
    print_params(model)
    optimizer = torch.optim.Adam(model.parameters(), hparams.learning_rate)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=max_i)
    writer = SummaryWriter(log_dir=hparams.log_dir)
    # train
    for i_epoch in range(hparams.n_epoch):
        train_dialogs, train_labels = shuffle_trainset(train_dialogs,
                                                       train_labels)
        # NOTE(review): scheduler.step() before any optimizer.step() — PyTorch
        # >=1.1 expects the opposite order; confirm the intended LR schedule.
        scheduler.step()
        for i_step in tqdm(range(max_i)):
            batch_dialogs = get_batch(train_dialogs, hparams.batch_size,
                                      i_step)
            batch_labels = get_batch(train_labels, hparams.batch_size, i_step)
            optimizer.zero_grad()
            pred_labels = model(batch_dialogs)
            loss = model.cal_loss(batch_labels, pred_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.clip)
            optimizer.step()
            # periodic evaluation + printing
            if i_step % print_per == 0:
                model.eval()
                # Per-class counters (last class excluded, hence n_class - 1).
                n_appear = [0] * (hparams.n_class - 1)
                n_correct = [0] * (hparams.n_class - 1)
                n_positive = [0] * (hparams.n_class - 1)
                for i_test in range(len(test_dialogs) // hparams.batch_size):
                    batch_dialogs = get_batch(test_dialogs,
                                              hparams.batch_size, i_test)
                    batch_labels = get_batch(test_labels, hparams.batch_size,
                                             i_test)
                    pred_labels = model(batch_dialogs)
                    counts = model.count_for_eval(batch_labels, pred_labels)
                    n_appear = [x + y for x, y in zip(n_appear, counts[0])]
                    n_correct = [x + y for x, y in zip(n_correct, counts[1])]
                    n_positive = [x + y for x, y in zip(n_positive, counts[2])]
                uwa, wa = model.get_uwa_and_wa(n_appear, n_correct)
                precision, recall, f1, micro_f1, macro_f1 = model.get_f1_scores(
                    n_appear, n_correct, n_positive)
                print('i_epoch: ', i_epoch)
                print('i_total_step: ', total_step)
                print('n_true:\t\t\t', n_appear)
                print('n_positive:\t\t', n_positive)
                print('n_true_positive:\t', n_correct)
                print('precision:\t[%.4f, %.4f, %.4f, %.4f]' %
                      (precision[0], precision[1], precision[2], precision[3]))
                print('recall:\t\t[%.4f, %.4f, %.4f, %.4f]' %
                      (recall[0], recall[1], recall[2], recall[3]))
                print('f1:\t\t[%.4f, %.4f, %.4f, %.4f]' %
                      (f1[0], f1[1], f1[2], f1[3]))
                # Track the best micro-F1 and the step it was reached at.
                if micro_f1 > highest_micro_f1:
                    highest_micro_f1 = micro_f1
                    friend_high_step = total_step
                print('Micro F1: %.4f (<=%.4f at %d-th total_step)' %
                      (micro_f1, highest_micro_f1, friend_high_step))
                print()
                # write TensorBoard scalars
                writer.add_scalar(hparams.log_micro_f1 + 'fr', micro_f1,
                                  total_step)
                writer.add_scalar(hparams.log_wce_loss + 'fr', loss,
                                  total_step)
            total_step += 1
            model.train()  # restore train mode after a possible eval pass
# Horovod: pin GPU to be used to process local rank (one GPU per process) tfconfig = tf.compat.v1.ConfigProto() #tf.ConfigProto() tfconfig.gpu_options.allow_growth = True tfconfig.gpu_options.visible_device_list = str(hvd.local_rank()) tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=tfconfig)) ################################################################################ # Argument handling ################################################################################ params = p.parse_args() if hvd.rank()==0: print_cl(sys.argv) print_params(params) if params.yaml_dump_then_exit: sys.exit(0) #-------------------------------- optimizer -----------------------------------# if params.optimizer=='adam': params.optimizer = Adam(lr=params.learning_rate * hvd.size()) elif params.optimizer=='sgd': params.optimizer = SGD(lr=params.learning_rate * hvd.size()) params.optimizer = hvd.DistributedOptimizer(params.optimizer) #------------------------------- model reloading ------------------------------# reloading_model = False
def main():
    """End-to-end driver for quantized-network training.

    Seeds RNGs, builds the per-run job directory and loggers, constructs DALI
    data loaders, imports/instantiates the quantized model for the selected
    architecture, optionally resumes from a checkpoint or runs test-only, then
    trains for ``args.train_epochs`` with per-epoch checkpointing.

    Fixes vs. previous revision: removed a leftover ``pdb.set_trace()`` that
    halted every resumed run, and the test-only call to ``test()`` now passes
    ``eval_data_length`` like the in-loop call does.
    """
    start_epoch = 0
    best_prec1 = 0.0
    # Random seed is itself randomly drawn, then applied everywhere.
    seed = np.random.randint(10000)
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
    if args.gpus is not None:
        device = torch.device("cuda:{}".format(args.gpus[0]))
        cudnn.benchmark = False
        # cudnn.deterministic = True
        cudnn.enabled = True
    else:
        device = torch.device("cpu")
    # Per-run job directory: dataset/arch[/mission]/timestamp.
    now = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    if args.mission is not None:
        if 'vgg' == args.arch and args.batchnorm:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}_bn/{args.mission}/{now}'
        elif 'resnet20' == args.arch:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}/{args.mission}/{now}'
        else:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}/{args.mission}/{now}'
    else:
        if 'vgg' == args.arch and args.batchnorm:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}_bn/{now}'
        else:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}/{now}'
    _make_dir(args.job_dir)
    ckpt = utils.checkpoint(args)
    print_logger = utils.get_logger(os.path.join(args.job_dir, "logger.log"))
    utils.print_params(vars(args), print_logger.info)
    writer_train = SummaryWriter(args.job_dir + '/run/train')
    writer_test = SummaryWriter(args.job_dir + '/run/test')

    ## hyperparameters settings ##
    # Uniform bit-width list, one entry per quantized layer.
    n_layers = (args.num_layers - 2) * 2
    unit_k_bits = int(args.k_bits)
    kbits_list = [unit_k_bits for i in range(n_layers)]
    print_logger.info(f'k_bits_list {kbits_list}')

    # Data loading
    print('=> Preparing data..')
    if args.dataset in ['cifar10', 'cifar100', 'mnist']:
        IMAGE_SIZE = 32
    elif args.dataset == 'tinyimagenet':
        IMAGE_SIZE = 64
    else:
        IMAGE_SIZE = 224
    if args.dataset == 'imagenet':
        train_loader = get_imagenet_iter_dali(type='train',
                                              image_dir=args.data_dir,
                                              batch_size=args.train_batch_size,
                                              num_threads=args.workers,
                                              crop=IMAGE_SIZE,
                                              device_id=0,
                                              num_gpus=1)
        val_loader = get_imagenet_iter_dali(type='val',
                                            image_dir=args.data_dir,
                                            batch_size=args.eval_batch_size,
                                            num_threads=args.workers,
                                            crop=IMAGE_SIZE,
                                            device_id=0,
                                            num_gpus=1)
    elif args.dataset == 'tinyimagenet':
        train_loader = get_imagenet_iter_dali(type='train',
                                              image_dir=args.data_dir,
                                              batch_size=args.train_batch_size,
                                              num_threads=args.workers,
                                              crop=IMAGE_SIZE,
                                              device_id=0,
                                              num_gpus=1)
        val_loader = get_imagenet_iter_dali(type='val',
                                            image_dir=args.data_dir,
                                            batch_size=args.eval_batch_size,
                                            num_threads=args.workers,
                                            crop=IMAGE_SIZE,
                                            device_id=0,
                                            num_gpus=1)
    elif args.dataset == 'cifar10':
        train_loader = get_cifar_iter_dali(type='train',
                                           image_dir=args.data_dir,
                                           batch_size=args.train_batch_size,
                                           num_threads=args.workers)
        val_loader = get_cifar_iter_dali(type='val',
                                         image_dir=args.data_dir,
                                         batch_size=args.eval_batch_size,
                                         num_threads=args.workers)

    # Create model
    print('=> Building model...')
    if args.dataset == 'cifar10':
        num_classes = 10
        train_data_length = 50000
        eval_data_length = 10000
    elif args.dataset == 'imagenet':
        num_classes = 1000
        # NOTE(review): these lengths match CIFAR, not ImageNet (~1.28M/50k)
        # — confirm whether downstream progress accounting depends on them.
        train_data_length = 50000
        eval_data_length = 10000
    model_config = {
        'k_bits': kbits_list,
        'num_layers': args.num_layers,
        'pre_k_bits': args.pre_k_bits,
        'ratio': args.ratio
    }
    if args.arch == 'mobilenetv2':
        model_config = {
            'k_bits': kbits_list,
            'num_layers': args.num_layers,
            'pre_k_bits': args.pre_k_bits,
            'ratio': args.ratio,
            'width_mult': args.width_mult
        }
    # Resolve the model constructor from models.<dataset>.<archtype>.<arch>.
    if 'vgg' == args.arch and args.batchnorm:
        model, model_k_bits = import_module(
            f"models.{args.dataset}.{args.archtype}.{args.arch}"
        ).__dict__[f'{args.arch}{args.num_layers}_bn'](model_config)
    elif 'resnet20' == args.arch:
        model, model_k_bits = import_module(
            f"models.{args.dataset}.{args.archtype}.{args.arch}"
        ).__dict__[f'{args.arch}'](model_config)
    else:
        model, model_k_bits = import_module(
            f"models.{args.dataset}.{args.archtype}.{args.arch}"
        ).__dict__[f'{args.arch}{args.num_layers}'](model_config)
    model = model.to(device)
    print_logger.info(f'model_k_bits_list {model_k_bits}')

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # LR drops by 10x at 50% and 75% of training.
    scheduler = MultiStepLR(
        optimizer,
        milestones=[0.5 * args.train_epochs, 0.75 * args.train_epochs],
        gamma=0.1)

    # Optionally resume from a checkpoint
    resume = args.resume
    if resume:
        print('=> Loading checkpoint {}'.format(resume))
        checkpoint = torch.load(resume, map_location=device)
        state_dict = checkpoint['state_dict']
        start_epoch = checkpoint['epoch']
        pre_train_best_prec1 = checkpoint['best_prec1']
        model_check = load_check(state_dict, model)
        # (removed a leftover pdb.set_trace() debug breakpoint here)
        model.load_state_dict(model_check)
        print('Prec@1:', pre_train_best_prec1)

    if args.test_only:
        # Pass eval_data_length so the call matches the in-loop test() call.
        test_prec1 = test(args, device, val_loader, eval_data_length, model,
                          criterion, writer_test, print_logger, start_epoch)
        print('=> Test Prec@1: {:.2f}'.format(test_prec1))
        print(f'sample k_bits {kbits_list}')
        return

    for epoch in range(0, args.train_epochs):
        scheduler.step(epoch)
        train_loss, train_prec1 = train(args, device, train_loader,
                                        train_data_length, model, criterion,
                                        optimizer, writer_train, print_logger,
                                        epoch)
        test_prec1 = test(args, device, val_loader, eval_data_length, model,
                          criterion, writer_test, print_logger, epoch)
        is_best = best_prec1 < test_prec1
        best_prec1 = max(test_prec1, best_prec1)
        state = {
            'state_dict': model.state_dict(),
            'test_prec1': test_prec1,
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'epoch': epoch + 1
        }
        ckpt.save_model(state, epoch + 1, is_best, mode='train')
    print_logger.info('==> BEST ACC {:.3f}'.format(best_prec1.item()))
# Tail of the Discriminator class whose header/Sequential definition is above
# (not visible in this chunk).
        nn.Sigmoid())

    def forward(self, input):
        # Run the conv stack and flatten (N, 1, 1, 1) logits to shape (N,).
        output = self.main(input)
        return output.view(-1, 1).squeeze(1)


# ---------------- DCGAN training setup (module level) ----------------
writer = SummaryWriter('runs/dcgan')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
netG = Generator().to(device)
netG.apply(weights_init)
netD = Discriminator().to(device)
netD.apply(weights_init)
utils.print_params(netG)
utils.print_params(netD)
criterion = nn.BCELoss()
# Fixed noise reused every epoch so sample grids are comparable over time.
fixed_noise = torch.randn(64, nz, 1, 1, device=device)
real_label = 1
fake_label = 0
# setup optimizer (standard DCGAN hyperparameters: lr=2e-4, beta1=0.5)
lr = 0.0002
beta1 = 0.5
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))
last_epoch = 0
if __name__ == "__main__":
    # WGAN training entry point for MNIST; hyperparameters come from the
    # model-specific section of MNIST.json.
    file = "./MNIST.json"
    args = load_json(json_file=file)
    file_path = 'train_gan_file'
    model_name = 'GAN'
    lr = args[model_name]['lr']
    batch_size = args[model_name]['batch_size']
    z_dim = args[model_name]['z_dim']
    epochs = args[model_name]['epochs']
    n_critic = args[model_name]['n_critic']  # discriminator updates per G update
    print("---------------------Training [%s]------------------------------" %
          model_name)
    utils.print_params(args["dataset"], args[model_name])
    dataloader = init_dataloader(args, file_path, batch_size, mode="gan")
    # Generator and WGAN discriminator, both wrapped for multi-GPU.
    G = GeneratorMNIST(z_dim)
    DG = DGWGAN32()
    G = torch.nn.DataParallel(G).cuda()
    DG = torch.nn.DataParallel(DG).cuda()
    dg_optimizer = torch.optim.Adam(DG.parameters(), lr=lr, betas=(0.5, 0.999))
    g_optimizer = torch.optim.Adam(G.parameters(), lr=lr, betas=(0.5, 0.999))
    step = 0
    # Training loop body continues beyond this chunk.
    for epoch in range(epochs):
def loop(sess: tf.Session):
    """Main Horovod training loop for the flow model.

    Initializes variables, then either restores a checkpoint (root rank only
    creates the Saver) or performs data-dependent init, broadcasts parameters
    from rank 0, and trains indefinitely with periodic validation,
    checkpointing, and MPI-aggregated logging.
    """
    i_step = 0
    if is_root:
        print('Initializing')
    sess.run(tf.global_variables_initializer())
    if restore_checkpoint is not None:
        # Restore from checkpoint (root rank only; other ranks get the
        # weights via the broadcast below)
        if is_root:
            saver = tf.train.Saver()
            print('Restoring checkpoint:', restore_checkpoint)
            # Step counter is encoded as the suffix of the checkpoint name.
            restore_step = int(restore_checkpoint.split('-')[-1])
            print('Restoring from step:', restore_step)
            saver.restore(sess, restore_checkpoint)
            i_step = restore_step
        else:
            saver = None
    else:
        # No checkpoint: perform data dependent initialization
        if is_root:
            print('Data dependent init')
        init_loss = sess.run(
            init_loss_sym, {
                x_init_sym:
                data_train[np.random.randint(0, data_train.shape[0], init_bs)]
            })
        if is_root:
            print('Init loss:', init_loss * bpd_scale_factor)
        # Seed the EMA shadow variables with the freshly initialized params.
        sess.run(copy_params_to_ema)
        saver = tf.train.Saver() if is_root else None
    if is_root:
        print('Broadcasting initial parameters')
    sess.run(hvd.broadcast_global_variables(0))
    sess.graph.finalize()  # catch accidental graph modification during training
    if is_root:
        print('Training')
        print(f'Total GFLOPS: {flops}')
        print_params()
    # Rolling windows for the logged loss / gradient-norm averages.
    loss_hist = deque(maxlen=steps_per_log)
    gnorm_hist = deque(maxlen=steps_per_log)
    for i_epoch in range(99999999999):  # effectively: train until killed
        if i_epoch % epochs_per_val == 0:
            run_validation(sess, i_step=i_step)
            if saver is not None:
                saver.save(sess,
                           os.path.join(checkpointdir, 'model'),
                           global_step=i_step)
        epoch_start_t = time.time()
        for i_epoch_step, (batch, ) in enumerate(
                iterbatches(
                    # non-sharded: each gpu goes through the whole dataset
                    [data_train],
                    batch_size=local_bs,
                    include_final_partial_batch=False,
                )):
            lr = lr_schedule(i_step)
            loss, gnorm, _ = sess.run([loss_sym, grad_norm_sym, opt_sym], {
                x_sym: batch,
                lr_sym: lr
            })
            loss_hist.append(loss)
            gnorm_hist.append(gnorm)
            # Skip timing the very first step, which will be unusually slow due to TF initialization
            if i_epoch == i_epoch_step == 0:
                epoch_start_t = time.time()
            if i_step % steps_per_log == 0:
                # Gather per-rank window means on rank 0 for logging.
                loss_hist_means = MPI.COMM_WORLD.gather(float(
                    np.mean(loss_hist)),
                                                        root=0)
                gnorm_hist_means = MPI.COMM_WORLD.gather(float(
                    np.mean(gnorm_hist)),
                                                         root=0)
                steps_per_sec = (i_epoch_step + 1) / (time.time() -
                                                      epoch_start_t)
                if is_root:
                    kvs = [
                        ('iter', i_step),
                        ('epoch', i_epoch +
                         i_epoch_step * local_bs / data_train.shape[0]
                         ),  # epoch for this gpu
                        ('bpd',
                         float(np.mean(loss_hist_means) * bpd_scale_factor)),
                        ('gnorm', float(np.mean(gnorm_hist_means))),
                        ('lr', float(lr)),
                        # ('fps', steps_per_sec * total_bs),  # fps calculated over all gpus (this epoch)
                        ('sps', steps_per_sec),
                    ]
                    logger.writekvs(kvs, i_step)
            i_step += 1
# Tail of a model-selection if/elif whose first branches are above (not shown):
# this branch builds the GloVe variant of GAIN for evaluation.
    test_loader = DGLREDataloader(test_set,
                                  batch_size=opt.test_batch_size,
                                  dataset_type='test')
    model = GAIN_GloVe(opt)
else:
    assert 1 == 2, 'please choose a model from [bert, bilstm].'

# The training set is only needed to build vocab/config; free it before eval.
import gc
del train_set
gc.collect()

# print(model.parameters)
print_params(model)

start_epoch = 1
pretrain_model = opt.pretrain_model
lr = opt.lr
model_name = opt.model_name

# Evaluation requires a trained checkpoint; refuse to run without one.
if pretrain_model != '':
    chkpt = torch.load(pretrain_model, map_location=torch.device('cpu'))
    model.load_state_dict(chkpt['checkpoint'])
    logging('load checkpoint from {}'.format(pretrain_model))
else:
    assert 1 == 2, 'please provide checkpoint to evaluate.'

model = get_cuda(model)
model.eval()
def evaluate(
        *,
        flow_constructor,
        seed,
        restore_checkpoint,
        total_bs,
        iw_samples=4096,
        dtype=tf.float32,
        dataset='cifar10',
        samples_filename='samples.png',
):
    """Evaluate a trained flow model under Horovod.

    Restores ``restore_checkpoint``, optionally writes a sample grid, then
    runs single-sample validation (bits/dim) followed by an importance-weighted
    (IWAE-style) bound using ``iw_samples`` samples per example.

    Keyword-only args:
        flow_constructor: builds (dequant_flow, flow, posterior_flow).
        seed: per-rank RNG seed offset.
        restore_checkpoint: checkpoint path to restore (``~`` expanded).
        total_bs: global batch size; must divide evenly across ranks.
        iw_samples: importance samples per example; must be a multiple of total_bs.
        dtype: TF dtype for placeholders/data.
        dataset: dataset name understood by ``load_data``.
        samples_filename: if truthy, a PNG sample grid is written here.
    """
    hvd, MPI, is_root, mpi_average = setup_horovod()
    restore_checkpoint = os.path.expanduser(restore_checkpoint)
    # Seeding and logging setup: distinct seed per rank.
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()
    assert iw_samples % total_bs == 0
    if is_root:
        print('===== EVALUATING {} ({} IW samples) ====='.format(
            restore_checkpoint, iw_samples))
    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    H, W, Cx = img_shp
    # Converts nats/batch to bits per dimension.
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)
    # Build graph
    if is_root:
        print('Building graph')
    dequant_flow, flow, posterior_flow = flow_constructor()
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    # This is a fake training graph. Just used to mimic flow_training, so we
    # can load from the saver
    build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        posterior_flow=posterior_flow,
        flow_kwargs=dict(vcfg=VarConfig(init=False, ema=None, dtype=dtype),
                         dropout_p=0,
                         verbose=is_root)
        # note dropout is 0: it doesn't matter
    )
    # EMA
    params = tf.trainable_variables()
    if is_root:
        print_params()
    ema = tf.train.ExponentialMovingAverage(
        decay=0.9999999999999)  # ema turned off
    maintain_averages_op = tf.group(ema.apply(params))
    # Validation and sampling (with EMA)
    if is_root:
        print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0.,
                           verbose=is_root)
    val_loss_sym, val_logratio_sym = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        posterior_flow=posterior_flow,
        flow_kwargs=val_flow_kwargs)
    # Collect per-sample log ratios from all ranks.
    allgathered_val_logratios_sym = hvd.allgather(val_logratio_sym)
    # for debugging invertibility
    # val_dequant_x_sym_rep = tf.reshape(tf.tile(tf.expand_dims(val_dequant_x_sym, 0), [sampling_times, 1, 1, 1, 1]), [-1] + val_dequant_x_sym.shape.as_list()[1:])
    # val_inverr_sym = tf.reduce_max(tf.abs(val_dequant_x_sym_rep - flow.inverse(val_y_sym, **val_flow_kwargs)[0][:,:,:,:img_shp[-1]]))
    if is_root:
        print('===== Sampling graph =====')
    samples_sym, _ = flow.sample(64, val_flow_kwargs)
    allgathered_samples_x_sym = hvd.allgather(tf.to_float(samples_sym))
    assert len(tf.trainable_variables()) == len(params)

    def run_iw_eval(sess):
        # Importance-weighted (IWAE) bound, one validation example at a time.
        if is_root:
            print('Running IW eval with {} samples...'.format(iw_samples))
        # Go through one example at a time
        all_val_losses = []
        for i_example in (trange if is_root else range)(len(data_val)):
            # take this single example and tile it across the local batch
            batch_x = np.tile(data_val[i_example, None, ...],
                              (local_bs, 1, 1, 1))
            # repeatedly evaluate logd for the IWAE bound
            batch_logratios = np.concatenate([
                sess.run(allgathered_val_logratios_sym, {x_sym: batch_x})
                for _ in range(iw_samples // total_bs)
            ]).astype(np.float64)
            assert batch_logratios.shape == (iw_samples, )
            # log [1/n \sum_i exp(r_i)] = log [exp(-b) 1/n \sum_i exp(r_i + b)] = -b + log [1/n \sum_i exp(r_i + b)]
            # (log-sum-exp shift for numerical stability)
            shift = batch_logratios.max()
            all_val_losses.append(
                -bpd_scale_factor *
                (shift + np.log(np.mean(np.exp(batch_logratios - shift)))))
            if i_example % 100 == 0 and is_root:
                print(i_example, np.mean(all_val_losses))
        if is_root:
            print(f'Final ({len(data_val)}):', np.mean(all_val_losses))

    def run_standard_eval(sess):
        if is_root:
            print('Running standard eval...')
        # Standard validation (single sample), sharded across ranks.
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses = np.concatenate([
            sess.run([val_loss_sym], {x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        if is_root:
            for k, v in [
                ('val_bpd', bpd_scale_factor * val_loss),
                ('num_val_examples', total_count * local_bs),
            ]:
                print(k, v)

    def run_sampling_only(sess):
        # Draw samples on every rank, save the gathered grid on root.
        samples = sess.run(allgathered_samples_x_sym)
        if is_root:
            from PIL import Image
            Image.fromarray(
                tile_imgs(np.clip(samples, 0,
                                  255).astype(np.uint8))).save(samples_filename)
            print('Saved {} samples to {}'.format(len(samples),
                                                  samples_filename))
            # print('Sampled in {} seconds'.format(sample_time))

    # Run
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    with tf.Session(config=config) as sess:
        if is_root:
            print('Initializing')
        sess.run(tf.global_variables_initializer())
        # Restore from checkpoint
        if is_root:
            print('Restoring checkpoint:', restore_checkpoint)
        saver = tf.train.Saver()
        saver.restore(sess, restore_checkpoint)
        print('Broadcasting initial parameters')
        sess.run(hvd.broadcast_global_variables(0))
        sess.graph.finalize()
        if samples_filename:
            run_sampling_only(sess)
        # Make sure data is the same on all MPI processes
        tmp_inds = [0, 183, 3, 6, 20, 88]
        check_batch = np.ascontiguousarray(data_val[tmp_inds])
        gathered_batches = np.zeros(
            (hvd.size(), *check_batch.shape),
            check_batch.dtype) if is_root else None
        MPI.COMM_WORLD.Gather(check_batch, gathered_batches, root=0)
        if is_root:
            assert all(
                np.allclose(check_batch, b)
                for b in gathered_batches), 'data must be in the same order!'
            print('data ordering ok')
        # Run validation
        run_standard_eval(sess)
        run_iw_eval(sess)
def main():
    """Driver for bit-width architecture search on a quantized network.

    Mirrors the training ``main`` elsewhere in this project (seeding, job
    directory, loggers, model construction) but, instead of training, hands the
    model to ``architecture_search`` to pick per-layer bit widths.
    """
    start_epoch = 0
    best_prec1 = 0.0
    # Random seed is itself randomly drawn, then applied everywhere.
    seed = np.random.randint(10000)
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
    if args.gpus is not None:
        device = torch.device("cuda:{}".format(args.gpus[0]))
        cudnn.benchmark = False
        cudnn.deterministic = True
        cudnn.enabled = True
    else:
        device = torch.device("cpu")
    # Per-run job directory: dataset/arch[/mission]/timestamp.
    now = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    if args.mission is not None:
        if 'vgg' == args.arch and args.batchnorm:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}_bn/{args.mission}/{now}'
        elif 'resnet20' == args.arch:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}/{args.mission}/{now}'
        else:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}/{args.mission}/{now}'
    else:
        if 'vgg' == args.arch and args.batchnorm:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}_bn/{now}'
        else:
            args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}/{now}'
    _make_dir(args.job_dir)
    ckpt = utils.checkpoint(args)
    print_logger = utils.get_logger(os.path.join(args.job_dir, "logger.log"))
    utils.print_params(vars(args), print_logger.info)
    # CSV log of the search trajectory.
    log_file = os.path.join(args.job_dir, 'search_log.csv')
    writer_train = SummaryWriter(args.job_dir + '/run/train')
    writer_test = SummaryWriter(args.job_dir + '/run/test')

    ## hyperparameters settings ##
    # Uniform starting bit-width list, one entry per quantized layer.
    n_layers = (args.num_layers - 2) * 2
    unit_k_bits = int(args.k_bits)
    kbits_list = [unit_k_bits for i in range(n_layers)]
    print_logger.info(f'k_bits_list {kbits_list}')

    # Data loading
    print('=> Preparing data..')
    if args.dataset in ['cifar10', 'cifar100', 'mnist']:
        IMAGE_SIZE = 32
    else:
        IMAGE_SIZE = 224
    if args.dataset == 'imagenet':
        # train_loader = get_imagenet_iter_dali(type = 'train',image_dir=args.data_dir, batch_size=args.train_batch_size,num_threads=args.workers,crop=IMAGE_SIZE,device_id=0,num_gpus=1)
        # val_loader = get_imagenet_iter_dali(type='val', image_dir=args.data_dir, batch_size=args.eval_batch_size,num_threads=args.workers,crop=IMAGE_SIZE,device_id=0,num_gpus=1)
        train_data = get_imagenet_iter_torch(type='train',
                                             image_dir=args.base_data_dir,
                                             batch_size=args.train_batch_size,
                                             num_threads=args.workers,
                                             crop=IMAGE_SIZE,
                                             device_id=0,
                                             num_gpus=1)
    elif args.dataset == 'cifar10':
        train_transform, test_transform = utils._data_transforms_cifar10(
            cutout=args.cutout)
        train_data = torchvision.datasets.CIFAR10(args.data_dir,
                                                  train=True,
                                                  transform=train_transform,
                                                  download=True)
        # test_data = torchvision.datasets.CIFAR10(args.data_dir,train=False, transform=test_transform, download=True)
        # train_loader = get_cifar_iter_dali(type='train', image_dir=args.data_dir, batch_size=args.train_batch_size,num_threads=args.workers)
        # val_loader = get_cifar_iter_dali(type='val', image_dir=args.data_dir, batch_size=args.eval_batch_size,num_threads=args.workers)

    # Create model
    print('=> Building model...')
    if args.dataset == 'cifar10' or args.dataset == 'mnist':
        num_classes = 10
        train_data_length = 50000
        eval_data_length = 10000
    elif args.dataset == 'imagenet':
        num_classes = 1000
        # NOTE(review): these lengths match CIFAR, not ImageNet — confirm.
        train_data_length = 50000
        eval_data_length = 10000
    if args.arch == 'mobilenetv2':
        model_config = {
            'k_bits': kbits_list,
            'num_layers': args.num_layers,
            'pre_k_bits': args.pre_k_bits,
            'ratio': args.ratio,
            'width_mult': args.width_mult
        }
    else:
        model_config = {
            'k_bits': kbits_list,
            'num_layers': args.num_layers,
            'pre_k_bits': args.pre_k_bits,
            'ratio': args.ratio
        }
    # Resolve the model constructor from models.<dataset>.<archtype>.<arch>.
    if 'vgg' == args.arch and args.batchnorm:
        model, model_k_bits = import_module(
            f"models.{args.dataset}.{args.archtype}.{args.arch}"
        ).__dict__[f'{args.arch}{args.num_layers}_bn'](model_config)
    elif 'resnet20' == args.arch:
        model, model_k_bits = import_module(
            f"models.{args.dataset}.{args.archtype}.{args.arch}"
        ).__dict__[f'{args.arch}'](model_config)
    else:
        model, model_k_bits = import_module(
            f"models.{args.dataset}.{args.archtype}.{args.arch}"
        ).__dict__[f'{args.arch}{args.num_layers}'](model_config)
    model = model.to(device)
    print_logger.info(f'model_k_bits_list {model_k_bits}')

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    # Optionally resume from a checkpoint; otherwise search starts from the
    # freshly initialized weights.
    resume = args.resume
    if resume:
        print('=> Loading checkpoint {}'.format(resume))
        checkpoint = torch.load(resume, map_location=device)
        state_dict = checkpoint['state_dict']
        start_epoch = checkpoint['epoch']
        pre_train_best_prec1 = checkpoint['best_prec1']
        model_check = load_check(state_dict, model)
        model.load_state_dict(model_check)
        print('Prec@1:', pre_train_best_prec1)
    else:
        checkpoint = model.state_dict()
    choose_model,k_bits = architecture_search(args=args,nn_model=model,device = device,checkpoint=checkpoint, \
        step=args.step,criterion=criterion,train_data=train_data,train_batch_size=args.train_batch_size, \
        eval_batch_size=args.eval_batch_size,train_data_length = train_data_length, \
        eval_data_length = eval_data_length,clip_value=args.grad_clip,lam=args.lam,\
        gpu_id = 0,print_logger = print_logger,ckpt = ckpt,log_file=log_file)
def train(
        *,
        flow_constructor,
        logdir,
        lr_schedule,
        dropout_p,
        seed,
        init_bs,
        total_bs,
        val_total_bs,
        ema_decay,
        steps_per_log,
        epochs_per_val,
        max_grad_norm,
        dtype=tf.float32,
        scale_loss=None,
        restore_checkpoint=None,
        scale_grad=None,
        dataset='cifar10',
        steps_per_samples=2000,  # NOTE(review): not referenced in this body — confirm whether sampling cadence was meant to use it
):
    """Train a flow model (dequant flow + flow + posterior flow) with Horovod.

    Builds four graphs — data-dependent init (only when starting fresh),
    training, EMA validation, and EMA sampling — then runs the training loop
    inside a single tf.Session, validating and checkpointing every
    ``epochs_per_val`` epochs.

    Keyword-only args (selection):
        flow_constructor: callable returning (dequant_flow, flow, posterior_flow).
        lr_schedule: callable step -> learning rate.
        total_bs / val_total_bs: global batch sizes; must divide evenly by
            the Horovod world size (asserted below).
        ema_decay: decay for the ExponentialMovingAverage over trainables.
        scale_loss / scale_grad: optional loss/gradient scaling factors
            (gradients are divided back after compute_gradients).
        restore_checkpoint: path like '.../model-12345'; the step count is
            parsed from the suffix after the last '-'.
    """
    hvd, MPI, is_root, mpi_average = setup_horovod()

    # Seeding and logging setup — per-rank seed so ranks draw different batches/noise.
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    assert val_total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()
    val_local_bs = val_total_bs // hvd.size()

    # Setting up the logger (root rank only; logdir name encodes world size + wall time).
    logger = None
    logdir = '{}_mpi{}_{}'.format(os.path.expanduser(logdir), hvd.size(),
                                  time.time())
    checkpointdir = os.path.join(logdir, 'checkpoints')
    if is_root:
        print('Floating point format:', dtype)
        pprint(locals())
        os.makedirs(logdir)
        os.makedirs(checkpointdir)
        logger = TensorBoardOutput(logdir)

    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    H, W, Cx = img_shp  # NOTE(review): H, W, Cx are unpacked but unused below
    # Converts nats/image to bits/dim.
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)

    # Build graph
    if is_root:
        print('Building graph')
    dequant_flow, flow, posterior_flow = flow_constructor()

    # Data-dependent init graph: only needed when not restoring from a checkpoint.
    if restore_checkpoint is None:
        if is_root:
            print('===== Init graph =====')
        x_init_sym = tf.placeholder(dtype, [init_bs] + img_shp)
        init_loss_sym, _ = build_forward(x=x_init_sym,
                                         dequant_flow=dequant_flow,
                                         flow=flow,
                                         posterior_flow=posterior_flow,
                                         flow_kwargs=dict(vcfg=VarConfig(
                                             init=True, ema=None, dtype=dtype),
                                             dropout_p=dropout_p,
                                             verbose=is_root))
    # NOTE(review): original formatting was ambiguous about whether this line sat
    # inside the init branch; it is kept unconditional so the GFLOPS print in
    # loop() works on the restore path too — confirm against the original repo.
    flops = int(get_flops()) / (10**9)

    # Training graph (non-EMA variables).
    if is_root:
        print('===== Training graph =====')
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    loss_sym, _ = build_forward(x=x_sym,
                                dequant_flow=dequant_flow,
                                flow=flow,
                                posterior_flow=posterior_flow,
                                flow_kwargs=dict(vcfg=VarConfig(init=False,
                                                                ema=None,
                                                                dtype=dtype),
                                                 dropout_p=dropout_p,
                                                 verbose=is_root))

    # EMA over all trainables; applied alongside every optimizer step.
    params = tf.trainable_variables()
    if is_root:
        print_params()
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    maintain_averages_op = tf.group(ema.apply(params))
    # Op for setting the ema params to the current non-ema params (for use after data-dependent init)
    name2var = {v.name: v for v in tf.global_variables()}
    copy_params_to_ema = tf.group([
        name2var[p.name.replace(':0', '') +
                 '/ExponentialMovingAverage:0'].assign(p) for p in params
    ])

    val_x_sym = tf.placeholder(dtype, [val_local_bs] + img_shp)
    # Validation and sampling (with EMA)
    if is_root:
        print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0.,
                           verbose=is_root)
    val_loss_sym, _ = build_forward(x=val_x_sym,
                                    dequant_flow=dequant_flow,
                                    flow=flow,
                                    posterior_flow=posterior_flow,
                                    flow_kwargs=val_flow_kwargs)
    # for debugging invertibility
    # val_inverr_sym = tf.reduce_max(tf.abs(dequant_x - flow.inverse(y,
    # train_flow_kwargs)[0][:,:,:,:img_shp[-1]]))

    if is_root:
        print('===== Sampling graph =====')
    sample_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                              dropout_p=0,
                              verbose=is_root)
    samples_sym, _ = flow.sample(val_local_bs, sample_flow_kwargs)
    # Gather every rank's samples so the root can tile and log all of them.
    allgathered_samples_x_sym = hvd.allgather(tf.to_float(samples_sym))

    # Sanity check: building val/sample graphs must not have created new trainables.
    assert len(tf.trainable_variables()) == len(params)

    def run_validation(sess, i_step):
        # Each rank evaluates its own shard of the validation set; losses are
        # then MPI-averaged across ranks.
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses = np.concatenate([
            sess.run([val_loss_sym], {val_x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=val_local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        samples = sess.run(allgathered_samples_x_sym)
        if is_root:
            logger.writekvs(
                [('val_bpd', bpd_scale_factor * val_loss),
                 ('num_val_examples', total_count * val_local_bs),
                 ('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    if is_root:
        print('===== Optimization graph =====')
    # Optimization: Adam wrapped in Horovod's allreduce optimizer, with
    # optional loss scaling, gradient scaling, and global-norm clipping.
    lr_sym = tf.placeholder(dtype, [], 'lr')
    optimizer = hvd.DistributedOptimizer(tf.train.AdamOptimizer(lr_sym))
    if scale_loss is None:
        grads_and_vars = optimizer.compute_gradients(loss_sym, var_list=params)
    else:
        # Scale the loss up before differentiation, then scale gradients back down.
        grads_and_vars = [(g / scale_loss, v)
                          for (g, v) in optimizer.compute_gradients(
                              loss_sym * scale_loss, var_list=params)]
    if scale_grad is not None:
        grads_and_vars = [(g / scale_grad, v) for (g, v) in grads_and_vars]
    if max_grad_norm is not None:
        clipped_grads, grad_norm_sym = tf.clip_by_global_norm(
            [g for (g, _) in grads_and_vars], max_grad_norm)
        grads_and_vars = [
            (cg, v) for (cg, (_, v)) in zip(clipped_grads, grads_and_vars)
        ]
    else:
        grad_norm_sym = tf.constant(0.)
    # One op = apply gradients + update EMA shadow variables.
    opt_sym = tf.group(optimizer.apply_gradients(grads_and_vars),
                       maintain_averages_op)

    def loop(sess: tf.Session):
        i_step = 0
        if is_root:
            print('Initializing')
        sess.run(tf.global_variables_initializer())

        if restore_checkpoint is not None:
            # Restore from checkpoint (root only; other ranks get the weights
            # via the broadcast below).
            if is_root:
                saver = tf.train.Saver()
                print('Restoring checkpoint:', restore_checkpoint)
                # Step count is encoded as the '-NNN' suffix of the checkpoint path.
                restore_step = int(restore_checkpoint.split('-')[-1])
                print('Restoring from step:', restore_step)
                saver.restore(sess, restore_checkpoint)
                i_step = restore_step
            else:
                saver = None
        else:
            # No checkpoint: perform data dependent initialization
            if is_root:
                print('Data dependent init')
            init_loss = sess.run(
                init_loss_sym, {
                    x_init_sym:
                    data_train[np.random.randint(0, data_train.shape[0],
                                                 init_bs)]
                })
            if is_root:
                print('Init loss:', init_loss * bpd_scale_factor)
            # Seed the EMA shadow variables with the freshly-initialized params.
            sess.run(copy_params_to_ema)
            saver = tf.train.Saver() if is_root else None

        if is_root:
            print('Broadcasting initial parameters')
        sess.run(hvd.broadcast_global_variables(0))
        # Freeze the graph: any accidental op creation in the loop now raises.
        sess.graph.finalize()

        if is_root:
            print('Training')
            print(f'Total GFLOPS: {flops}')
            print_params()

        loss_hist = deque(maxlen=steps_per_log)
        gnorm_hist = deque(maxlen=steps_per_log)
        for i_epoch in range(99999999999):
            # Validate + checkpoint at the top of every epochs_per_val-th epoch
            # (including epoch 0, before any training).
            if i_epoch % epochs_per_val == 0:
                run_validation(sess, i_step=i_step)
                if saver is not None:
                    saver.save(sess,
                               os.path.join(checkpointdir, 'model'),
                               global_step=i_step)

            epoch_start_t = time.time()
            for i_epoch_step, (batch, ) in enumerate(
                    iterbatches(
                        # non-sharded: each gpu goes through the whole dataset
                        [data_train],
                        batch_size=local_bs,
                        include_final_partial_batch=False,
                    )):
                lr = lr_schedule(i_step)
                loss, gnorm, _ = sess.run([loss_sym, grad_norm_sym, opt_sym], {
                    x_sym: batch,
                    lr_sym: lr
                })
                loss_hist.append(loss)
                gnorm_hist.append(gnorm)

                # Skip timing the very first step, which will be unusually slow due to TF initialization
                if i_epoch == i_epoch_step == 0:
                    epoch_start_t = time.time()

                if i_step % steps_per_log == 0:
                    # Gather per-rank running means so the root can log a global mean.
                    loss_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(loss_hist)),
                                                            root=0)
                    gnorm_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(gnorm_hist)),
                                                             root=0)
                    steps_per_sec = (i_epoch_step + 1) / (time.time() -
                                                          epoch_start_t)
                    if is_root:
                        kvs = [
                            ('iter', i_step),
                            ('epoch', i_epoch + i_epoch_step * local_bs /
                             data_train.shape[0]),  # epoch for this gpu
                            ('bpd',
                             float(
                                 np.mean(loss_hist_means) *
                                 bpd_scale_factor)),
                            ('gnorm', float(np.mean(gnorm_hist_means))),
                            ('lr', float(lr)),
                            # ('fps', steps_per_sec * total_bs),  # fps calculated over all gpus (this epoch)
                            ('sps', steps_per_sec),
                        ]
                        logger.writekvs(kvs, i_step)
                i_step += 1
            # End of epoch

    # Train
    config = tf.ConfigProto()
    # config.log_device_placement = True
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    if is_root:
        print('===== Creating session =====')
    with tf.Session(config=config) as sess:
        loop(sess)
def main():
    """CLI entry point: train an RNN or LSTM regressor on district data.

    Parses hyperparameters from the command line, trains for ``--epochs``
    epochs, checkpoints the best model (by validation loss, only after the
    first third of training) to a path derived from the hyperparameters,
    and finally reports the loss on the test split.
    """
    parser = argparse.ArgumentParser(description="==========[RNN]==========")
    parser.add_argument("--mode", default="train",
                        help="available modes: train, test, eval")
    parser.add_argument("--model", default="rnn",
                        help="available models: rnn, lstm")
    parser.add_argument("--dataset", default="all",
                        help="available datasets: all, MA, MI, TN")
    parser.add_argument("--rnn_layers", default=3, type=int,
                        help="number of stacked rnn layers")
    parser.add_argument("--hidden_dim", default=16, type=int,
                        help="number of hidden dimensions")
    parser.add_argument("--lin_layers", default=1, type=int,
                        help="number of linear layers before output")
    parser.add_argument("--epochs", default=100, type=int,
                        help="number of max training epochs")
    parser.add_argument("--dropout", default=0.0, type=float,
                        help="dropout probability")
    parser.add_argument("--learning_rate", default=0.01, type=float,
                        help="learning rate")
    parser.add_argument("--verbose", default=2, type=int,
                        help="how much training output?")
    options = parser.parse_args()
    verbose = options.verbose

    if torch.cuda.is_available():
        device = torch.device("cuda")
        if verbose > 0:
            print("GPU available, using cuda...")
            print()
    else:
        device = torch.device("cpu")
        if verbose > 0:
            print("No available GPU, using CPU...")
            print()

    params = {
        "MODE": options.mode,
        "MODEL": options.model,
        "DATASET": options.dataset,
        "RNN_LAYERS": options.rnn_layers,
        "HIDDEN_DIM": options.hidden_dim,
        "LIN_LAYERS": options.lin_layers,
        "EPOCHS": options.epochs,
        "DROPOUT_PROB": options.dropout,
        "LEARNING_RATE": options.learning_rate,
        "DEVICE": device,
        "OUTPUT_SIZE": 1
    }
    # Checkpoint path encodes the full hyperparameter configuration.
    params["PATH"] = "models/" + params["MODEL"] + "_" + params[
        "DATASET"] + "_" + str(params["RNN_LAYERS"]) + "_" + str(
            params["HIDDEN_DIM"]) + "_" + str(
                params["LIN_LAYERS"]) + "_" + str(
                    params["LEARNING_RATE"]) + "_" + str(
                        params["DROPOUT_PROB"]) + "_" + str(
                            params["EPOCHS"]) + "_model.pt"

    # NOTE(review): --mode is parsed but never dispatched on; training always
    # runs regardless of its value — confirm whether test/eval modes were
    # meant to short-circuit here.
    train_data = utils.DistrictData(params["DATASET"], "train")
    val_data = utils.DistrictData(params["DATASET"], "val")
    # Input size is inferred from the feature dimension of the first sample.
    params["INPUT_SIZE"] = train_data[0]['sequence'].size()[1]

    if params["MODEL"] == "rnn":
        model = RNN(params)
    elif params["MODEL"] == "lstm":
        model = LSTM(params)
    model.to(params["DEVICE"])

    criterion = nn.MSELoss(reduction='sum')
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=params["LEARNING_RATE"])

    if verbose == 0:
        print(params["PATH"])
    else:
        utils.print_params(params)
        print("Beginning training...")
        print()

    since = time.time()
    best_val_loss = 10.0  # NOTE(review): magic threshold — no checkpoint is
    # ever written if validation loss never drops below 10.0
    for e in range(params["EPOCHS"]):
        running_loss = 0.0
        model.train()
        train_loader = DataLoader(train_data,
                                  batch_size=32,
                                  shuffle=True,
                                  num_workers=4)
        for batch in train_loader:
            x = batch['sequence'].to(device)
            y = batch['target'].to(device)
            seq_len = batch['size'].to(device)
            optimizer.zero_grad()
            y_hat, hidden = model(x, seq_len)
            loss = criterion(y_hat, y)
            # Fix: accumulate the Python float, not the tensor. Accumulating
            # the tensor kept every batch's autograd graph alive for the whole
            # epoch (unbounded memory growth) and made mean_loss a tensor.
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
        mean_loss = running_loss / len(train_data)

        val_loss = evaluate(val_data, model, params, criterion,
                            validation=True)
        if verbose == 2 or (verbose == 1 and (e + 1) % 100 == 0):
            print('=' * 25 + ' EPOCH {}/{} '.format(e + 1, params["EPOCHS"]) +
                  '=' * 25)
            print('Training Loss: {}'.format(mean_loss))
            print('Validation Loss: {}'.format(val_loss))
            print()
        # Only start checkpointing after the first third of training, to skip
        # the noisy early epochs.
        if e > params["EPOCHS"] / 3:
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = model.state_dict()
                torch.save(best_model, params["PATH"])

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Final Training Loss: {:4f}'.format(mean_loss))
    print('Best Validation Loss: {:4f}'.format(best_val_loss))

    # NOTE(review): the test evaluation uses the model from the *last* epoch,
    # not the best checkpoint saved above — confirm this is intended.
    test_data = utils.DistrictData(params["DATASET"], "test")
    test_loss = evaluate(test_data, model, params, criterion)
    print('Test Loss: {}'.format(test_loss))
    print()
def main():
    """CLI entry point: build a numerical JSON model specification for a
    quantum state diffusion simulation.

    Parses simulation parameters, selects a physical regime (Jaynes-Cummings
    absorptive bistable, Kerr bistable/qubit variants) for one or two coupled
    systems, and writes the generated system spec to a JSON file whose name
    encodes all parameters.
    """
    parser = get_parser()
    try:
        args = parser.parse_args()
    # NOTE(review): bare except also swallows the SystemExit argparse raises
    # on bad arguments (and KeyboardInterrupt) — consider narrowing.
    except:
        print("Unable to get parser, exiting now...")
        sys.exit(0)

    ############################################################################
    #### Sample output directory and file name, tested locally.
    # out_dir="/Users/gil/Google Drive/repos/quantum_state_diffusion/num_json_specifications"
    # json_file_name="tmp_file.json"
    # json_file_dir=os.path.join(out_dir, json_file_name)
    # make_one_system_example(json_file_dir)
    # make_two_system_example(json_file_dir)
    ############################################################################

    ############################################################################
    #### Set up commands from parser
    #### Sample call from command line
    # python /scratch/users/tabakg/qsd_dev/generate_num_model.py --output_dir '/scratch/users/tabakg/qsd_output/json_spec/' --Nfock_a 30 \
    # --seed 1 --regime 'kerr_bistableA21.75' --num_systems 2 --delta_t 1e-05 --duration 0.2 --downsample 100 \
    # --sdeint_method_name 'itoImplicitEuler' --R 1.0 --eps 1.0 --noise_amp 1.0 --lambda 0.999
    ############################################################################

    # Mirror every CLI argument into a params dict (used for printing) while
    # also binding each value to a local name.
    params = dict()
    ntraj = params['Ntraj'] = args.ntraj
    seed = params['seed'] = args.seed
    duration = params['duration'] = args.duration
    delta_t = params['delta_t'] = args.delta_t
    Nfock_a = params['Nfock_a'] = args.Nfock_a
    Nfock_j = params['Nfock_j'] = args.Nfock_j
    downsample = params['downsample'] = args.downsample
    Regime = params['regime'] = args.regime
    num_systems = params['num_systems'] = args.num_systems
    drive_second_system = params[
        'drive_second_system'] = args.drive_second_system

    # Empty method name falls back to the itoEuler integrator.
    if args.sdeint_method_name == "":
        logging.info("sdeint_method_name not set. Using itoEuler as a default.")
        sdeint_method_name = params['sdeint_method_name'] = "itoEuler"
    else:
        sdeint_method_name = params[
            'sdeint_method_name'] = args.sdeint_method_name

    R = params['R'] = args.R
    eps = params['eps'] = args.eps
    noise_amp = params['noise_amp'] = args.noise_amp
    lambd = params['lambd'] = args.lambd
    trans_phase = params['trans_phase'] = args.trans_phase

    # Does the user want to print verbose output?
    quiet = args.quiet
    if not quiet:
        print_params(params=params)

    #### output directory and file name, generated from inputs
    params_args = (Regime, seed, ntraj, delta_t, Nfock_a, Nfock_j, duration,
                   downsample, sdeint_method_name, num_systems, R, eps,
                   noise_amp, lambd, trans_phase, drive_second_system)
    param_str = make_params_string(params_args)
    json_file_name = "json_spec_" + param_str + ".json"
    json_file_dir = os.path.join(args.output_dir, json_file_name)
    print("output file location is ", json_file_dir)

    # NOTE(review): tspan is computed but not used in this function — verify
    # whether the generators are expected to derive it themselves.
    tspan = np.arange(0, duration, delta_t)

    if num_systems == 1:
        # Single-system regimes: each branch builds (H, psi0, Ls, observables).
        if Regime == "absorptive_bistable":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_JC(
                Nfock_a, Nfock_j)
        elif Regime == "kerr_bistable":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable(
                Nfock_a)
        elif Regime[:len(
                "kerr_bistable"
        )] == "kerr_bistable":  ##inputs in this case are e.g. kerr_bistableA33.25_...
            which_kerr = Regime[len(
                "kerr_bistable")]  ## e.g. A in kerr_bistableA33.25_
            custom_drive = float(Regime[len("kerr_bistableA"):]
                                 )  ## e.g. 33.25 in kerr_bistableA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime_chose_drive(
                Nfock_a, which_kerr, custom_drive)
        elif Regime == "kerr_qubit":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_qubit(Nfock_a)
        else:
            logging.error("Unknown regime, %s, or not implemented yet.",
                          Regime)
            raise ValueError("Unknown regime, or not implemented yet.")
        gen_num_system(json_file_dir,
                       H,
                       psi0,
                       duration,
                       delta_t,
                       Ls,
                       sdeint_method_name,
                       obsq=obsq_data,
                       downsample=downsample,
                       ntraj=ntraj,
                       seed=seed)
    elif num_systems == 2:
        # Two-system regimes: each branch builds separate Hamiltonians (H1, H2)
        # and Lindblad lists (L1s, L2s) for the coupled pair.
        if Regime == "absorptive_bistable":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_JC_two_systems(
                Nfock_a, Nfock_j, drive_second_system)
        elif Regime == "kerr_bistable":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_bistable_two_systems(
                Nfock_a, drive_second_system)
        elif Regime == "kerr_qubit":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_qubit_two_systems(
                Nfock_a, drive_second_system)
        elif Regime[:len("empty_then_kerr"
                         )] == 'empty_then_kerr':  ##e.g. empty_then_kerrA33.25
            which_kerr = Regime[len(
                "empty_then_kerr")]  ## e.g. A in empty_then_kerrA33.25_
            custom_drive = float(Regime[len("empty_then_kerrA"):]
                                 )  ## e.g. 33.25 in empty_then_kerrA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_empty_then_kerr(
                Nfock_a, which_kerr, custom_drive)
        elif Regime[:len(
                "kerr_bistable"
        )] == "kerr_bistable":  ##inputs in this case are e.g. kerr_bistableA33.25_...
            which_kerr = Regime[len(
                "kerr_bistable")]  ## e.g. A in kerr_bistableA33.25_
            custom_drive = float(Regime[len("kerr_bistableA"):]
                                 )  ## e.g. 33.25 in kerr_bistableA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_bistable_regime_chose_drive_two_systems(
                Nfock_a, which_kerr, custom_drive)
        else:
            logging.error("Unknown regime, %s, or not implemented yet.",
                          Regime)
            raise ValueError("Unknown regime, or not implemented yet.")
        # NOTE(review): trans_phase is parsed from the CLI (and encoded in the
        # file name) but a hard-coded trans_phase=None is passed here — confirm
        # whether args.trans_phase was meant to be forwarded.
        gen_num_system_two_systems(json_file_dir,
                                   H1,
                                   H2,
                                   psi0,
                                   duration,
                                   delta_t,
                                   L1s,
                                   L2s,
                                   R,
                                   eps,
                                   noise_amp,
                                   lambd,
                                   sdeint_method_name,
                                   trans_phase=None,
                                   obsq=obsq_data,
                                   downsample=downsample,
                                   ops_on_whole_space=False,
                                   ntraj=ntraj,
                                   seed=seed)
    else:  ## num_systems not equal to 1 or 2
        logging.error("Unknown num_systems, %s, or not implemented yet.",
                      num_systems)
        raise ValueError("Unknown num_systems, or not implemented yet.")