def post(self):
    """Handle a training request.

    Expects a JSON body with 'name', 'X' and 'Y' keys, forwards them to
    ``model.train`` and responds with ``{'success': True}`` on success or
    ``{'success': False, 'error': <repr>}`` on any failure.
    """
    try:
        data = json.loads(self.request.body)
        model.train(data['name'], data['X'], data['Y'])
        self.write({'success': True})
    except Exception as ex:
        # Request-handler boundary: report the failure to the client instead
        # of crashing the request. repr(ex) is the idiomatic spelling of the
        # original ex.__repr__() dunder call.
        self.write({'success': False, 'error': repr(ex)})
def train():
    """Run one epoch of truncated-BPTT language-model training.

    Uses module-level globals: model, corpus, criterion, train_data, args,
    lr, epoch. Performs manual SGD updates and logs every
    ``args.log_interval`` batches.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try
        # backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # FIX: clip_grad_norm_ (trailing underscore) is the non-deprecated,
        # in-place spelling; it helps prevent exploding gradients in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # Manual SGD step; the alpha= keyword is the non-deprecated overload
        # of the old add_(-lr, grad) form.
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        # FIX: accumulate a Python float; indexing a 0-dim tensor with [0]
        # (the old total_loss[0]) fails on modern PyTorch.
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss,
                      math.exp(cur_loss)))
            total_loss = 0.
            start_time = time.time()
def train(epoch, optimizer, quesfeaShu, labelShu, lengthShu):
    """Train for one epoch over pre-shuffled question features/labels/lengths.

    Batches are sorted by sequence length (descending) before being fed to
    the model, as packed-sequence RNNs require.
    """
    losses = AverageMeter()
    top1 = AverageMeter()
    model.train()
    # FIX: explicit integer division (// ) — plain / would be a float in py3.
    n_batches = len(quesfeaShu) // args.batch_size
    for i in range(0, n_batches):
        if i == n_batches - 1:
            batchend = len(quesfeaShu)  # last batch absorbs the remainder
        else:
            batchend = (i + 1) * args.batch_size
        batchstart = i * args.batch_size
        batch_size = batchend - batchstart
        quesfeabatch = []
        labelbatch = []
        lengthbatch = []
        quesfeaOri = quesfeaShu[batchstart:batchend]
        labelOri = labelShu[batchstart:batchend]
        lengthOri = lengthShu[batchstart:batchend]
        # Sort the batch indices by sequence length, longest first.
        idxbatch = sorted(range(len(lengthOri)), key=lambda x: lengthOri[x],
                          reverse=True)
        for j in range(len(idxbatch)):
            quesfeabatch.append(quesfeaOri[idxbatch[j]])
            labelbatch.append(labelOri[idxbatch[j]])
            lengthbatch.append(lengthOri[idxbatch[j]])
        questrainarray = np.asarray(quesfeabatch)
        labeltrainarray = np.asarray(labelbatch)
        lengthtrainarray = np.asarray(lengthbatch)
        tmp = [questrainarray, labeltrainarray, lengthtrainarray]
        tmp = [Variable(torch.from_numpy(_), requires_grad=False) for _ in tmp]
        trques, trlabel, length = tmp
        if args.cuda:
            # BUG FIX: .cuda() is not in-place; the original discarded the
            # result, leaving trlabel on the CPU.
            trlabel = trlabel.cuda()
        output = model(trques, length)
        loss = criterion(output, trlabel) / batch_size
        prec1, = accuracy(output.data, trlabel.data, topk=(1,))
        losses.update(loss.data[0], batch_size)
        top1.update(prec1[0], batch_size)
        optimizer.zero_grad()
        loss.backward()
        # BUG FIX: gradient clipping must happen between backward() and
        # step(); the original clipped AFTER optimizer.step(), which has no
        # effect on the update.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()
        print(str(top1.avg) + ' ' + str(top1.val) + ' ' + str(loss.data[0]) +
              ' ' + 'batch ' + str(i))
    print(str(top1.avg) + ' ' + str(top1.val) + ' ' + str(loss.data[0]) +
          ' ' + 'epoch ' + str(epoch))
def train():
    """One epoch of language-model training with randomized BPTT lengths."""
    # QRNN layers keep internal state across calls; reset at epoch start.
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch = 0
    i = 0
    limit = train_data.size(0) - 1 - 1
    while i < limit:
        # Occasionally halve the base BPTT length, then add Gaussian jitter.
        base_bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths.
        seq_len = max(5, int(np.random.normal(base_bptt, 5)))
        # There's a very small chance that it could select a very long
        # sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        # Scale the learning rate by actual/nominal sequence length so
        # shorter chunks contribute proportionally smaller updates.
        saved_lr = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = saved_lr * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Detach the hidden state so backprop stops at the batch boundary
        # instead of reaching back to the start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)

        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * h.pow(2).mean()
                              for h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta * (h[1:] - h[:-1]).pow(2).mean()
                              for h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem
        # in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = saved_lr
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                      epoch, batch, len(train_data) // args.bptt,
                      optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / args.log_interval,
                      cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
def get_af_posterior():
    """ Get posterior estimate of each AF. """
    # print() call form: the rest of this file already uses Python 3 prints.
    print('Training model and generate posterior for each atlas ...')
    #-- directory config
    db_dir = r'/nfs/t2/atlas/database'
    base_dir = r'/nfs/h1/workingshop/huanglijie/autoroi'
    doc_dir = os.path.join(base_dir, 'doc')
    data_dir = os.path.join(base_dir, 'ma_202', 'r_fc')
    #-- load session ID list for training
    sessid_file = os.path.join(doc_dir, 'sessid')
    # FIX: use a context manager so the file handle is closed (the original
    # open(...).readlines() leaked the handle).
    with open(sessid_file) as f:
        sessid = [line.strip() for line in f]
    #-- parameter config
    class_label = [1, 3]
    #-- model training and testing
    forest_list, classes_list, spatial_ptn = model.train(sessid, data_dir)
    model.get_posterior_map(sessid, data_dir, class_label, forest_list,
                            classes_list, spatial_ptn, save_nifti=True,
                            probabilistic=False)
def model_testing_with_LOOCV_single_atlas():
    """ Training a model with one atlas selected. """
    # print() call form for Python 3 compatibility.
    print('Traing model with one atlas selected and test it with ' +
          'leave-one-out cross-validation ...')
    #-- directory config
    db_dir = r'/nfs/t2/atlas/database'
    base_dir = r'/nfs/h1/workingshop/huanglijie/autoroi'
    doc_dir = os.path.join(base_dir, 'doc')
    data_dir = os.path.join(base_dir, 'ma_202', 'r_fc')
    #-- load session ID list for training
    sessid_file = os.path.join(doc_dir, 'sessid')
    # FIX: context manager closes the file handle (original leaked it).
    with open(sessid_file) as f:
        sessid = [line.strip() for line in f]
    #-- parameter config
    class_label = [1, 3]
    #atlas_num = [50]
    #atlas_num = [1, 5] + range(10, 201, 10)
    atlas_num = range(1, 202)
    #iter_num = 50
    #-- model training and testing
    forest_list, classes_list, spatial_ptn = model.train(sessid, data_dir)
    dice = model.leave_one_out_test(sessid, atlas_num, data_dir, class_label,
                                    forest_list, classes_list, spatial_ptn,
                                    single_atlas=True)
    #-- save dice to a file
    model.save_dice(dice, data_dir)
def train():
    """Build the graph and run the training loop, checkpointing periodically.

    NOTE(review): uses TF1-era APIs (tf.all_variables, tf.merge_all_summaries,
    tf.initialize_all_variables, tf.train.SummaryWriter) that only exist on
    old TensorFlow releases — confirm the pinned TF version before upgrading.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        image, label = input.get_input(LABEL_PATH, LABEL_FORMAT, IMAGE_PATH,
                                       IMAGE_FORMAT)
        logits = model.inference(image)
        loss = model.loss(logits, label)
        train_op = model.train(loss, global_step)
        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=input.FLAGS.log_device_placement))
        sess.run(init)
        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(input.FLAGS.train_dir,
                                                graph_def=sess.graph_def)
        # FIX: range() instead of Python-2-only xrange(); this file already
        # uses Python 3 print() calls.
        for step in range(input.FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            assert not np.isnan(loss_value), "Model diverged with loss = NaN"
            if step % 1 == 0:
                num_examples_per_step = input.FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)"
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            # Save the model checkpoint periodically.
            if step % 25 == 0:
                checkpoint_path = os.path.join(input.FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
def train():
    """Training loop with optional checkpoint resume.

    NOTE(review): TF1-era APIs (tf.all_variables, tf.merge_all_summaries,
    tf.initialize_all_variables, tf.train.SummaryWriter) — only valid on old
    TensorFlow releases.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        images, labels = model.distorted_inputs()
        logits = model.inference(images)
        loss = model.loss(logits, labels)
        train_op = model.train(loss, global_step)
        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        # Resume from the latest checkpoint if requested and available; the
        # step counter is parsed from the checkpoint filename suffix.
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if FLAGS.resume_training and ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            current_step = int(ckpt.model_checkpoint_path
                               .split('/')[-1].split('-')[-1])
        else:
            current_step = 0
        init = tf.initialize_all_variables()
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(SUMMARY_DIR,
                                                graph_def=sess.graph_def)
        # FIX: range() instead of Python-2-only xrange().
        for step in range(current_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f'
                              '(%.1f examples/sec; %.3f'
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step % 50 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def train(self, epoch_idx, batch_size, max_norm):
    """Run one training epoch over mini-batches of size ``batch_size``.

    Clips gradients to ``max_norm`` when it is positive; logs the loss and
    total gradient norm at debug level for each batch.
    """
    logger, model, data = self.logger, self.model, self.data
    logger.info('At %d-th epoch with lr %f.', epoch_idx,
                self.optimizer.param_groups[0]['lr'])
    model.train()
    total_batches = ceil(data.nb_train / batch_size)
    batch_iter = data.train_batch_sample(batch_size)
    for src, src_mask, trg, _ in tqdm(batch_iter, total=total_batches):
        out = model(src, src_mask, trg)
        # Targets are shifted by one position relative to the decoder input.
        loss = model.loss(out, trg[1:])
        self.optimizer.zero_grad()
        loss.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        logger.debug('loss %f with total grad norm %f',
                     loss, util.grad_norm(model.parameters()))
        self.optimizer.step()
def main():
    """Load image data, build or load the model, then train/visualize/save.

    Training data consists of 60000 images and 60000 labels; testing data of
    10000 images and 10000 labels. Each image is 784 (28x28) pixels, each a
    value from 0 to 255.0 corresponding to its darkness or lightness. Each
    input needs to be a list of numpy arrays to be valid.
    """
    # Load all of the data
    print("Loading data...")
    test_images = data.load_data(LIMITED)
    train_images = data.load_data(LIMITED, "train-images.idx3-ubyte",
                                  "train-labels.idx1-ubyte")
    print("Normalizing data...")
    X_train, Y_train = data.convert_image_data(train_images)
    X_test, Y_test = data.convert_image_data(test_images)
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
    X_test = np.array(X_test)
    Y_test = np.array(Y_test)
    # FIX: truthiness tests instead of `== False` / `== True` comparisons.
    if not LOAD:
        print("Building the model...")
        _model = model.build()
    else:
        print("Loading the model...")
        elements = os.listdir("model")
        if len(elements) == 0:
            # NOTE(review): _model stays unbound on this path, so the
            # training/visualize calls below would raise NameError — confirm
            # whether this should abort instead.
            print("No models to load.")
        else:
            # NOTE(review): os.listdir order is arbitrary; 'last' element is
            # not necessarily the newest model — consider sorting.
            _model = model.load(elements[-1])
    if TRAIN:
        print("Training the model...")
        model.train(_model, X_train, Y_train, X_test, Y_test)
    if VISUALIZE:
        model.visualize(_model, test_images, VISUALIZE_TO_FILE)
    if TRAIN:
        print("Saving the model...")
        model.save(_model)
def update():
    """Retrain the relevance model and refresh the doc_relevance table.

    Reads labelled docs, trains a model, scores the unlabelled docs, rewrites
    doc_relevance with the top-1000 predictions and stamps meta.last_updated.
    """
    conn = sqlite3.connect('data/textpile.db')
    try:
        docs, labels = query_train(conn)
        model = train(docs, labels, **param)
        docs, doc_ids = query_predict(conn)
        preds = predict(model, docs, doc_ids, topk=1000)
        # Full refresh: drop all previous relevance rows before inserting.
        conn.execute('DELETE FROM doc_relevance')
        sql = 'INSERT INTO doc_relevance (doc_id, relevance, explain_json) VALUES (?,?,?)'
        res = ((id, sco, json.dumps(exp)) for id, lab, sco, exp in preds)
        conn.executemany(sql, res)
        sql = 'UPDATE meta SET value = ? WHERE key = \'last_updated\''
        now = dt.datetime.utcnow().isoformat(' ')[:19]
        conn.execute(sql, [now])
        conn.commit()
    finally:
        # BUG FIX: the connection was never closed (resource leak).
        conn.close()
def main(data_dir, out_dir, n_iter=10, vector_len=300, vocab_size=20000,
         hidden_len=300, depth=3, drop_rate=0.3, rho=1e-4, batch_size=24):
    """Train the network on data_dir/'train' and evaluate on data_dir/'test'."""
    print("Loading")
    nlp = spacy.en.English(parser=False)
    dataset = Dataset(nlp, data_dir / 'train', batch_size)
    print("Training")
    # Checkpoints are written as model_<epoch>.pickle under out_dir.
    writer = model_writer(out_dir, 'model_{epoch}.pickle')
    network = model.train(dataset, vector_len, hidden_len, 2, vocab_size,
                          depth, drop_rate, rho, n_iter, writer)
    score = model.Scorer()
    print("Evaluating")
    for doc, label in read_data(nlp, data_dir / 'test'):
        word_ids, embeddings = model.get_words(doc, 0.0, vocab_size)
        prediction = network.forward(word_ids, embeddings)
        score += prediction == label
    print(score)
def model_testing_independent():
    """ Training model and test it with an independent dataset. """
    # print() call form for Python 3 compatibility.
    print('Traing model and test it with an independent dataset.')
    #-- directory config
    db_dir = r'/nfs/t2/atlas/database'
    base_dir = r'/nfs/h1/workingshop/huanglijie/autoroi'
    doc_dir = os.path.join(base_dir, 'doc')
    data_dir = os.path.join(base_dir, 'multi-atlas', 'l_sts')
    #-- load session ID list for training
    sessid_file = os.path.join(doc_dir, 'sessid')
    # FIX: context manager closes the file handle (original leaked it).
    with open(sessid_file) as f:
        sessid = [line.strip() for line in f]
    #-- parameter config
    class_label = [8, 10, 12]
    atlas_num = [40]
    #atlas_num = [1, 5] + range(10, 201, 10)
    #atlas_num = range(1, 201)
    #-- model training
    forest_list, classes_list, spatial_ptn = model.train(sessid, data_dir)
    #-- load mask coordinate derived from training dataset
    mask_coords = lib.load_mask_coord(data_dir)
    #-- load testing dataset
    test_dir = r'/nfs/h1/workingshop/huanglijie/autoroi/multi-atlas/group08'
    loc_dir = os.path.join(test_dir, 'localizer')
    pred_dir = os.path.join(test_dir, 'predicted_files', 'l_sts')
    test_sessid_file = os.path.join(test_dir, 'sessid')
    with open(test_sessid_file) as f:
        test_sessid = [line.strip() for line in f]
    for subj in test_sessid:
        zstat_file = os.path.join(loc_dir, subj + '_face_obj_zstat.nii.gz')
        feature_name, sample_data = lib.ext_sample(zstat_file, mask_coords,
                                                   class_label)
        model.predict(sample_data, atlas_num, pred_dir, subj + '_pred.nii.gz',
                      class_label, forest_list, classes_list, spatial_ptn)
def forest_parameter_selection():
    """ Assessment of impact of forest parameters. """
    # print() call form for Python 3 compatibility.
    print('Assess the imapct of forest parameters with leave-one-out ' +
          'cross-validation ...')
    #-- directory config
    db_dir = r'/nfs/t2/atlas/database'
    base_dir = r'/nfs/h1/workingshop/huanglijie/autoroi'
    doc_dir = os.path.join(base_dir, 'doc')
    data_dir = os.path.join(base_dir, 'ma_202', 'l_sts')
    #-- load session ID list for training
    sessid_file = os.path.join(doc_dir, 'sessid')
    # FIX: context manager closes the file handle (original leaked it).
    with open(sessid_file) as f:
        sessid = [line.strip() for line in f]
    #-- parameter config
    class_label = [8, 10, 12]
    atlas_num = [40]
    #atlas_num = [1, 5] + range(10, 201, 10)
    #atlas_num = range(1, 201)
    # Grid-search over tree count and depth.
    tree_num = range(10, 41, 5)
    tree_depth = range(10, 41, 5)
    for n in tree_num:
        for d in tree_depth:
            print('number - %s, depth - %s' % (n, d))
            forest_list, classes_list, spatial_ptn = model.train(
                sessid, data_dir, n_tree=n, d_tree=d)
            dice = model.leave_one_out_test(sessid, atlas_num, data_dir,
                                            class_label, forest_list,
                                            classes_list, spatial_ptn)
            # save dice to a file
            model.save_dice(dice, data_dir)
def model_training_with_LOOCV_testing():
    """ Training model and test it with leave-one-out cross-validation. """
    # print() call form for Python 3 compatibility.
    print('Traing model and test it with leave-one-out cross-validation ...')
    #-- directory config
    db_dir = r'/nfs/t2/BAA/SSR'
    base_dir = r'/nfs/h1/workingshop/huanglijie/autoroi'
    doc_dir = os.path.join(base_dir, 'doc')
    data_dir = os.path.join(base_dir, 'ma_202', 'l_sts')
    #-- load session ID list for training
    sessid_file = os.path.join(doc_dir, 'sessid')
    # FIX: context manager closes the file handle (original leaked it).
    with open(sessid_file) as f:
        sessid = [line.strip() for line in f]
    #-- parameter config
    class_label = [8, 10, 12]
    atlas_num = [40]
    #atlas_num = [1, 5] + range(10, 201, 10)
    #atlas_num = range(1, 10)
    #atlas_num = range(1, 201)
    #-- preparation for model training
    # get zstat and label file for training dataset
    zstat_file_list = get_zstat_list(sessid, db_dir)
    label_file_list = get_label_list(sessid, db_dir)
    model.prepare(sessid, zstat_file_list, label_file_list, class_label,
                  data_dir)
    #-- model training
    forest_list, classes_list, spatial_ptn = model.train(sessid, data_dir)
    dice = model.leave_one_out_test(sessid, atlas_num, data_dir, class_label,
                                    forest_list, classes_list, spatial_ptn,
                                    save_nifti=True)
    #-- save dice to a file
    model.save_dice(dice, data_dir)
def model_testing_with_LOOCV_random():
    """ Training a model with random atlas selection, and test it using
    leave-one-out cross-validation. """
    # print() call form for Python 3 compatibility.
    print('Traing model with random atlas selection and test it with ' +
          'leave-one-out cross-validation ...')
    #-- directory config
    db_dir = r'/nfs/t2/atlas/database'
    base_dir = r'/nfs/h1/workingshop/huanglijie/autoroi'
    doc_dir = os.path.join(base_dir, 'doc')
    data_dir = os.path.join(base_dir, 'code_test')
    #-- load session ID list for training
    sessid_file = os.path.join(doc_dir, 'sessid')
    # FIX: context manager closes the file handle (original leaked it).
    with open(sessid_file) as f:
        sessid = [line.strip() for line in f]
    #-- parameter config
    class_label = [1, 3]
    #atlas_num = [50]
    # FIX: list(range(...)) — under Python 3 a list cannot be concatenated
    # with a range object (the original py2 form would raise TypeError).
    atlas_num = [1, 5] + list(range(10, 201, 10))
    #atlas_num = range(1, 201)
    iter_num = 50
    for i in range(iter_num):
        print('Iter - %s' % (i))
        start_time = time.time()
        # model training and testing
        forest_list, classes_list, spatial_ptn = model.train(sessid, data_dir)
        dice = model.leave_one_out_test(sessid, atlas_num, data_dir,
                                        class_label, forest_list,
                                        classes_list, spatial_ptn,
                                        sorted=False)
        print('Cost %s' % (time.time() - start_time))
        # save dice to a file
        model.save_dice(dice, data_dir)
args = parser.parse_args()

# Allow TF the full GPU memory fraction and register the session with Keras.
tfconfig = tf.ConfigProto()
tfconfig.gpu_options.per_process_gpu_memory_fraction = 1
set_session(tf.Session(config=tfconfig))

if args.mode == 'train':
    # FIX: print() call form for Python 3 compatibility (the file already
    # uses it elsewhere).
    print("parsing input data")
    data = Data()
    print("loaded training data")
    print("initiating model construction")
    model = get_model(data)
    keras.utils.plot_model(model, to_file='model.png')
    train(model, data)
elif args.mode == 'predict':
    data = Data()
    model = get_model(data)
    weights = args.model_file
    # FIX: identity test for None (`is not None`) instead of `!= None`.
    if weights is not None and weights.strip() != "":
        model.load_weights(weights)
        print("model loaded")
    else:
        print("No model to load")
    with open('test.csv') as f, open('submission.csv', 'w') as of:
        reader = csv.reader(f)
        writer = csv.writer(of)
        records = [r for r in reader]
def train():
    """One training epoch, splitting each batch into small sub-batches.

    Each full batch is processed as batch_size/small_batch_size slices, each
    with its own hidden state, accumulating gradients before a single
    optimizer step.
    """
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'
    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [model.init_hidden(args.small_batch_size)
              for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long
        # sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)
        optimizer.zero_grad()
        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data = data[:, start:end]
            cur_targets = targets[:, start:end].contiguous().view(-1)
            # Starting each batch, we detach the hidden state from how it was
            # previously produced. If we didn't, the model would try
            # backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])
            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs, prior = parallel_model(
                cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(
                log_prob.view(-1, log_prob.size(2)), cur_targets)
            loss = raw_loss
            # Activation Regularization
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
            # regularize for prior (coefficient of variation across the batch)
            prior_sum = prior.sum(0)
            cv = (prior_sum.var() * (prior_sum.size(1) - 1)).sqrt() / prior_sum.mean()
            loss = loss + sum(args.var * cv * cv)
            # Scale so the accumulated gradient matches a full-batch step.
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()
            s_id += 1
            start = end
            end = start + args.small_batch_size
            gc.collect()
        # FIX: clip_grad_norm_ (trailing underscore) is the non-deprecated,
        # in-place spelling; it helps prevent exploding gradients in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            # FIX: .item() — indexing a 0-dim tensor with [0] (the old
            # total_loss[0]) fails on modern PyTorch.
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
def _to_long_tensor(frame):
    # First (only) column of each row, coerced to numeric, as a LongTensor.
    return torch.tensor([pd.to_numeric(row[0]) for row in frame.values]).to(torch.long)

train_dataset = TensorDataset(_to_long_tensor(X_train), _to_long_tensor(Y_train))
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=16)

valid_dataset = TensorDataset(_to_long_tensor(X_valid), _to_long_tensor(Y_valid))
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=8)
print('Transformed data')

from model import ProLSTM
netLSTM = ProLSTM(batch_size=BATCH_SIZE)
print('Created model')

from model import train
netLSTM = train(netLSTM, train_loader, valid_loader, epochs=EPOCHS)

# Persist the trained weights.
with open('./pretrained/netProLSTM_30Epochs', 'wb') as f:
    torch.save(netLSTM.state_dict(), f)
def train(train_loader, val_loader, model, discriminator, criterion_L1,
          criterion_MSE, criterion_berHu, criterion_GAN, optimizer,
          optimizer_D, epoch, batch_size):
    """One epoch of adversarial depth training (generator + patch discriminator).

    NOTE(review): relies on module-level names `num_batches`, `criterion_SI`,
    `set_requires_grad`, `save_image` and `logger` — confirm they are defined
    in this module.
    """
    model.train()  # switch to train mode
    # FIX: removed the unused locals eval_mode/init_lr and a dead no-op
    # self-assignment (`input, target = input, target`); renamed input/target
    # to avoid shadowing the input() builtin.
    valid_T = torch.ones(batch_size, 1).cuda().double()
    zeros_T = torch.zeros(batch_size, 1).cuda().double()
    for iter_ in range(num_batches):
        inputs, targets = next(train_loader.get_one_batch(batch_size))
        inputs, targets = inputs.cuda(), targets.cuda()
        torch.cuda.synchronize()

        # ---- Generator update -------------------------------------------
        optimizer.zero_grad()
        pred = model(inputs)
        loss_L1 = criterion_L1(pred, targets)
        loss_MSE = criterion_MSE(pred, targets)
        loss_berHu = criterion_berHu(pred, targets)
        loss_SI = criterion_SI(pred, targets)
        set_requires_grad(discriminator, False)
        loss_adv = 0
        # Score a 12x16 grid of 19x19 patches with the discriminator.
        for a in range(12):
            for b in range(16):
                row = 19 * a
                col = 19 * b
                patch_fake = pred[:, :, row:row + 19, col:col + 19]
                pred_fake = discriminator(patch_fake)
                loss_adv += criterion_GAN(pred_fake, valid_T)
        loss_gen = loss_SI + 0.5 * loss_adv
        loss_gen.backward()
        optimizer.step()

        # ---- Discriminator update ---------------------------------------
        set_requires_grad(discriminator, True)
        optimizer_D.zero_grad()
        loss_D = 0
        for a in range(12):
            for b in range(16):
                row = 19 * a
                col = 19 * b
                patch_fake = pred[:, :, row:row + 19, col:col + 19]
                patch_real = targets[:, :, row:row + 19, col:col + 19]
                pred_fake = discriminator(patch_fake.detach())
                pred_real = discriminator(patch_real)
                loss_D_fake = criterion_GAN(pred_fake, zeros_T)
                loss_D_real = criterion_GAN(pred_real, valid_T)
                loss_D += 0.5 * (loss_D_fake + loss_D_real)
        loss_D.backward()
        optimizer_D.step()
        torch.cuda.synchronize()

        if (iter_ + 1) % 10 == 0:
            save_image(model, inputs, targets, iter_)
            logger.add_scalar('L1', loss_L1.item())
            logger.add_scalar('MSE', loss_MSE.item())
            logger.add_scalar('berHu', loss_berHu.item())
            logger.add_scalar('SI', loss_SI.item())
            print(
                'Train Epoch: {} Batch: [{}/{}], SI: {:0.4f}, ADV:{:0.3f} L1 ={:0.3f}, MSE={:0.3f}, berHu={:0.3f}, Disc:{:0.3f}'
                .format(epoch, iter_ + 1, num_batches, loss_SI.item(),
                        loss_adv.item(), loss_L1.item(), loss_MSE.item(),
                        loss_berHu.item(), loss_D.item()))
def train_model(model, device, dtype, batch_size, loss_func, optimizer, A,
                train_loader, epoch, top_k, train_display_step):
    """Train one epoch of the basket-prediction model.

    Iterates train_loader, optimizing loss_func (WBCE) and accumulating
    running loss/recall/precision/F1 averages; prints progress every
    train_display_step batches. Returns the epoch-average
    (loss, recall, precision, f1).

    NOTE(review): `A` looks like a fixed model input (e.g. an adjacency or
    correlation matrix) passed through to every forward call — confirm.
    """
    running_train_loss = 0.0
    running_train_recall = 0.0
    running_train_prec = 0.0
    running_train_f1 = 0.0
    # device = model.device
    # dtype = model.dtype
    # Number of full batches; add one more when the dataset size is not an
    # exact multiple of batch_size (partial final batch).
    nb_train_batch = len(train_loader.dataset) // batch_size
    if len(train_loader.dataset) % batch_size == 0:
        total_train_batch = nb_train_batch
    else:
        total_train_batch = nb_train_batch + 1
    model.train()
    start = time.time()
    for i, data in enumerate(train_loader, 0):
        user_seq, train_seq_len, target_basket = data
        # user_seq arrives sparse; densify and move to the training device.
        x_train_batch = user_seq.to_dense().to(dtype=dtype, device=device)
        # Actual batch size (the last batch may be smaller).
        real_batch_size = x_train_batch.size()[0]
        # hidden = model.init_hidden(real_batch_size)
        target_basket_train = target_basket.to(device=device, dtype=dtype)
        optimizer.zero_grad()  # clear gradients for this training step
        predict = model(A, train_seq_len, x_train_batch)  # predicted output
        loss = loss_func(predict, target_basket_train)  # WBCE loss
        loss.backward()  # backpropagation, compute gradients
        optimizer.step()  # update gradient
        train_loss_item = loss.item()
        running_train_loss += train_loss_item
        avg_train_loss = running_train_loss / (i + 1)
        train_recall_item, train_prec_item, train_f1_item = utils.compute_recall_at_top_k(
            model, predict.detach(), top_k, target_basket_train.detach(),
            real_batch_size)
        running_train_recall += train_recall_item
        running_train_prec += train_prec_item
        running_train_f1 += train_f1_item
        avg_train_recall = running_train_recall / (i + 1)
        avg_train_prec = running_train_prec / (i + 1)
        avg_train_f1 = running_train_f1 / (i + 1)
        end = time.time()
        if ((i + 1) % train_display_step == 0 or (i + 1) == total_train_batch):
            # print every 50 mini-batches
            top_pred = predict.clone().detach().topk(dim=-1, k=top_k, sorted=True)
            print(
                '[Epoch : % d ,Batch Index : %d / %d] Train Loss : %.8f ----- Train Recall@%d: %.8f / Train Precision: %.8f / Train F1: %.8f ----- Time : %.3f seconds '
                % (epoch, i + 1, total_train_batch,
                   avg_train_loss, top_k, avg_train_recall, avg_train_prec,
                   avg_train_f1, end - start))
            print("top k indices predict: ")
            print(
                '--------------------------------------------------------------'
            )
            print('***** indices *****')
            print(top_pred.indices)
            print('***** values *****')
            print(top_pred.values)
            print(
                '--------------------------------------------------------------'
            )
            start = time.time()
    torch.cuda.empty_cache()
    print('finish a train epoch')
    return avg_train_loss, avg_train_recall, avg_train_prec, avg_train_f1
def train(model, train_dataset, dev_dataset, max_epochs=100,
          model_name='model.save', stopping_counter=20):
    """Train with Adam + cross-entropy, early-stopping on dev loss.

    Saves the best (lowest dev-loss) state_dict to ``model_name``. Returns
    (train_losses, train_accs, dev_losses, dev_accs), one entry per epoch.
    """
    losses = []
    accs = []
    dev_losses = []
    dev_accs = []
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = torch.nn.CrossEntropyLoss()
    best_loss = float('+inf')
    # NOTE: alias, not a copy — the best weights are preserved on disk via
    # the torch.save below, not in this reference.
    best_model = model
    counter = 0
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # FIX: renamed from `train`, which shadowed this function's own name.
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256,
                                               num_workers=2, shuffle=True)
    model.train()
    for epoch in range(max_epochs):
        print("-" * 10, "EPOCH ", epoch, "-" * 10)
        num_correct = 0.0
        total_count = 0.0
        start = time.time()
        epoch_loss = 0.0
        for i, batch in enumerate(train_loader):
            # BUG FIX: `i + 1 % 100 == 0` parsed as `i + (1 % 100) == 0`,
            # which is never true; parenthesize to log every 100 batches.
            if (i + 1) % 100 == 0:
                print('Batch ', i)
            data, labels, mask = batch
            data = data.to(device)
            labels = labels.to(device)
            mask = mask.to(device)
            optimizer.zero_grad()
            output = model(data, mask)
            loss = loss_fn(output, labels)
            loss.backward()
            optimizer.step()
            pred = torch.argmax(output, 1)
            num_correct += (pred == labels).sum().item()
            total_count += pred.size(0)
            epoch_loss += loss.item()
        losses.append(epoch_loss / (i + 1))
        accs.append(num_correct / total_count)
        end = time.time()
        counter += 1  # for early stopping
        eval_acc, eval_loss = validate(model, dev_dataset)
        print(
            f'Loss={losses[-1]}, Accuracy={accs[-1]}, Dev Accuracy={eval_acc}, epoch took {end - start}s'
        )
        dev_losses.append(eval_loss)
        dev_accs.append(eval_acc)
        if eval_loss < best_loss:
            best_loss = eval_loss
            best_model = model
            counter = 0
            print("Saving new best model...")
            torch.save(best_model.state_dict(), model_name)
        if counter == stopping_counter:
            return losses, accs, dev_losses, dev_accs
    return losses, accs, dev_losses, dev_accs
def main(hps):
    """Distributed (Horovod) training entry point.

    Builds the model and data iterators, then alternates training epochs with
    periodic validation, best-loss checkpointing and sample drawing. Only
    rank 0 prints/saves.
    """
    # Initialize Horovod.
    hvd.init()
    # Create tensorflow session
    sess = tensorflow_session()
    # Download and load dataset.
    # Per-rank seeding so each worker draws a distinct random stream.
    tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
    np.random.seed(hvd.rank() + hvd.size() * hps.seed)
    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)
    # Create log dir
    logdirs, _print = zeus.get_logdirs(['', '_ckpt'])
    # Create model
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)
    # Initialize visualization functions
    draw_samples = init_visualizations(hps, model, logdirs)
    if hvd.rank() == 0:
        _print(hps)
        _print('Starting training. Logging to', logdirs[0])
        _print(
            'epoch n_processed n_images pps dtrain dtest dsample dtot train_results test_results msg')
    # Train
    sess.graph.finalize()
    n_processed = 0
    n_images = 0
    train_time = 0.0
    test_loss_best = 999999
    tcurr = time.time()
    for epoch in range(1, hps.epochs):
        t0 = time.time()
        train_results = []
        for it in range(hps.train_its):
            # Set learning rate, linearly annealed from 0 in the first
            # hps.epochs_warmup epochs.
            lr = hps.lr * min(1., n_processed /
                              (hps.n_train * hps.epochs_warmup))
            # Run a training step synchronously.
            _t0 = time.time()
            train_results += [model.train(lr)]
            if hps.verbose and hvd.rank() == 0:
                _print(n_processed, time.time() - _t0, train_results[-1])
                sys.stdout.flush()
            # Images seen wrt anchor resolution
            n_processed += hvd.size() * hps.n_batch_train
            # Actual images seen at current resolution
            n_images += hvd.size() * hps.local_batch_train
        train_results = np.mean(np.asarray(train_results), axis=0)
        dt = time.time() - t0
        train_time += dt
        # Validate/sample more often early on, then every epochs_full_valid.
        if epoch < 10 or (epoch < 50 and epoch % 10 == 0) or epoch % hps.epochs_full_valid == 0:
            test_results = []
            msg = ''
            t0 = time.time()
            # model.polyak_swap()
            if epoch % hps.epochs_full_valid == 0:
                # Full validation run
                for it in range(hps.full_test_its):
                    test_results += [model.test()]
                test_results = np.mean(np.asarray(test_results), axis=0)
                if hvd.rank() == 0:
                    # Checkpoint whenever the tracked test loss improves.
                    if test_results[0] < test_loss_best:
                        test_loss_best = test_results[0]
                        model.save(logdirs[1] + "model_best_loss.ckpt")
                        msg += ' *'
            dtest = time.time() - t0
            # Full sample uses all machines, 1 sample per machine
            t0 = time.time()
            if epoch == 1 or epoch == 10 or epoch % hps.epochs_full_sample == 0:
                draw_samples(epoch)
            dfullsample = time.time() - t0
            if hvd.rank() == 0:
                dcurr = time.time() - tcurr
                tcurr = time.time()
                _print(epoch, n_processed, n_images,
                       "{:.1f} {:.1f} {:.1f} {:.1f}".format(
                           dt, dtest, dfullsample, dcurr),
                       train_results, test_results, msg, np_precision=4)
        # model.polyak_swap()
    if hvd.rank() == 0:
        _print("Finished!")
if __name__ == '__main__':
    # FIX: dropped the C-style trailing semicolons (non-idiomatic Python).
    warnings.filterwarnings("ignore")
    label_train, train_bids, test_bids, test_bidders_ids_without_bids = read(
        "./data/train.csv", "./data/test.csv", "./data/bids.csv")
    print("Training Set Features")
    train_bidder_features, feature_names = computeFeatures(train_bids)
    del train_bids
    train_X = []
    train_Y = []
    for key in train_bidder_features.keys():
        train_X.append(train_bidder_features[key])
        train_Y.append(label_train[key])
    best_model, imputer, one_hot_encoder = model.train(train_X, train_Y,
                                                       feature_names)
    del train_bidder_features
    print("Test Set Features")
    test_bidder_features, feature_names = computeFeatures(test_bids)
    del test_bids
    test_X = []
    test_ids = []
    for key in test_bidder_features.keys():
        test_ids.append(key)
        test_X.append(test_bidder_features[key])
    model.predict_and_write(best_model, test_X, test_ids,
                            test_bidders_ids_without_bids, imputer,
                            one_hot_encoder)
def train(sess, model, hps, logdir, visualise):
    """Run the full Horovod-distributed training loop.

    Args:
        sess: tf.Session; its graph is finalized before the loop starts.
        model: object exposing train(lr), test() and save(path).
        hps: hyper-parameter namespace (lr, epochs, train_its, n_train,
            epochs_warmup, n_batch_train, local_batch_train, ...).
        logdir: log/checkpoint directory; concatenated directly with file
            names, so it presumably ends with '/' — confirm at call site.
        visualise: callable(epoch) that draws/saves samples.
    """
    _print(hps)
    _print('Starting training. Logging to', logdir)
    _print('epoch n_processed n_images ips dtrain dtest dsample dtot train_results test_results msg')

    # Train
    sess.graph.finalize()
    n_processed = 0     # images seen, counted at the anchor resolution
    n_images = 0        # images actually seen at the current resolution
    train_time = 0.0
    test_loss_best = 999999  # sentinel; first full validation always improves on it

    # Only rank 0 writes result logs (all ranks train).
    if hvd.rank() == 0:
        train_logger = ResultLogger(logdir + "train.txt", **hps.__dict__)
        test_logger = ResultLogger(logdir + "test.txt", **hps.__dict__)

    tcurr = time.time()
    for epoch in range(1, hps.epochs):
        t = time.time()

        train_results = []
        for it in range(hps.train_its):
            # Set learning rate, linearly annealed from 0 in the first hps.epochs_warmup epochs.
            lr = hps.lr * min(1., n_processed /
                              (hps.n_train * hps.epochs_warmup))

            # Run a training step synchronously.
            _t = time.time()
            train_results += [model.train(lr)]
            if hps.verbose and hvd.rank() == 0:
                _print(n_processed, time.time()-_t, train_results[-1])
                sys.stdout.flush()

            # Images seen wrt anchor resolution
            n_processed += hvd.size() * hps.n_batch_train
            # Actual images seen at current resolution
            n_images += hvd.size() * hps.local_batch_train

        train_results = np.mean(np.asarray(train_results), axis=0)

        dtrain = time.time() - t
        ips = (hps.train_its * hvd.size() * hps.local_batch_train) / dtrain
        train_time += dtrain

        if hvd.rank() == 0:
            train_logger.log(epoch=epoch, n_processed=n_processed,
                             n_images=n_images, train_time=int(train_time),
                             **process_results(train_results))

        # Validate/sample: every epoch early on, then on the configured cadence.
        if epoch < 10 or (epoch < 50 and epoch % 10 == 0) or epoch % hps.epochs_full_valid == 0:
            test_results = []
            msg = ''
            t = time.time()
            # model.polyak_swap()
            if epoch % hps.epochs_full_valid == 0:
                # Full validation run
                for it in range(hps.full_test_its):
                    test_results += [model.test()]
                test_results = np.mean(np.asarray(test_results), axis=0)

                if hvd.rank() == 0:
                    test_logger.log(epoch=epoch, n_processed=n_processed,
                                    n_images=n_images, **process_results(test_results))
                    # Save checkpoint whenever test_results[0] (the tracked loss) improves.
                    if test_results[0] < test_loss_best:
                        test_loss_best = test_results[0]
                        model.save(logdir+"model_best_loss.ckpt")
                        msg += ' *'
            dtest = time.time() - t

            # Sample
            t = time.time()
            if epoch == 1 or epoch == 10 or epoch % hps.epochs_full_sample == 0:
                visualise(epoch)
            dsample = time.time() - t

            if hvd.rank() == 0:
                dcurr = time.time() - tcurr
                tcurr = time.time()
                _print(epoch, n_processed, n_images,
                       "{:.1f} {:.1f} {:.1f} {:.1f} {:.1f}".format(
                           ips, dtrain, dtest, dsample, dcurr),
                       train_results, test_results, msg)
            # model.polyak_swap()

    if hvd.rank() == 0:
        _print("Finished!")
def train(is_finetune=False):
    """ Train model a number of steps.

    When is_finetune is True, restores weights from FLAGS.finetune_dir and
    resumes the step counter from the number embedded in that path.
    """
    # should be changed if model is stored by different convention
    startstep = 0 if not is_finetune else int(
        FLAGS.finetune_dir.split('-')[-1])

    image_filenames, label_filenames = Inputs.get_filename_list(
        FLAGS.image_dir)
    val_image_filenames, val_label_filenames = Inputs.get_filename_list(
        FLAGS.val_dir)

    with tf.Graph().as_default():
        #Probablitity that the neuron's output will be kept during dropout
        keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")
        global_step = tf.Variable(0, trainable=False)

        #Make images into correct type(float32/float16 el.), create shuffeled batches ++
        images, labels = Inputs.datasetInputs(image_filenames,
                                              label_filenames,
                                              FLAGS.batch_size)
        val_images, val_labels = Inputs.datasetInputs(val_image_filenames,
                                                      val_label_filenames,
                                                      FLAGS.batch_size)

        # Feed placeholders: NHWC image batch and per-pixel integer labels.
        train_data_node = tf.placeholder(
            tf.float32,
            shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 3])
        train_labels_node = tf.placeholder(
            tf.int64,
            shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 1])
        phase_train = tf.placeholder(tf.bool, name='phase_train')

        # Build a Graph that computes the logits predictions from the inference model.
        logits = model.inference(
            train_data_node, phase_train, FLAGS.batch_size,
            keep_probability)  #tensor, nothing calculated yet

        #Calculate loss:
        loss = model.cal_loss(logits, train_labels_node)

        # Build a Graph that trains the model with one batch of examples and updates the model parameters.
        train_op = model.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        #Using a context manager - it will release resources for session when no longer required.
        #Defining session like this means you do not have to explicitly close the session.
        with tf.Session() as sess:
            if (is_finetune == True):
                saver.restore(sess, FLAGS.finetune_dir)
            else:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
            print("Started session run")

            # Start the queue runners.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

            # Placeholders used only to feed scalar validation metrics into summaries.
            average_pl = tf.placeholder(tf.float32)
            acc_pl = tf.placeholder(tf.float32)
            iu_pl = tf.placeholder(tf.float32)
            average_summary = tf.summary.scalar("test_average_loss", average_pl)
            acc_summary = tf.summary.scalar("test_accuracy", acc_pl)
            iu_summary = tf.summary.scalar("Mean_IU", iu_pl)

            """ Starting iterations to train the network """
            for step in range(startstep, startstep + FLAGS.max_steps):
                image_batch, label_batch = sess.run(fetches=[images, labels])
                # since we still use mini-batches in eval, still set bn-layer phase_train = True
                feed_dict = {
                    train_data_node: image_batch,
                    train_labels_node: label_batch,
                    phase_train: True,
                    keep_probability: 0.5
                }
                # storeImageQueue(image_batch, label_batch, step)
                start_time = time.time()

                _, loss_value = sess.run(fetches=[train_op, loss],
                                         feed_dict=feed_dict)
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value
                ), 'Model diverged with loss = NaN - weights have "exploded"'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)
                    print('\n--- Normal training ---')
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                    # eval current training batch pre-class accuracy
                    pred = sess.run(fetches=logits, feed_dict=feed_dict)
                    print("\n -- conv in classifier!")
                    Utils.per_class_acc(pred, label_batch)

                if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
                    """ Validate training by running validation dataset """
                    print(" \n --- Starting testing by running validation dataset ---")
                    total_val_loss = 0.0
                    hist = np.zeros((FLAGS.num_class, FLAGS.num_class))
                    for test_step in range(TEST_ITER):
                        val_images_batch, val_labels_batch = sess.run(
                            fetches=[val_images, val_labels])
                        feed_dict = {
                            train_data_node: val_images_batch,
                            train_labels_node: val_labels_batch,
                            phase_train: True,
                            keep_probability: 1.0  #During validation training no dropout layers
                        }
                        _val_loss, _val_pred = sess.run(fetches=[loss, logits],
                                                        feed_dict=feed_dict)
                        total_val_loss += _val_loss
                        hist += Utils.get_hist(_val_pred, val_labels_batch)
                    PREV_VAL_LOSS = total_val_loss / TEST_ITER
                    print(
                        "loss for validation dataset: ",
                        total_val_loss / TEST_ITER,
                        ". If this value increases the model is likely overfitting."
                    )  #Prev value was: ", PREV_VAL_LOSS)
                    writeSummaries(sess, hist, average_summary, average_pl,
                                   total_val_loss, acc_summary, iu_summary,
                                   acc_pl, iu_pl, summary_op, step, feed_dict,
                                   summary_writer)

                # Save the model checkpoint periodically.
                # NOTE(review): `step % 1000 == 0` is redundant — every multiple
                # of 1000 is also a multiple of 500; effectively saves every 500 steps.
                if step % 1000 == 0 or step % 500 == 0 or (
                        step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.log_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

            coord.request_stop()
            coord.join(threads)
def train():
    """Train the two-stream model, periodically measuring training accuracy.

    Every constants.TRAIN_ACCURACY_FREQUENCY global steps the train op is
    swapped (via tf.cond) for a variant that also computes in_top_k accuracy
    on the current batch and writes it to a separate summary writer.
    """
    with tf.Graph().as_default() as g:
        global_step = tf.contrib.framework.get_or_create_global_step()

        images1, images2, labels = model.inputs(eval_data=False)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = model.inference(images1, images2)

        # Calculate loss.
        loss = model.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_no_accuracy_op = model.train(loss, global_step)

        def train_accuracy_op():
            # Calculate predictions.
            # Control dependency ensures the training step still runs before
            # the accuracy is computed.
            with tf.control_dependencies([train_no_accuracy_op]):
                return (True, tf.nn.in_top_k(logits, labels, 1))

        def cond_train_accuracy():
            # True on every TRAIN_ACCURACY_FREQUENCY-th step after step 0.
            return tf.logical_and(
                tf.greater(global_step, 0),
                tf.equal(
                    tf.truncatemod(global_step,
                                   constants.TRAIN_ACCURACY_FREQUENCY), 0))

        # train_op yields (computed_accuracy: bool, results) — results is the
        # in_top_k vector when accuracy was computed, else the plain train op.
        train_op = tf.cond(cond_train_accuracy(), train_accuracy_op,
                           lambda: (False, train_no_accuracy_op))

        class _LoggerHook(tf.train.SessionRunHook):
            """ Logs loss and runtime. """

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %4d, loss = %2.2f (%3.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        summary_train_acc_writer = tf.summary.FileWriter(FLAGS.train_dir +
                                                         '/train_accuracy')

        kwargs = {
            'checkpoint_dir': FLAGS.train_dir,
            'hooks': [
                tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                tf.train.NanTensorHook(loss),
                _LoggerHook()
            ],
            'config': tf.ConfigProto(log_device_placement=FLAGS.log_device_placement),
        }
        with tf.train.MonitoredTrainingSession(**kwargs) as mon_sess:
            while not mon_sess.should_stop():
                accuracy, results = mon_sess.run(train_op)
                if accuracy:
                    true_count = np.sum(results)
                    accuracy = true_count / constants.BATCH_SIZE
                    format_str = ('%s: Training Accuracy = %.3f')
                    print(format_str % (datetime.now(), accuracy))
                    summary = tf.Summary()
                    summary.value.add(tag='train_accuracy',
                                      simple_value=accuracy)
                    summary_train_acc_writer.add_summary(
                        summary, mon_sess.run(global_step))
# NOTE(review): `argparse` and `pickle` were used below without being
# imported anywhere in this snippet — added the missing stdlib imports.
import argparse
import pickle

import numpy as np

from model import train

if __name__ == '__main__':
    # Command-line configuration for the random-walk based GNN trainer.
    parser = argparse.ArgumentParser()
    parser.add_argument('--graph_file_path', type=str)
    parser.add_argument('--random-walk-length', type=int, default=2)
    parser.add_argument('--random-walk-restart-prob', type=float, default=0.5)
    parser.add_argument('--num-random-walks', type=int, default=10)
    parser.add_argument('--num-neighbors', type=int, default=3)
    parser.add_argument('--num-layers', type=int, default=2)
    parser.add_argument('--hidden-dims', type=int, default=100)
    parser.add_argument('--batch-size', type=int, default=256)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--num-epochs', type=int, default=3)
    parser.add_argument('--batches-per-epoch', type=int, default=4000)
    parser.add_argument('--num-workers', type=int, default=0)
    parser.add_argument('--lr', type=float, default=3e-5)
    parser.add_argument('-k', type=int, default=10)
    args = parser.parse_args()

    # Load dataset (a pickled graph dataset; trusted local file)
    with open(args.graph_file_path, 'rb') as f:
        dataset = pickle.load(f)

    # train() returns the learned item embeddings (a tensor with .numpy()).
    h_item = train(dataset, args)

    # Write h_item numpy file
    np.savez("h_items.npz", movie_vectors=h_item.numpy())
neighbourhood_group_list) neighbourhood_list = sorted(df[df['neighbourhood_group'] == neighbourhood_group]['neighbourhood'].unique()) neighbourhood = st.selectbox('Neighbourhood', neighbourhood_list) room_type_list = sorted( df[df['neighbourhood'] == neighbourhood]['room_type'].unique()) room_type = st.selectbox('Room type', room_type_list) minimum_nights_list = sorted(df['minimum_nights'].unique()) minimum_nights = st.selectbox('Minimum nights', minimum_nights_list) if st.button("Submit"): if neighbourhood_group == None or neighbourhood == None or room_type == None or minimum_nights == None: st.write("Fill all the options") r = requests.post(url + endpoint, json={ 'param1': neighbourhood_group, 'param2': neighbourhood, 'param3': room_type, 'param4': int(minimum_nights) }) train() prediction = predict(neighbourhood_group, neighbourhood, room_type, minimum_nights) st.info(f"**{prediction}€** per night should be charged")
def main():
    """Build a tf.estimator.Estimator from CLI args and run train / train+eval.

    Side effects: deletes stale model/export directories, sets
    CUDA_VISIBLE_DEVICES, prints the run configuration, and exports the
    trained model at the end.
    """
    # Clean up the model directory if not keep training
    if not args.keep_train:
        shutil.rmtree(args.model_dir, ignore_errors=True)
        print('Remove model directory: {}'.format(args.model_dir))

    # Expire exported models older than one week (directory names are epoch
    # timestamps — the int() on the basename relies on that convention).
    if not config.LOCAL_TRAIN:
        rsync_model_files = glob.glob(config.RSYNC_MODEL_DIR + "/*")
        print("export models we have :", rsync_model_files)
        for model_file in rsync_model_files:
            model_time = int(model_file.split('/')[-1])
            if model_time < int(time.time()) - 60 * 60 * 24 * 7:
                print("delete :", model_file)
                shutil.rmtree(model_file, ignore_errors=True)

    # Clean up the model export directory
    shutil.rmtree(args.model_export_dir, ignore_errors=True)

    # Set Which GPU to use or do not use gpu ('-1' means CPU-only)
    if args.gpu_num == '-1':
        session_config = tf.ConfigProto(device_count={'CPU': args.num_threads})
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_num
        session_config = tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True),
            device_count={'CPU': args.num_threads})

    # Set Model Params
    model_params = {
        'learning_rate': args.learning_rate,
        'layer_size_list': list(map(lambda x: int(x), args.layer_size_list.split(","))),
        'regular_rate': args.regular_rate,
        'dropout': args.dropout,
        'opt_algo': args.opt_algo
    }

    estimator_config = tf.estimator.RunConfig().replace(
        session_config=session_config,
        log_step_count_steps=args.log_steps,
        save_summary_steps=args.log_steps)
    model = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir=args.model_dir,
                                   params=model_params,
                                   config=estimator_config)

    # Echo the effective configuration for the run log.
    print("\n=========================================")
    print("train type : ", args.train_type)
    for embedding in config.FILE_EMBEDDING_COLUMNS:
        print("{} embedding size : {}".format(embedding[0], embedding[-1]))
    print("train at GPU : ", args.gpu_num)
    print("learning rate : ", args.learning_rate)
    print("optimize algorithm : ", args.opt_algo)
    print("batch size : ", args.batch_size)
    print("epochs : ", args.train_epochs)
    print("layer size list : ", args.layer_size_list)
    print("regular rate : ", args.regular_rate)
    print("dropout rate : ", args.dropout)
    print("=========================================")
    print("model saved at : ", config.MODEL_EXPORT_DIR)
    print("=========================================\n")

    # Dispatch to the requested training mode, then export the final model.
    if args.train_type == 'train_and_eval':
        train_and_eval(model=model,
                       train_data=args.train_data,
                       eval_data=args.eval_data,
                       train_epochs=args.train_epochs,
                       batch_size=args.batch_size,
                       epochs_per_eval=args.epochs_per_eval)
    else:
        train(model=model,
              train_data=args.train_data,
              train_epochs=args.train_epochs,
              batch_size=args.batch_size)

    export_model(model, args.model_export_dir)
def train():
    """Run one epoch over `train_data` with variable-length BPTT.

    Uses module-level state: model, optimizer, criterion, corpus, args,
    train_data, epoch, params. Sequence length is sampled per step and the
    learning rate is scaled proportionally so each token contributes equally.
    """
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        # 5% of the time train on half-length windows for robustness.
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        # Rescale lr for this step so shorter/longer windows are weighted fairly;
        # restored to lr2 after optimizer.step().
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, weight, bias, hidden, rnn_hs, dropped_rnn_hs = model(
            data, hidden, return_h=True, train=True)
        raw_loss = criterion(weight, bias, output, targets)

        loss = raw_loss
        # Activiation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        # Log the unregularized loss only.
        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                    epoch, batch, len(train_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
def train():
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    # One epoch over train_data; each full batch is processed as
    # batch_size // small_batch_size sub-batches to limit peak memory, with
    # one hidden-state per sub-batch and gradients accumulated before step().
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [model.init_hidden(args.small_batch_size)
              for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        # 5% of the time train on half-length windows.
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)

        # Scale lr by the sampled window length; restored after step().
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            # NOTE(review): slicing on dim 1 assumes get_batch returns
            # (seq_len, batch)-shaped tensors — confirm against get_batch.
            cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(
                cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(
                log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activiation Regularization
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
            # Weight each sub-batch so the accumulated gradient matches a
            # single full-batch backward pass.
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt,
                optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss,
                math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
FEATURES = 6

# Integer encodings for the two categorical columns (building type, district).
bldg_type = {'кирпичный': 1, 'панельный': 2, 'блочный': 3, 'монолитный': 4, 'деревянный': 5}
district = {'Ленинский': 1, 'Октябрьский': 2, 'Пролетарский': 3}

X = []
Y = []
df = pd.read_csv('dataset.csv')
for row in df.values:
    feats = np.copy(row)
    feats[0] = 1                       # bias term overwrites the price column
    feats[4] = bldg_type[feats[4]]     # encode building type
    feats[5] = district[feats[5]]      # encode district
    X.append(feats)
    Y.append(row[0] / 1e6)             # target price, in millions
X = np.array(X)
Y = np.array(Y)

model = model.Model(X.shape[1], 0.0001)
model.train(X, Y)
print('Цена: ' + '{:,}'.format(int(model.predict([1, 60, 2, 2, bldg_type['кирпичный'], district['Октябрьский']]) * 1e6)))
def train():
    """Train aPascal for a number of steps, resuming from a checkpoint in
    FLAGS.train_dir when one exists, and optionally warm-starting the base
    layers from FLAGS.pretrained_dir when FLAGS.finetune is set.
    """
    print('[Training Configuration]')
    print('\tTrain dir: %s' % FLAGS.train_dir)
    print('\tTraining max steps: %d' % FLAGS.max_steps)
    print('\tSteps per displaying info: %d' % FLAGS.display)
    print('\tSteps per testing: %d' % FLAGS.test_interval)
    print('\tSteps per saving checkpoints: %d' % FLAGS.checkpoint_interval)
    print('\tGPU memory fraction: %f' % FLAGS.gpu_fraction)

    with tf.Graph().as_default():
        init_step = 0
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for aPascal.
        train_images, train_labels = model.distorted_inputs('train')
        test_images, test_labels = model.inputs('eval')

        # Build a Graph that computes the predictions from the inference model.
        # Batches are fed through these placeholders for both train and test.
        images = tf.placeholder(tf.float32, [FLAGS.batch_size, model.IMAGE_WIDTH, model.IMAGE_WIDTH, 3])
        labels = tf.placeholder(tf.int32, [FLAGS.batch_size, model.NUM_ATTRS])
        probs = model.inference(images)

        # Calculate loss. (cross_entropy loss)
        loss, acc = model.loss_acc(probs, labels)
        tf.scalar_summary("accuracy", acc)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op, lr = model.train(loss, global_step)

        # Build the summary operation based on the TF collection of Summaries.
        train_summary_op = tf.merge_all_summaries()

        # Loss and accuracy summary used in test phase)
        loss_summary = tf.scalar_summary("test/loss", loss)
        acc_summary = tf.scalar_summary("test/accuracy", acc)
        test_summary_op = tf.merge_summary([loss_summary, acc_summary])

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_fraction),
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=10000)
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print('\tRestore from %s' % ckpt.model_checkpoint_path)
            # Restores from checkpoint; resume the step count from the
            # number suffix of the checkpoint path.
            saver.restore(sess, ckpt.model_checkpoint_path)
            init_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
        else:
            print('No checkpoint file found. Start from the scratch.')
            # if finetune, load variables of the final predication layers
            # from pretrained model
            if FLAGS.finetune:
                # All trainable variables except the last 2*NUM_ATTRS
                # (the per-attribute prediction weights/biases).
                base_variables = tf.trainable_variables()[:-2*model.NUM_ATTRS]
                base_saver = tf.train.Saver(base_variables, max_to_keep=10000)
                ckpt = tf.train.get_checkpoint_state(FLAGS.pretrained_dir)
                print('Initial checkpoint: ' + ckpt.model_checkpoint_path)
                base_saver.restore(sess, ckpt.model_checkpoint_path)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        if not os.path.exists(FLAGS.train_dir):
            os.mkdir(FLAGS.train_dir)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # Training!!
        for step in xrange(init_step, FLAGS.max_steps):
            start_time = time.time()
            try:
                train_images_val, train_labels_val = sess.run([train_images, train_labels])
                _, lr_value, loss_value, acc_value, train_summary_str = sess.run(
                    [train_op, lr, loss, acc, train_summary_op],
                    feed_dict={images: train_images_val, labels: train_labels_val})
            except tf.python.framework.errors.InvalidArgumentError:
                # Drop into IPython for inspection on malformed batches.
                embed()
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % FLAGS.display == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: (Training) step %d, loss=%.4f, acc=%.4f, lr=%f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print (format_str % (datetime.now(), step, loss_value, acc_value, lr_value,
                                     examples_per_sec, sec_per_batch))
                summary_writer.add_summary(train_summary_str, step)

            if step % FLAGS.test_interval == 0:
                # Evaluate one held-out batch with the same graph.
                test_images_val, test_labels_val = sess.run([test_images, test_labels])
                loss_value, acc_value, test_summary_str = sess.run(
                    [loss, acc, test_summary_op],
                    feed_dict={images: test_images_val, labels: test_labels_val})
                format_str = ('%s: (Test) step %d, loss=%.4f, acc=%.4f')
                print (format_str % (datetime.now(), step, loss_value, acc_value))
                summary_writer.add_summary(test_summary_str, step)

            # Save the model checkpoint periodically.
            if step % FLAGS.checkpoint_interval == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
# Train a slim autoencoder (model.AE) on the MovieLens-1M rating matrix.
from dataset import load_movielens_100k, load_movielens_1m, to_tensor
import model
import tensorflow as tf

slim = tf.contrib.slim
flags = tf.app.flags
FLAGS = flags.FLAGS

if __name__ == '__main__':
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO)
        # Only the training split is used here; testset is loaded but unused.
        trainset, testset = load_movielens_1m()
        X = to_tensor(trainset)
        logits, loss = model.AE(X)
        train_op = model.train(loss)
        # Run 5000 optimizer steps, writing checkpoints/summaries to trainAE.log.
        final_loss = slim.learning.train(
            train_op,
            logdir='trainAE.log',
            number_of_steps=5000,
            save_summaries_secs=5
        )
shuffle=False, collate_fn=mlp_collate) valid_set = MLPDataset(args.valid_file, args.feats_dir, args.feats_type) valid_loader = pyDataLoader(valid_set, batch_size=1, shuffle=False, collate_fn=mlp_collate) # Training and validation train(device=device, net=net, criterion=criterion, learning_rate=args.init_lr, lr_sched=args.lr_sched, num_epochs=args.num_epochs, train_loader=train_loader, train_loader_eval=train_loader_eval, valid_loader=valid_loader, icvec=icvec, ckpt_dir=ckpt_dir, logs_dir=logs_dir) # Independent test phase elif args.phase == 'test' or args.phase == 'extract': # Data loading print("\n[*] Loading test data %s." % args.test_file) if args.net_type == 'gcn' or args.net_type == 'chebcn' or args.net_type == 'gmmcn' or args.net_type == 'gincn': test_set = GraphDataset(args.test_file, args.feats_dir, args.feats_type, args.edges_type) test_loader = pygeoDataLoader(test_set, batch_size=1, shuffle=False)
def main(): # Read the data from the text files begin = time.time() vocab, train_raw, test_raw = read.read_tweets("../training_set_tweets.txt", "../test_set_tweets.txt") print "Num of Train users:", len(train_raw), "Num of Test users:", len(test_raw) print "Read data:", time.time() - begin # Preprocess the data begin = time.time() vocab, bigrams, train_word, test_word, train_char, test_char = preprocessing.preprocess(train_raw, test_raw) print "Preprocessed the data", time.time() - begin return # Assign ids to words vocab_list = list(vocab) vocab_list.sort() begin = time.time() vocab_dict = {} for i in range(len(vocab_list)): vocab_dict[vocab_list[i]] = i print "Assigned ids to words:", time.time() - begin # Build train and test set num_full_feats = len(vocab_list) + 10 num_train_tweets = 0 num_test_tweets = 0 # num_train_tweets = np.count_nonzero(~np.isnan(train)) # num_test_tweets = np.count_nonzero(~np.isnan(test)) for author_id in train: num_train_tweets += len(train[author_id]) for author_id in test: num_test_tweets += len(test[author_id]) X_train = np.zeros((num_train_tweets, num_full_feats)) y_train = np.zeros(num_train_tweets) X_test = np.zeros((num_test_tweets, num_full_feats)) y_test = np.zeros(num_test_tweets) # Build train and test set num_full_feats = len(vocab_list) + 10 num_train_tweets = 0 num_test_tweets = 0 # num_train_tweets = np.count_nonzero(~np.isnan(train)) # num_test_tweets = np.count_nonzero(~np.isnan(test)) for author_id in train_word: num_train_tweets += len(train_word[author_id]) for author_id in test_word: num_test_tweets += len(test_word[author_id]) X_train = np.zeros((num_train_tweets, num_full_feats)) y_train = np.zeros(num_train_tweets) X_test = np.zeros((num_test_tweets, num_full_feats)) y_test = np.zeros(num_test_tweets) count = 0 for author_id in train_word: for tweet in train_word[author_id]: X_train[count, :] = features.get_full_feats(tweet, vocab_dict) y_train[count] = author_id count += 1 print count count = 0 for author_id 
in test_word: for tweet in test_word[author_id]: X_test[count, :] = features.get_full_feats(tweet, vocab_dict) y_test[count] = author_id count += 1 print count begin = time.time() feats = feature_selection.select_features(X_train, y_train, np.zeros(num_full_feats), 100, "dia") X_train = X_train[:, feats] X_test = X_test[:, feats] print "Features selected:", time.time() - begin begin = time.time() clf = model.train(X_train, y_train) acc, my_acc, preds, scores = model.test(clf, X_test, y_test) print 'time:', time.time()-begin, 'acc:', acc, 'my_acc:', my_acc print 'preds:', preds print 'scores:', scores print (preds == y_test)[:100] print np.count_nonzero(scores > 0) print np.count_nonzero(scores < 0)
def train(step_number, stored_loss, lr):
    """One epoch over the source/target training data with fixed-length BPTT,
    validating and (past args.start_decaying_lr_step) halving the learning
    rate every args.update_interval steps when validation ppl fails to improve.

    Args:
        step_number: running global step counter, carried across epochs.
        stored_loss: best validation loss seen so far.
        lr: current learning rate (only used for logging / decay bookkeeping;
            the optimizer's own param_group lr is what is actually updated).
    Returns:
        (step_number, stored_loss, lr) updated for the next epoch.
    """
    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    i = 0
    while i < train_data_src.size(0) - 1 - 1:
        # Randomized window length disabled here — always args.bptt.
        bptt = args.bptt  #if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = bptt  #max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)
        model.train()
        data, prev_targets, targets = get_batch(train_data_src, train_data_trg, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, prev_targets, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)

        loss = raw_loss
        # Activiation Regularization
        loss = loss + sum(
            dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(
            (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        if step_number % args.log_interval == 0 and step_number > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time(
            ) - start_time  #timer doesnt stop while validating, so this will be wrong
            #if there was a validation call since the last log print
            logging(
                '| epoch {:3d} | step {:5d} | {:5d} steps per epoch | lr {:01.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} |input tkn/s {:7.2}'.format(
                    epoch, step_number, len(train_data_src) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss),
                    args.log_interval * args.bptt * args.batch_size / elapsed))
            total_loss = 0
            start_time = time.time()
        ###
        if step_number % args.update_interval == 0 and step_number > 0:
            # Periodic validation + checkpoint; lr decays by half when
            # validation perplexity does not beat the best stored loss.
            val_loss = evaluate(val_data_src, val_data_trg, eval_batch_size)
            logging('|VALIDATION| step number {:3d} | valid loss {:5.2f} | '
                    'valid ppl {:8.2f} | '.format(step_number, val_loss,
                                                  math.exp(val_loss)))
            save_checkpoint(model, optimizer, args.save, suffix=str(step_number))  # just for debug
            # save_checkpoint(model, optimizer, args.save, suffix="last")
            if step_number > args.start_decaying_lr_step:
                if math.exp(val_loss) < math.exp(stored_loss):
                    # save_checkpoint(model, optimizer, args.save)
                    # logging('Saving Normal!')
                    stored_loss = val_loss
                else:
                    lr *= 0.5
                    print('Lowering LR to: ' + str(lr))
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
        step_number += 1
        i += seq_len
        # Free per-step tensors before the next window.
        del data, targets, loss, raw_loss
        del output, rnn_hs, dropped_rnn_hs
    return step_number, stored_loss, lr
train_loader, test_loader = dataset.get100(batch_size=args.batch_size, num_workers=1) model = model.cifar100(n_channel=args.channel) model = torch.nn.DataParallel(model, device_ids= range(args.ngpu)) if args.cuda: model.cuda() # optimizer optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) decreasing_lr = list(map(int, args.decreasing_lr.split(','))) print('decreasing_lr: ' + str(decreasing_lr)) best_acc, old_file = 0, None t_begin = time.time() try: # ready to go for epoch in range(args.epochs): model.train() if epoch in decreasing_lr: optimizer.param_groups[0]['lr'] *= 0.1 for batch_idx, (data, target) in enumerate(train_loader): indx_target = target.clone() if args.cuda: data, target = data.cuda(), target.cuda() data, target = Variable(data), Variable(target) optimizer.zero_grad() output = model(data) loss = F.cross_entropy(output, target) loss.backward() optimizer.step() if batch_idx % args.log_interval == 0 and batch_idx > 0:
def train(self, maxepoch=50, start=.01, end=0.0001):
    """Train the ensemble of SISR models patch-by-patch.

    Each outer iteration samples one LR/HR image pair, extracts aligned
    patches, and optimizes every model in ``self.SRmodels`` on a random
    patch batch with an L1 loss.  Results are pushed to ``self.logger``
    and a validation pass runs every 100 logger steps.

    NOTE(review): `start` and `end` are unused — presumably leftovers from
    the commented-out CyclicLR scheduler below.  `unfold_LR`/`unfold_HR`
    and `P` are also never used in the visible body.
    """
    #EACH EPISODE TAKE ONE LR/HR PAIR WITH CORRESPONDING PATCHES
    #AND ATTEMPT TO SUPER RESOLVE EACH PATCH

    #requires pytorch 1.1.0+ which is not possible on the server
    #scheduler = torch.optim.lr_scheduler.CyclicLR(self.agent.optimizer,base_lr=0.0001,max_lr=0.1)

    unfold_LR = torch.nn.Unfold(kernel_size=self.PATCH_SIZE, stride=self.PATCH_SIZE, dilation=1)
    unfold_HR = torch.nn.Unfold(kernel_size=self.PATCH_SIZE * 4, stride=self.PATCH_SIZE * 4, dilation=1)

    #START TRAINING
    indices = list(range(len(self.TRAINING_HRPATH)))
    lossfn = torch.nn.L1Loss()
    #random.shuffle(indices)
    for c in range(maxepoch):
        #FOR EACH HIGH RESOLUTION IMAGE
        for n, idx in enumerate(indices):
            # NOTE(review): the loop variable `idx` is immediately overwritten,
            # so images are drawn uniformly at random (with replacement)
            # rather than visited in order.
            idx = random.sample(indices, 1)[0]

            #GET INPUT FROM CURRENT IMAGE
            HRpath = self.TRAINING_HRPATH[idx]
            LRpath = self.TRAINING_LRPATH[idx]
            LR = imageio.imread(LRpath)
            HR = imageio.imread(HRpath)
            LR, HR = self.getTrainingPatches(LR, HR)

            #WE MUST GO THROUGH EVERY SINGLE PATCH IN RANDOM ORDER
            patch_ids = list(range(len(LR)))
            random.shuffle(patch_ids)
            P = []
            for step in range(1):
                batch_ids = random.sample(patch_ids, self.batch_size)  #TRAIN ON A SINGLE IMAGE
                labels = torch.Tensor(batch_ids).long()
                lrbatch = LR[labels, :, :, :]
                hrbatch = HR[labels, :, :, :]
                lrbatch = lrbatch.to(self.device)
                hrbatch = hrbatch.to(self.device)

                #GET SISR RESULTS FROM EACH MODEL
                SR_result = torch.zeros(self.batch_size, 3,
                                        self.PATCH_SIZE * self.UPSIZE,
                                        self.PATCH_SIZE * self.UPSIZE).to(self.device)
                Wloss = torch.zeros(self.batch_size, self.SR_COUNT,
                                    self.PATCH_SIZE * self.UPSIZE,
                                    self.PATCH_SIZE * self.UPSIZE).to(self.device)
                loss_SISR = 0
                #probs = self.agent.model(lrbatch)
                # NOTE(review): agent output is stubbed out with zeros, so the
                # `choice`/c1..c4 statistics below are placeholders.
                probs = torch.zeros((10, 3, 10, 10))
                for j, sisr in enumerate(self.SRmodels):
                    self.SRoptimizers[j].zero_grad()  #zero our sisr gradients
                    hr_pred = sisr(lrbatch)
                    #weighted_pred = hr_pred * probs[:,j].unsqueeze(1)
                    SR_result += hr_pred
                self.agent.opt.zero_grad()

                #CALCULATE LOSS
                # NOTE(review): the loss uses only the LAST model's prediction
                # (`hr_pred` from the final loop iteration), not SR_result.
                l1diff = lossfn(hr_pred, hrbatch)
                #l1diff = torch.mean(torch.abs(SR_result - hrbatch))
                total_loss = l1diff
                total_loss.backward()

                #OPTIMIZE AND MOVE THE LEARNING RATE ACCORDING TO SCHEDULER
                [opt.step() for opt in self.SRoptimizers]
                [sched.step() for sched in self.schedulers]
                lr = self.SRoptimizers[-1].param_groups[0]['lr']
                #self.agent.opt.step()
                #self.agent.scheduler.step()
                SR_result = SR_result / 255
                hrbatch = hrbatch / 255

                #CONSOLE OUTPUT FOR QUICK AND DIRTY DEBUGGING
                choice = probs.max(dim=1)[1]
                c1 = (choice == 0).float().mean()
                c2 = (choice == 1).float().mean()
                c3 = (choice == 2).float().mean()
                c4 = (choice == 3).float().mean()
                print('\rEpoch/img: {}/{} | LR: {:.8f} | Agent Loss: {:.4f}, SISR Loss: {:.4f}, c1: {:.4f}, c2: {:.4f}, c3: {:.4f} c4:{:.4f}'\
                        .format(c,n,lr,total_loss.item(),loss_SISR, c1.item(), c2.item(), c3.item(),c4.item()),end="\n")

                #LOG AND SAVE THE INFORMATION
                scalar_summaries = {
                    'Loss/AgentLoss': total_loss,
                    'Loss/SISRLoss': loss_SISR,
                    "choice/c1": c1,
                    "choice/c2": c2,
                    "choice/c3": c3
                }
                hist_summaries = {
                    'actions': probs[0].view(-1),
                    "choices": choice[0].view(-1)
                }
                img_summaries = {
                    'sr/mask': probs[0][:3],
                    'sr/sr': SR_result[0].clamp(0, 1),
                    'sr/hr': hrbatch[0].clamp(0, 1)
                }
                self.logger.scalar_summary(scalar_summaries)
                self.logger.hist_summary(hist_summaries)
                self.logger.image_summary(img_summaries)

                # Every 100 logger steps: run validation, restore train mode,
                # log test metrics/images, and checkpoint all models.
                if self.logger.step % 100 == 0:
                    with torch.no_grad():
                        psnr, ssim, info = self.test.validate(save=False, quick=False)
                    self.agent.model.train()
                    [model.train() for model in self.SRmodels]
                    if self.logger:
                        self.logger.scalar_summary({
                            'Testing_PSNR': psnr,
                            'Testing_SSIM': ssim
                        })
                        masked_sr = torch.from_numpy(info['assignment']).float().permute(2, 0, 1)
                        srimg = (torch.from_numpy(info['SRimg']).float()).permute(2, 0, 1)
                        hrimg = (torch.from_numpy(info['HRimg']).float()).permute(2, 0, 1)
                        # Images arrive as HWC uint8-range arrays; rescale to [0, 1].
                        hrimg = hrimg / 255.0
                        srimg = srimg / 255.0
                        self.logger.image_summary({
                            'Testing/Test Assignment': masked_sr,
                            'Testing/SR': srimg,
                            'Testing/HR': hrimg
                        })
                    self.savemodels()
                self.logger.incstep()
            # (fragment) tail of a plotly Table cell spec — styling for the
            # results table; the enclosing call starts outside this chunk.
            line=dict(color='#506784'),
            fill=dict(color=[
                rowOddColor, rowEvenColor, rowOddColor, rowEvenColor,
                rowOddColor
            ]),
            align=['left', 'center'],
            font=dict(color='#506784', size=11)))
    ])
    fig.show()


if __name__ == '__main__':
    print('Start!')
    # reads in the dataset
    test_data = pd.read_csv('../Data/Annotations_Adjusted.csv')
    # trains and tests the models
    # Each call trains one classifier kind on the same data and returns its
    # result object (exact contents depend on model.train, defined elsewhere).
    bagging = model.train(test_data, 'bagging')
    dec_tree = model.train(test_data, 'dec_tree')
    forest = model.train(test_data, 'forest')
    mlp = model.train(test_data, 'mlp')
    sgd = model.train(test_data, 'sgd')
    gaussian_nb = model.train(test_data, 'gaussian_nb')
    nn = model.train(test_data, 'nn')
    svc = model.train(test_data, 'svc')
    # plots the results
    plot([dec_tree, sgd, mlp, gaussian_nb, bagging, forest, svc, nn])
def train():
    """Train the seq2seq language model for `args.epochs` epochs.

    Uses module-level globals: `model`, `criterion`, `train_data`, `val_data`,
    `ntokens`, `args`, plus helpers `repackage_hidden` and `evaluate`.
    Optimization is plain SGD applied manually to `p.data`; the LR is
    annealed by 4x whenever validation loss fails to improve, and the best
    model is checkpointed to `args.save`.
    """
    # Turn on training mode which enables dropout.
    model.train()
    hidden = model.init_hidden(args.batch_size)
    train_iter = data.BucketIterator(dataset=train_data, batch_size=args.batch_size,
                                     sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)))
    lr = args.lr
    best_val_loss = None
    total_loss = 0.
    epoch = 0
    epoch_start_time = time.time()
    batch_counter = 0
    start_time = time.time()
    samples_num = len(train_data.examples)
    while epoch < args.epochs:
        # NOTE(review): `next(iter(train_iter))` builds a fresh iterator every
        # step and takes its first batch; with a shuffling BucketIterator this
        # samples batches rather than sweeping the epoch — TODO confirm intent.
        batch = next(iter(train_iter))
        source = batch.src
        targets = batch.trg.view(-1)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(source, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        # (fixed: `clip_grad_norm` was deprecated in torch 0.4 and later removed)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # Manual SGD step.  (fixed: the `add_(scalar, tensor)` overload is
        # deprecated — use the `alpha=` keyword form.)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)

        # Accumulate a Python float (fixed: indexing a 0-dim tensor with [0]
        # was removed in modern torch; use .item()).
        total_loss += loss.item()

        if batch_counter % args.log_interval == 0 and batch_counter > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch_counter, len(train_data) // args.batch_size, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0.
            start_time = time.time()

        # Manually update epoch & batch states once a full epoch's worth of
        # samples has been consumed.
        if args.batch_size * batch_counter > samples_num:
            epoch += 1
            val_loss = evaluate(val_data)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                  'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                             val_loss, math.exp(val_loss)))
            print('-' * 89)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen in the validation dataset.
                lr /= 4.0
            epoch_start_time = time.time()
            batch_counter = 0
        batch_counter += 1
def train(model, optimizer, loss_fn, dataloader, metrics, params):
    """Train the model for one full pass over `dataloader`.

    Args:
        model: (torch.nn.Module) the neural network
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data;
            each item is a dict with keys 'previmg', 'currimg', 'currbb'
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels
            of each batch.  NOTE(review): currently unused — only the loss is summarized.
        params: (Params) hyperparameters (uses .cuda, .save_summary_steps, .batch_size)
    """
    # set model to training mode
    model.train()

    # summary for current training loop and a running average object for loss
    summ = []
    loss_avg = util.RunningAverage()
    counter = 0
    for i, data in enumerate(dataloader):
        optimizer.zero_grad()
        x1, x2, y = data['previmg'], data['currimg'], data['currbb']
        if params.cuda:
            x1, x2, y = Variable(x1.cuda()), Variable(x2.cuda()), Variable(
                y.cuda(), requires_grad=False)
        output = model(x1, x2)
        loss = loss_fn(output, y)
        # NOTE(review): retain_graph=True keeps the autograd graph alive after
        # backward; with a fresh forward each iteration this should be
        # unnecessary and costs memory — confirm nothing backwards twice
        # before removing.
        loss.backward(retain_graph=True)

        # performs updates using calculated gradients
        optimizer.step()

        # Evaluate summaries only once in a while
        if i % params.save_summary_steps == 0:
            # extract data from torch Variable, move to cpu, convert to numpy arrays
            output = output.data.cpu().numpy()
            # compute all metrics on this batch
            # (fixed: `loss.data[0]` indexing of a 0-dim tensor was removed in
            # modern torch; use `loss.item()`)
            summary_batch = {}
            summary_batch['loss'] = loss.item()
            summ.append(summary_batch)
            logging.info('- Average Loss for iteration {} is {}'.format(
                i, loss.item() / params.batch_size))

        # update the average loss
        loss_avg.update(loss.item())
        counter += 1
        print(counter)

    # compute mean of all metrics in summary
    # (fixed: guard against an empty dataloader, which previously raised on summ[0])
    metrics_mean = {
        metric: np.mean([x[metric] for x in summ])
        for metric in summ[0]
    } if summ else {}
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v)
                                for k, v in metrics_mean.items())
    logging.info("- Train metrics: " + metrics_string)
parser.add_argument('-l', '--layers', default=1, type=int)
# BUG FIX: --maxepoch was declared type=str, so a command-line value arrived
# as a string while the int default (2000) stayed an int; the value is passed
# to model.train as an epoch count and must be an integer.
parser.add_argument('-m', '--maxepoch', default=2000, type=int)
args = parser.parse_args()

# Unpack CLI arguments into module-level names used below.
hidden_dim = args.hiddendim
featurefile = args.featurefile
neighbourfile = args.neighbourfile
neighbourvariable = args.neighbourvariable
distancefile = args.distancefile
distancevariable = args.distancevariable
lambda1 = args.l1
lambda2 = args.l2
lr = args.lr
finaldim = args.finaldim
layers = args.layers
maxepoch = args.maxepoch

# Load mesh features, connectivity, and geodesic distance weights.
feature, logrmin, logrmax, smin, smax, pointnum = load_data(featurefile)
neighbour, degrees, maxdegree = load_neighbour(neighbourfile, neighbourvariable, pointnum)
geodesic_weight = load_geodesic_weight(distancefile, distancevariable, pointnum)

# NOTE: rebinds the `model` module name to the convMESH instance.
model = model.convMESH(pointnum, neighbour, degrees, maxdegree, hidden_dim,
                       finaldim, layers, lambda1, lambda2, lr)
model.train(feature, geodesic_weight, maxepoch)
def run(model, train_loader, val_loader):
    '''
    Train and evaluate the model.
    :param model: the initialized model
    :param train_loader: training data loader
    :param val_loader: validation data loader
    :return: None

    Uses module-level globals: `args`, `state`, `best_acc`, `class_list`,
    and helpers `get_optimizer`, `evaluate`, `train`, `save_checkpoint`,
    `Logger`.
    '''
    # Initialize state
    ## global used for checkpoint bookkeeping
    global best_acc
    ## C-way classification, so use CrossEntropyLoss
    criterion = nn.CrossEntropyLoss()
    ## torch.optim provides the optimization algorithms; the optimizer holds
    ## the current parameter state and updates it from computed gradients.
    optimizer = get_optimizer(model.parameters(), args)

    # Load a checkpoint: allows resuming training from a saved epoch.
    if args.resume:
        # --resume checkpoint/checkpoint.pth.tar
        # load checkpoint
        print('Resuming from checkpoint...')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!!'
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        state['start_epoch'] = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Evaluation-only mode: confusion matrix plus precision/recall/F1 report.
    if args.evaluate:
        print('\nEvaluate only')
        test_loss, test_acc, predict_all, labels_all = evaluate(val_loader,
                                                                model,
                                                                criterion,
                                                                test=True)
        print('Test Loss:%.8f,Test Acc:%.2f' % (test_loss, test_acc))
        # classification report and confusion matrix
        report = metrics.classification_report(labels_all,
                                               predict_all,
                                               target_names=class_list,
                                               digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        print('\n report ', report)
        print('\n confusion', confusion)
        return

    # Training and validation
    ## append logger file
    logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=None)
    ## set the logger header columns
    logger.set_names([
        'Learning Rate', 'epoch', 'Train Loss', 'Valid Loss', 'Train Acc.',
        'Valid Acc.'
    ])
    # NOTE(review): the loop range uses state['start_epoch']/state['epochs']
    # but the progress print uses args.epochs — confirm they agree.
    for epoch in range(state['start_epoch'], state['epochs'] + 1):
        print('[{}/{}] Training'.format(epoch, args.epochs))
        # train
        train_loss, train_acc = train(train_loader, model, criterion, optimizer)
        # val
        test_loss, test_acc = evaluate(val_loader, model, criterion, test=None)
        # log the key metrics for this epoch
        logger.append([
            state['lr'],
            int(epoch), train_loss, test_loss, train_acc, test_acc
        ])
        print('train_loss:%f, val_loss:%f, train_acc:%f, val_acc:%f' % (
            train_loss,
            test_loss,
            train_acc,
            test_acc,
        ))
        # save the model; keep a separate copy when it beats the best so far
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'train_acc': train_acc,
                'test_acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict()
            },
            is_best,
            checkpoint=state['checkpoint'])
    print('Best acc:', best_acc)
def train(epoch, optimizer, compression_scheduler=None):
    """Train the language model for one epoch.

    Args:
        epoch: current epoch index (used for logging and scheduler callbacks).
        optimizer: optimizer whose step() applies the gradients.
        compression_scheduler: optional distiller CompressionScheduler; when
            given, its minibatch-begin / before-backward / minibatch-end hooks
            are invoked in order around each step.

    Uses module-level globals: `model`, `train_data`, `corpus`, `criterion`,
    `args`, `msglogger`, `distiller`, `tflogger`, and helpers `get_batch` /
    `repackage_hidden`.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_samples = train_data.size(0)
    steps_per_epoch = math.ceil(total_samples / args.bptt)

    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    # The line below was fixed as per: https://github.com/pytorch/examples/issues/214
    for batch, i in enumerate(range(0, train_data.size(0), args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        if compression_scheduler:
            # Before running the backward phase, we add any regularization loss computed by the scheduler
            regularizer_loss = compression_scheduler.before_backward_pass(epoch, minibatch_id=batch,
                                                                          minibatches_per_epoch=steps_per_epoch,
                                                                          loss=loss)
            loss += regularizer_loss
        optimizer.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()

        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

        # Periodic console + TensorBoard logging; counters reset after each report.
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            lr = optimizer.param_groups[0]['lr']
            msglogger.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} '
                '| loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
            # NOTE(review): 'Peformance' is misspelled, but it is a runtime
            # stats key — renaming would change logged output downstream.
            stats = ('Peformance/Training/', OrderedDict([
                ('Loss', cur_loss),
                ('Perplexity', math.exp(cur_loss)),
                ('LR', lr),
                ('Batch Time', elapsed * 1000)])
            )
            steps_completed = batch + 1
            distiller.log_training_progress(stats, model.named_parameters(), epoch, steps_completed,
                                            steps_per_epoch, args.log_interval, [tflogger])
# Flatten each 2-D image into a single feature vector per sample.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# print("shuffling ... ")
X_train, y_train = data.data_shuffle(X_train, y_train)
X_test, y_test = data.data_shuffle(X_test, y_test)

print("preprocessing ...")
# Scale pixels to [0, 1], centre feature-wise, then project onto the
# top 80 principal components (PCA is fit on train, applied to both).
X_train = X_train / 255.0
X_test = X_test / 255.0
X_train = data.mean_wise(X_train)
X_test = data.mean_wise(X_test)
X_train, X_test = data.pca(X_train, X_test, 80)
# y_train, y_test = data.one_hot(y_train, y_test)

print("training ...")
# Test accuracies previously measured for the alternative classifiers:
# classifier = model.train(X_train, y_train, model.dt("entropy", 0.8)) # 70.68%
# classifier = model.train(X_train, y_train, model.rf(1000, "sqrt")) # 96.49%
# classifier = model.train(X_train, y_train, model.gbdt(1000, "sqrt")) # 95.17%
# classifier = model.train(X_train, y_train, model.logit(1.0)) # 90.14%
# classifier = model.train(X_train, y_train, model.mlp(1000, "logistic")) # 93.36%
# classifier = model.train(X_train, y_train, model.svm(1.0, "rbf")) # 96.82%
# classifier = model.train(X_train, y_train, model.knn(10, "uniform")) # 95.34%
classifier = model.train(X_train, y_train, model.bayes())  # 82.30%

print("testing ...")
print(model.acc(X_train, y_train, classifier))
print(model.acc(X_test, y_test, classifier))
if args.cuda: model.cuda() # optimizer optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd, momentum=0.9) decreasing_lr = list(map(int, args.decreasing_lr.split(','))) print('decreasing_lr: ' + str(decreasing_lr)) best_acc, old_file = 0, None t_begin = time.time() try: # ready to go for epoch in range(args.epochs): model.train() if epoch in decreasing_lr: optimizer.param_groups[0]['lr'] *= 0.1 for batch_idx, (data, target) in enumerate(train_loader): indx_target = target.clone() if args.cuda: data, target = data.cuda(), target.cuda() data, target = Variable(data), Variable(target) optimizer.zero_grad() output = model(data) loss = F.cross_entropy(output, target) loss.backward() optimizer.step() if batch_idx % args.log_interval == 0 and batch_idx > 0:
def train(base_rates):
    """One epoch of joint language-model + outlier-exposure (OE) training.

    Args:
        base_rates: per-token base-rate distribution; the OE term pushes the
            model's softmax on OE data toward this distribution.

    Uses module-level globals: `model`, `optimizer`, `criterion`, `params`,
    `args`, `train_data`, `oe_dataset`, `corpus`, `epoch`, and helpers
    `get_batch` / `repackage_hidden`.
    """
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    total_oe_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    batch, i = 0, 0

    # indices for randomizing order of segments
    # NOTE(review): these shuffled index arrays are never used below.
    train_indices = np.arange(train_data.size(0) // args.bptt)
    np.random.shuffle(train_indices)
    oe_indices = np.arange(oe_dataset.size(0) // args.bptt)
    np.random.shuffle(oe_indices)

    # BUG FIX: seq_len was referenced below but its only assignment was
    # commented out ("# seq_len = args.bptt"), which raises NameError unless
    # a module-level seq_len happens to exist — TODO confirm no such global.
    seq_len = args.bptt
    br = None
    for i in range(
            0, train_data.size(0), args.bptt
    ):  # Assume OE dataset is larger. It is, because we're using wikitext-2.
        # Scale the LR by the segment length for this step, restoring it after.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)
        data_oe, _ = get_batch(oe_dataset, i, args, seq_len=seq_len)

        # Don't train on this batch if the sequence lengths are different
        # (happens at end of epoch).
        if data.size(0) != data_oe.size(0):
            continue

        # We need a new hidden state for each segment, because this makes
        # evaluation easier and more meaningful.
        hidden = model.init_hidden(2 * args.batch_size)
        hidden = repackage_hidden(hidden)

        # Run in-distribution and OE data through the model as one wide batch,
        # then split the top-layer activations back apart.
        output, hidden, rnn_hs, dropped_rnn_hs = model(torch.cat(
            [data, data_oe], dim=1), hidden, return_h=True)
        output, output_oe = torch.chunk(dropped_rnn_hs[-1], dim=1, chunks=2)
        output, output_oe = output.contiguous(), output_oe.contiguous()
        output = output.view(output.size(0) * output.size(1), output.size(2))

        raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)
        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])

        # OE loss: cross-entropy between the OE softmax and the base rates.
        logits_oe = model.decoder(output_oe)
        smaxes_oe = F.softmax(logits_oe - torch.max(logits_oe, dim=-1, keepdim=True)[0], dim=-1)
        # Build the (broadcast) base-rate tensor once and reuse it.
        br = Variable(
            torch.FloatTensor(base_rates).unsqueeze(0).unsqueeze(0).expand_as(
                smaxes_oe)).cuda() if br is None else br
        loss_oe = -(smaxes_oe.log() * br).sum(-1)  # for cross entropy
        loss_oe = loss_oe.mean()  # for ERM
        #
        if args.use_OE == 'yes':
            loss_bp = loss + 0.5 * loss_oe
        else:
            loss_bp = loss

        optimizer.zero_grad()
        loss_bp.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in
        # RNNs / LSTMs.  (fixed: `clip_grad_norm` was deprecated in torch 0.4)
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        total_oe_loss += loss_oe.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            # (fixed: indexing a 0-dim tensor with [0] was removed in modern
            # torch; use .item())
            cur_loss = total_loss.item() / args.log_interval
            cur_oe_loss = total_oe_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | oe_loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.
                format(epoch, batch,
                       len(train_data) // args.bptt,
                       optimizer.param_groups[0]['lr'],
                       elapsed * 1000 / args.log_interval, cur_loss,
                       cur_oe_loss, math.exp(cur_loss),
                       cur_loss / math.log(2)))
            total_loss = 0
            total_oe_loss = 0
            start_time = time.time()
        ###
        batch += 1
def run(self, level = 'intermediate'):
    """Autotune `self.operation` over a grid of problem sizes and export
    the resulting profiles + decision-tree predictor to JSON.

    Python 2 code (uses print statements and py2 `map` list semantics).

    NOTE(review): `level` is validated but otherwise unused in this body.
    """
    assert level in ['simple', 'intermediate', 'full']
    tools.dtype = self.dtype
    device = self.device
    operation = self.operation
    context = sc.driver.context(device)
    if self.logger:
        self.logger.info("----------------")
        self.logger.info(operation.__name__.replace('_','-').upper())
        self.logger.info(tools.dtype.__name__.upper())
        self.logger.info("----------------")
    #BLAS1 training sizes
    if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
        sizes = [(10**x,) for x in range(3,8)]
    #BLAS2 training sizes
    if operation in [sc.templates.elementwise_2d, sc.templates.reduce_2d_rows, sc.templates.reduce_2d_cols]:
        sizes = []
        #Square
        for N in [896, 1280, 1760, 2560]:
            sizes += [(N, N)]
        #Short/Fat
        for M in [16, 32, 64, 128, 512, 1024]:
            for N in [1024, 4096, 16384, 65536]:
                sizes += [(M, N)]
        #Tall/Skinny
        for N in [16, 32, 64, 128, 512, 1024]:
            for M in [1024, 4096, 16384, 65536]:
                sizes += [(M, N)]
    #BLAS3 training sizes
    if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
        sizes = []
        #Square
        for N in [896, 1760, 2048, 2560]:
            sizes += [(N, N, N)]
        #LaPack
        for N in [896, 1760, 2048, 2560]:
            for K in [16, 32, 64, 128]:
                sizes += [(N, N, K)]
        #Covariance
        for N in [16, 32, 64, 128, 256]:
            for K in [16000,32000,64000,128000]:
                sizes += [(N, N, K)]
        #DeepSpeech
        for M in [1760, 2048, 2560, 4096]:
            for N in [16, 32, 64, 128, 7000]:
                sizes += [(M, N, M)]
        for K in [1760, 2048, 2560, 4096]:
            for M, N in [(5124,9124),(35,8457)]:
                sizes += [(M, N, K)]
        for M, K in [(7680,2560),(3072,1024)]:
            for N in [16, 32, 64, 128]:
                sizes += [(M, N, K)]
    #Training data
    performance = tools.metric_of(operation)
    profiles, X, Y = [], [], []
    #Restore progress
    savepath = os.path.join('save', tools.dtype.__name__, operation.__name__)
    if not os.path.exists(savepath):
        os.makedirs(savepath)
    try:
        with open(os.path.join(savepath, 'X.csv')) as f:
            X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
        # NOTE(review): this comprehension looks malformed — `for v in row`
        # references `row` before the following clause defines it, so it
        # raises NameError and the bare `except` below silently discards ALL
        # restored progress.  Likely intended:
        # profiles = [map(int, row) for row in csv.reader(f, delimiter=',')]
        with open(os.path.join(savepath, 'profiles.csv')) as f:
            profiles = [map(int,row) for v in row for row in csv.reader(f, delimiter=',')]
        with open(os.path.join(savepath, 'Y.csv')) as f:
            Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
        #Recompute Y
        #Y = []
        #for x in X:
        #    tree, _ = tools.tree_of(operation, x, context)
        #    Y.append([performance(x, tools.benchmark(operation(*best), tree)) for best in profiles])
    except:
        # NOTE(review): bare except hides real errors (see note above);
        # narrowing to IOError/ValueError would be safer.
        pass
    #Save data
    def save():
        # Persist the dataset so an interrupted run can resume.
        for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]):
            with open(os.path.join(savepath, fname), 'wb') as f:
                csv.writer(f).writerows(data)
    #Tuning
    for idx, x in enumerate(sizes):
        #Create new line on log
        if idx>0:
            self.progress_bar.set_finished()
        self.progress_bar.set_prefix(', '.join(map(str, x)))
        #Skip if already saved
        if x in X:
            row = Y[X.index(x)]
            self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
            continue
        #Best existing profile for x
        tree, operands = tools.tree_of(operation, x, context)
        y = [performance(x, tools.benchmark(operation(*p), tree)) for p in profiles]
        best = profiles[np.argmax(y)] if y else None
        #Retune if necessary
        tune = not (best and optimize.is_local_optimum(best, operation, x, context))
        if tune:
            optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
            best = optimizer.run(operation, x, context, prior=best)[0]
            if best not in profiles:
                profiles.append(best)
                # Backfill performance of the new profile on all previous sizes.
                for xx,yy in zip(X, Y):
                    tree, _ = tools.tree_of(operation, xx, context)
                    # NOTE(review): `time` shadows the `time` module locally.
                    time = tools.benchmark(operation(*best), tree)
                    yy.append(performance(xx, time))
        #Update dataset
        X.append(x)
        tree, operands = tools.tree_of(operation, x, context)
        y = [performance(x,tools.benchmark(operation(*prf), tree)) for prf in profiles]
        Y.append(y)
        #Save data
        save()
        #print performance info in case no tuning was done
        if not tune:
            row = Y[X.index(x)]
            self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
    self.progress_bar.set_finished()
    save()
    #Adding external profiles
    for prof in tools.external_profiles(operation):
        profiles.append(prof.__class__.__name__)
        for x, y in zip(X, Y):
            tree, operands = tools.tree_of(operation, x, context)
            perf = performance(x,tools.benchmark(prof, tree, operation))
            if max(y) < perf:
                print x, '\t', prof.__class__.__name__, '\toutperform: \t', int(perf), tools.metric_name_of(operation)
            y.append(perf)
    #Pruning of useless profiles
    X = np.array(X)
    Y = np.array(Y)
    if len(Y[0]) > 1:
        # Drop profiles that are never the argmax for any size.
        idx = np.where(np.bincount(np.argmax(Y, 1), minlength=len(profiles))==0)[0]
        profiles = [p for ip,p in enumerate(profiles) if ip not in idx]
        Y = np.delete(Y, idx, axis=1)
    #Exporting to JSON
    json_path = tools.sanitize(device.name) + '.json' if not self.json_path else self.json_path
    if os.path.isfile(json_path):
        json_data = json.load(open(json_path, 'r'))
    else:
        json_data = {}
        json_data["version"] = "1.0"
    operation_name = operation.__name__
    if operation_name not in json_data:
        json_data[operation_name] = {}
    json_data[operation_name][tools.dtype.__name__] = {}
    D = json_data[operation_name][tools.dtype.__name__]
    if len(profiles) > 1:
        # Serialize the random-forest predictor tree-by-tree.
        clf, nrmse = model.train(X, Y, profiles)
        D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
                           'children_right': e.tree_.children_right.tolist(),
                           'threshold': e.tree_.threshold.astype('float64').tolist(),
                           'feature': e.tree_.feature.astype('float64').tolist(),
                           'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
    D['profiles'] = [tools.convert(x) for x in profiles]
    json.dump(json_data, open(json_path,'w'))
def main():
    """Entry point: build the dataset loaders, construct a fixed-choice
    single-path supernet, report its FLOPs/params, and train it."""
    # args & device
    args = config.get_args()
    if torch.cuda.is_available():
        print('Train on GPU!')
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # dataset
    assert args.dataset in ['cifar10', 'imagenet']
    train_transform, valid_transform = data_transforms(args)
    if args.dataset == 'cifar10':
        trainset = torchvision.datasets.CIFAR10(root=os.path.join(args.data_dir, 'cifar'),
                                                train=True, download=True,
                                                transform=train_transform)
        train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                                   shuffle=True, pin_memory=True,
                                                   num_workers=8)
        valset = torchvision.datasets.CIFAR10(root=os.path.join(args.data_dir, 'cifar'),
                                              train=False, download=True,
                                              transform=valid_transform)
        val_loader = torch.utils.data.DataLoader(valset, batch_size=args.batch_size,
                                                 shuffle=False, pin_memory=True,
                                                 num_workers=8)
    elif args.dataset == 'imagenet':
        train_data_set = datasets.ImageNet(
            os.path.join(args.data_dir, 'ILSVRC2012', 'train'), train_transform)
        val_data_set = datasets.ImageNet(
            os.path.join(args.data_dir, 'ILSVRC2012', 'valid'), valid_transform)
        train_loader = torch.utils.data.DataLoader(train_data_set, batch_size=args.batch_size,
                                                   shuffle=True, num_workers=8,
                                                   pin_memory=True, sampler=None)
        val_loader = torch.utils.data.DataLoader(val_data_set, batch_size=args.batch_size,
                                                 shuffle=False, num_workers=8,
                                                 pin_memory=True)

    # SinglePath_OneShot: fixed architecture choice per layer.
    choice = [2, 0, 2, 3, 2, 2, 3, 1, 2, 1, 0, 1, 0, 3, 1, 0, 0, 2, 3, 2]
    model = SinglePath_Network(args.dataset, args.resize, args.classes, args.layers, choice)
    criterion = nn.CrossEntropyLoss().to(device)
    # BUG FIX: torch.optim.SGD's positional signature is
    # (params, lr, momentum, dampening, weight_decay, ...).  Passing
    # args.weight_decay as the 4th positional argument set *dampening*
    # and left weight decay at 0.  Use explicit keywords.
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # Linear LR decay to 0 over args.epochs.
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: 1 - (epoch / args.epochs))

    # flops & params & structure (profiled on CPU before moving to device)
    flops, params = profile(
        model,
        inputs=(torch.randn(1, 3, 32, 32), ) if args.dataset == 'cifar10'
        else (torch.randn(1, 3, 224, 224), ),
        verbose=False)
    # print(model)
    print('Random Path of the Supernet: Params: %.2fM, Flops:%.2fM' %
          ((params / 1e6), (flops / 1e6)))
    model = model.to(device)
    summary(model, (3, 32, 32) if args.dataset == 'cifar10' else (3, 224, 224))

    # train supernet
    start = time.time()
    for epoch in range(args.epochs):
        train(args, epoch, train_loader, device, model, criterion, optimizer,
              scheduler, supernet=False)
        scheduler.step()
        # Validate and checkpoint every args.val_interval epochs.
        if (epoch + 1) % args.val_interval == 0:
            validate(args, epoch, val_loader, device, model, criterion,
                     supernet=False)
            utils.save_checkpoint({
                'state_dict': model.state_dict(),
            }, epoch + 1, tag=args.exp_name)
    utils.time_record(start)
def train():
    """Train CIFAR-10 for a number of steps.

    Builds the TF1 graph (inputs, inference, loss with L2 regularization on
    the FC layers, train op) and runs it under a MonitoredTrainingSession
    with checkpointing, a stop-at-step hook, a NaN guard, and periodic
    loss/throughput logging.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.get_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits, fc1_w, fc2_w, fc1_b, fc2_b = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # L2 regularization for the fully connected parameters.
        regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) +
                        tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))
        # Add the regularization term to the loss.
        loss += 5e-4 * regularizers

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                # -1 so the first before_run bumps the counter to step 0.
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                # Every log_frequency steps, report loss and throughput
                # averaged over the window since the last report.
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        # MonitoredTrainingSession handles checkpointing/recovery and runs
        # the hooks; NanTensorHook aborts training if the loss becomes NaN.
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    allow_soft_placement=True)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
#!/usr/bin/python
"""Train a text model on the Yelp dataset and persist all artifacts:
the model architecture (JSON), dictionary size, tokenizer (pickle),
and timestamped weights (HDF5)."""
import time
import six.moves.cPickle

import model
import yelp_reader

# NOTE: rebinds the `model` module name to the trained network object;
# the module is not needed afterwards.
model, tokeniser, dictionarySize = model.train(yelp_reader, oneHot = True, oneHotAveraged = True, contextHashes = False)

jsonModel = model.to_json()
# FIX: the original left three file handles unclosed
# (`open(...).write(...)`); use context managers so the data is flushed
# and the descriptors released deterministically.
with open('model.json', 'w') as f:
    f.write(jsonModel)
with open('model-dictionary-size.dat', 'w') as f:
    f.write(str(dictionarySize))
with open("tokeniser.pkl", "wb") as f:
    six.moves.cPickle.dump(tokeniser, f)

model.save_weights('model-' + str(time.time()) + '.h5')
def trainCNN_on_Sparrow_architecture():
    """Grid-search training of the 'sparrow' CNN architecture.

    Loads spectrogram train/dev data, normalises it with a global
    mean/variance file computed from the training set, then trains one
    model per (dropout, learning-rate) combination, saving models, logs
    and a validation loss/accuracy plot for each run.

    NOTE(review): relies on module-level names not visible in this file:
    ``dataset``, ``model``, ``makeDirectory``, ``plot_2dGraph`` and the
    label sources ``trainP`` / ``devP`` — presumably globals defined
    elsewhere; verify before refactoring.
    """
    # CNN Training parameters
    activation = 'elu'  # 'elu'
    init_type = 'xavier'
    batch_size = 32
    epochs = 1000

    # Regularizer parameters
    use_lr_decay = False     # set this flag for LR decay
    wDecayFlag = False       # whether to perform L2 weight decay or not
    lossPenalty = 0.001      # Using lambda=0.001 .
    applyBatchNorm = False
    deviceId = "/gpu:0"      # NOTE(review): assigned but never used below

    # Adam parameters
    optimizer_type = 'adam'
    b1 = 0.9
    b2 = 0.999
    epsilon = 0.1            # 1e-08 is the default
    momentum = 0.95

    dropout1 = 1.0           # for input to first FC layer
    dropout2 = 1.0           # for intermediate layer input
    dropouts = [0.5, 0.4, 0.3, 0.2, 0.1]  # ,0.6]
    lambdas = [0.0005, 0.001]  # NOTE(review): unused — lossPenalty is fixed above

    architectures = [3]  # birds architecture sparrow, to make it unified (check model.py for definition)
    trainingSize = [1]   # in seconds
    learning_rates = [0.0001, 0.00008]
    targets = 2
    fftSize = 256
    specType = 'mag_spec'
    padding = False

    for duration in trainingSize:
        print('Now loading the data !!')
        outPath = '../../spectrograms/' + specType + '/' + str(
            fftSize) + 'FFT/' + str(duration) + 'sec/'
        mean_std_file = outPath + 'train/mean_std.npz'

        # Load training data, labels and perform norm.
        # Order matters: the global norm file is computed from the
        # training data BEFORE either split is normalised with it.
        tD = dataset.load_data(outPath + 'train/')
        tL = dataset.get_labels_according_to_targets(trainP, targets)
        dataset.compute_global_norm(tD, mean_std_file)

        print('Shape of labels: ', tL.shape)
        #tD = dataset.normalise_data(tD,mean_std_file,'utterance')  # utterance level
        tD = dataset.normalise_data(tD, mean_std_file, 'global_mv')  # global
        #print('Norm td: max and min are ', np.max(tD))

        # Load dev data, labels and perform norm (same global stats as train).
        devD = dataset.load_data(outPath + 'dev/')
        devL = dataset.get_labels_according_to_targets(devP, targets)
        #devD = dataset.normalise_data(devD,mean_std_file,'utterance')
        #print('first Norm dev: max and min are ', np.max(devD))
        devD = dataset.normalise_data(devD, mean_std_file, 'global_mv')
        #print('Norm dev: max and min are ', np.max(devD))

        trainSize = str(
            duration) + 'sec'  ##may be change this in model.py also !

        ### We are training on TRAIN set and validating on DEV set
        t_data = tD
        t_labels = tL
        v_data = devD
        v_labels = devL

        for dropout in dropouts:
            architecture = architectures[0]
            for lr in learning_rates:
                #hyp_str ='cnn'+str(architecture)+'_keepProb_1.0_' + str(dropout)+str(dropout3)+'lr'+str(lr)
                # Hyper-parameter string names the log/model directories for this run.
                hyp_str = 'sparrow' + '_keep_' + str(
                    dropout) + '_' + 'lr' + str(lr) + '_' + str(
                        activation) + '_' + 'fft' + str(fftSize)

                log_dir = '../tensorflow_log_dir/sparrowArch/' + hyp_str
                model_save_path = '../models/sparrowArch/' + hyp_str
                logfile = model_save_path + '/training.log'
                figDirectory = model_save_path
                makeDirectory(model_save_path)

                print('Training model with ' + str(duration) +
                      ' sec data and cnnModel' + str(architecture))

                tLoss, vLoss, tAcc, vAcc = model.train(
                    architecture, fftSize, padding, trainSize, t_data,
                    t_labels, v_data, v_labels, activation, lr, use_lr_decay,
                    epsilon, b1, b2, momentum, optimizer_type, dropout,
                    dropout1, dropout2, model_save_path, log_dir, logfile,
                    wDecayFlag, lossPenalty, applyBatchNorm, init_type,
                    epochs, batch_size, targets)

                #plot_2dGraph('#Epochs', 'Avg CE Loss', tLoss,vLoss,'train_ce','val_ce', figDirectory+'/loss.png')
                #plot_2dGraph('#Epochs', 'Avg accuracy', tAcc,vAcc,'train_acc','val_acc',figDirectory+'/acc.png')
                plot_2dGraph('#Epochs', 'Val loss and accuracy', vLoss, vAcc,
                             'val_loss', 'val_acc',
                             figDirectory + '/v_ls_acc.png')
import tensorflow as tf
import os
from model import train, test
from configuration import get_config

# Read the run configuration once and start from a clean default graph.
config = get_config()
tf.reset_default_graph()


def _dispatch():
    """Route to training or testing based on the loaded configuration."""
    if config.train:
        # Training mode: hand the model path to the trainer.
        print("\nTraining Session")
        train(config.model_path)
        return
    # Test mode: a previously trained model must exist on disk.
    print("\nTest session")
    if not os.path.isdir(config.model_path):
        raise AssertionError("model path doesn't exist!")
    test(config.model_path)


if __name__ == "__main__":
    _dispatch()
def train(self, maxepoch=100, start=.01, end=0.0001):
    """Jointly train the SISR models and the selection agent.

    Each outer iteration samples one LR/HR image pair, extracts aligned
    patch batches, runs every SISR model on the LR batch, blends their
    outputs weighted by the agent's per-model probabilities, and
    back-propagates an L1 loss through the SISR models plus a selection
    loss through the agent.  Validation (Set5) and model saving happen
    every 100 logger steps.

    NOTE(review): ``start`` and ``end`` are accepted but never used in
    this body — presumably leftovers from a removed LR schedule; confirm
    before deleting.  The original file's indentation was lost, so the
    nesting of the final validate/save section is reconstructed — verify
    against version control.
    """
    #EACH EPISODE TAKE ONE LR/HR PAIR WITH CORRESPONDING PATCHES
    #AND ATTEMPT TO SUPER RESOLVE EACH PATCH

    #requires pytorch 1.1.0+ which is not possible on the server
    #scheduler = torch.optim.lr_scheduler.CyclicLR(self.agent.optimizer,base_lr=0.0001,max_lr=0.1)

    # NOTE(review): both unfold operators are created but never applied
    # below — patching is done by self.getTrainingPatches instead.
    unfold_LR = torch.nn.Unfold(kernel_size=self.PATCH_SIZE,
                                stride=self.PATCH_SIZE, dilation=1)
    unfold_HR = torch.nn.Unfold(kernel_size=self.PATCH_SIZE * 4,
                                stride=self.PATCH_SIZE * 4, dilation=1)

    #QUICK CHECK ON EVERYTHING — run one validation pass before training.
    with torch.no_grad():
        psnr, ssim, info = self.test.validateSet5(save=False, quick=False)

    #START TRAINING
    indices = list(range(len(self.TRAINING_HRPATH)))
    lossfn = torch.nn.L1Loss()
    lossCE = torch.nn.CrossEntropyLoss()   # NOTE(review): unused below
    softmaxfn = torch.nn.Softmax(dim=1)    # NOTE(review): unused below

    #random.shuffle(indices)
    for c in range(maxepoch):
        #FOR EACH HIGH RESOLUTION IMAGE
        for n, idx in enumerate(indices):
            # idx from enumerate is immediately overwritten: images are
            # drawn at random with replacement, `n` only counts steps.
            idx = random.sample(indices, 1)[0]

            #GET INPUT FROM CURRENT IMAGE
            HRpath = self.TRAINING_HRPATH[idx]
            LRpath = self.TRAINING_LRPATH[idx]
            LR = imageio.imread(LRpath)
            HR = imageio.imread(HRpath)
            LR, HR = self.getTrainingPatches(LR, HR)

            #WE MUST GO THROUGH EVERY SINGLE PATCH IN RANDOM ORDER
            patch_ids = list(range(len(LR)))
            random.shuffle(patch_ids)
            P = []  # NOTE(review): never appended to — dead variable
            for step in range(1):
                # One random patch batch per image per step.
                batch_ids = random.sample(
                    patch_ids,
                    self.batch_size)  #TRAIN ON A SINGLE IMAGE
                labels = torch.Tensor(batch_ids).long()
                lrbatch = LR[labels, :, :, :]
                hrbatch = HR[labels, :, :, :]
                lrbatch = lrbatch.to(self.device)
                hrbatch = hrbatch.to(self.device)

                #GET SISR RESULTS FROM EACH MODEL
                loss_SISR = 0  # NOTE(review): never updated — dead variable
                sisrs = []
                # Agent predicts per-pixel, per-model selection probabilities.
                probs = self.agent.model(
                    lrbatch)  #SO WE DON'T TAKE LOG OF 0...
                maxval, maxidx = probs.max(dim=1)
                for j, sisr in enumerate(self.SRmodels):
                    hr_pred = sisr(lrbatch)
                    sisrs.append(hr_pred)

                #UPDATE BOTH THE SISR MODELS AND THE SELECTION MODEL ACCORDING TO THEIR LOSS
                SR_result = torch.zeros(
                    self.batch_size, 3,
                    self.PATCH_SIZE * self.UPSIZE,
                    self.PATCH_SIZE * self.UPSIZE).to(self.device)
                l1diff = []
                for j, sr in enumerate(sisrs):
                    self.SRoptimizers[j].zero_grad()
                    mask = maxidx == j
                    # Soft blend: each model's output weighted by its
                    # selection probability (alternatives left commented).
                    pred = sr * probs[:, j].unsqueeze(1)
                    #pred = sr * mask.unsqueeze(1).float()
                    #pred = sr * (mask.unsqueeze(1).float() * maxval.unsqueeze(1))
                    SR_result += pred
                    # Per-pixel L1 (channel-averaged) used to pick the
                    # best model per pixel for the agent's target.
                    l1 = torch.abs(sr - hrbatch).mean(dim=1)
                    l1diff.append(l1)
                l1diff = torch.stack(l1diff, dim=1)
                minval, minidx = l1diff.min(dim=1)
                target = torch.nn.functional.one_hot(
                    minidx, len(sisrs)).permute(
                        0, 3, 1, 2)  #TARGET PROBABILITY MASK WE HOPE FOR?

                sisrloss = lossfn(SR_result, hrbatch)
                sisrloss.backward()
                [opt.step() for opt in self.SRoptimizers]
                # NOTE(review): schedulers step once per batch here —
                # confirm that matches the intended LR schedule.
                [sched.step() for sched in self.schedulers]

                # Second agent forward pass (after SISR update), clamped
                # so log() below never sees 0.
                self.agent.opt.zero_grad()
                probs = self.agent.model(lrbatch).clamp(
                    1e-10, 1)  #SO WE DON'T TAKE LOG OF 0...
                maxval, maxidx = probs.max(dim=1)
                # NLL of the per-pixel best model, plus a confidence term
                # pushing the max probability toward 1.
                selectionloss = torch.mean(-1 * probs.gather(
                    1, minidx.unsqueeze(1)).log()) + torch.mean(1 - maxval)
                selectionloss.backward()
                self.agent.opt.step()
                #self.agent.scheduler.step()

                #CONSOLE OUTPUT FOR QUICK AND DIRTY DEBUGGING
                lr = self.SRoptimizers[-1].param_groups[0]['lr']
                SR_result = SR_result / 255
                hrbatch = hrbatch / 255
                choice = probs.max(dim=1)[1]
                # Fraction of pixels assigned to each of the three models.
                c1 = (choice == 0).float().mean()
                c2 = (choice == 1).float().mean()
                c3 = (choice == 2).float().mean()
                print('\rEpoch/img: {}/{} | LR: {:.8f} | Agent Loss: {:.4f}, SISR Loss: {:.4f}, c1: {:.4f}, c2: {:.4f}, c3: {:.4f}'\
                        .format(c,n,lr,selectionloss.item(),sisrloss.item(), c1.item(), c2.item(), c3.item()),end="\n")

                #LOG AND SAVE THE INFORMATION
                scalar_summaries = {
                    'Loss/AgentLoss': selectionloss,
                    'Loss/SISRLoss': sisrloss,
                    "choice/c1": c1,
                    "choice/c2": c2,
                    "choice/c3": c3
                }
                hist_summaries = {
                    'actions': probs[0].view(-1),
                    "choices": choice[0].view(-1)
                }
                #img_summaries = {'sr/mask': probs[0][:3], 'sr/sr': SR_result[0].clamp(0,1), 'sr/hr': hrbatch[0].clamp(0,1),'sr/targetmask': target[0][:3]}
                img_summaries = {
                    'sr/mask': probs[0][:3],
                    'sr/sr': SR_result[0].clamp(0, 1),
                    'sr/targetmask': target[0][:3]
                }
                self.logger.scalar_summary(scalar_summaries)
                self.logger.hist_summary(hist_summaries)
                self.logger.image_summary(img_summaries)

                # Periodic validation + checkpoint every 100 logged steps.
                if self.logger.step % 100 == 0:
                    with torch.no_grad():
                        psnr, ssim, info = self.test.validateSet5(
                            save=False, quick=False)
                    # validateSet5 may flip modules to eval mode; restore
                    # training mode for agent and all SISR models.
                    self.agent.model.train()
                    [model.train() for model in self.SRmodels]
                    if self.logger:
                        self.logger.scalar_summary({
                            'Testing_PSNR': psnr,
                            'Testing_SSIM': ssim
                        })
                        mask = torch.from_numpy(
                            info['choices']).float().permute(2, 0, 1) / 255.0
                        best_mask = info['upperboundmask'].squeeze()
                        worst_mask = info['lowerboundmask'].squeeze()  # NOTE(review): unused
                        hrimg = info['HR'].squeeze() / 255.0
                        srimg = torch.from_numpy(
                            info['weighted'] / 255.0).permute(2, 0, 1)
                        self.logger.image_summary({
                            'Testing/Test Assignment': mask[:3],
                            'Testing/SR': srimg,
                            'Testing/HR': hrimg,
                            'Testing/upperboundmask': best_mask
                        })
                    self.savemodels()
                self.logger.incstep()