from collections import deque

import numpy as np


def train(computer, optimizer, real_criterion, binary_criterion, train, valid,
          starting_epoch, total_epochs, starting_iter, iter_per_epoch, savestr,
          logfile=True):
    print_interval = 100
    val_interval = 1000
    save_interval = 1000
    target_dim = None
    rldmax_len = 500
    val_batch = 500
    running_loss_deque = deque(maxlen=rldmax_len)
    # erase the logfile
    if isinstance(logfile, str) and logfile:
        open(logfile, 'w').close()

    for epoch in range(starting_epoch, total_epochs):
        # all these are batches
        for i in range(starting_iter, iter_per_epoch):
            train_step_loss = run_one_step(computer, train, optimizer, binary_criterion)
            if train_step_loss is not None:
                printloss = float(train_step_loss[0])
            else:
                raise ValueError("What is happening?")
            # computer.new_sequence_reset()
            running_loss_deque.appendleft(printloss)
            if i % print_interval == 0:
                running_loss = np.mean(running_loss_deque)
                logprint(logfile, "learning. count: %4d, training loss: %.10f, running loss: %.10f" %
                         (i, printloss, running_loss))

            if i % val_interval == 0:
                printloss = 0
                for _ in range(val_batch):
                    # NaN guard: a NaN accumulator fails x == x
                    assert printloss == printloss
                    val_loss = valid_one_step(computer, valid, binary_criterion)
                    if val_loss is not None:
                        printloss += float(val_loss[0])
                    else:
                        # assumes a module-level failure counter
                        global failure
                        failure += 1
                printloss = printloss / val_batch
                logprint(logfile, "validation. count: %4d, val loss : %.10f" %
                         (i, printloss))

            if i % save_interval == 0:
                save_model(computer, optimizer, epoch, i, savestr)
                print("model saved for epoch", epoch, "input", i)
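# logprint is called throughout but not defined in this file. A minimal sketch
# of what it is assumed to do (print to stdout and append to the log when
# logfile is a path); the real helper's signature may differ:
def logprint_sketch(logfile, content):
    content = str(content)
    if isinstance(logfile, str) and logfile:
        with open(logfile, 'a') as handle:
            handle.write(content + "\n")
    print(content)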
def valid(computer, optimizer, real_criterion, binary_criterion, train, valid,
          starting_epoch, total_epochs, starting_iter, iter_per_epoch, savestr,
          logfile=False):
    """
    Comparing the performances of different models is problematic: their
    reported losses do not refer to the same value. Processing by sequences
    and processing by steps are fundamentally different, so the comparison
    is unfair.

    :param computer:
    :param optimizer:
    :param real_criterion:
    :param binary_criterion:
    :param train: the ChannelManager class; it has a __next__ method defined
    :param valid: ditto
    :param starting_epoch:
    :param total_epochs:
    :param starting_iter:
    :param iter_per_epoch:
    :param savestr: a custom string that identifies this training run
    :param logfile:
    :return:
    """
    global global_exception_counter
    print_interval = 100
    val_interval = 10000
    save_interval = 10000
    target_dim = None
    rldmax_len = 500
    val_batch = 100000
    running_loss_deque = deque(maxlen=rldmax_len)

    computer.eval()
    val_losses = []
    for i in range(val_batch):
        val_loss = valid_one_step(computer, valid, binary_criterion)
        if val_loss is not None:
            printloss = float(val_loss[0])
            val_losses.append(printloss)
        else:
            raise ValueError("Why is val_loss None again?")
        if logfile:
            logprint(logfile, "validation. count: %4d, val loss : %.10f" %
                     (i, printloss))
        print("validation. count: %4d, loss: %.10f" % (i, printloss))
    print("loss:", np.mean(val_losses))
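# run_one_step and valid_one_step are defined elsewhere. A sketch of the
# contract the loops above appear to assume, with torch as the framework and a
# hypothetical (input, target) batch layout for the ChannelManager; callers
# index loss[0], so the sketch returns a one-element tensor, and None marks a
# failed step:
import torch

def valid_one_step_sketch(computer, valid, binary_criterion):
    input, target = next(valid)  # assumed ChannelManager batch layout
    with torch.no_grad():
        output = computer(input)
        loss = binary_criterion(output, target)
    if torch.isnan(loss):
        return None  # callers count this as a failure
    return loss.view(1)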
def train(computer, optimizer, real_criterion, binary_criterion, train, valid_dl,
          starting_epoch, total_epochs, starting_iter, iter_per_epoch, target_dim,
          savestr, beta, logfile=False, kill_time=True):
    valid_iterator = iter(valid_dl)
    print_interval = 100
    val_interval = 999
    save_interval = 5
    rldmax_len = 50
    val_batch = int(val_bat_cons / param_bs)
    running_cod_loss = deque(maxlen=rldmax_len)
    running_toe_loss = deque(maxlen=rldmax_len)
    traincms, validcms = ConfusionMatrixStats(target_dim - 1), ConfusionMatrixStats(target_dim - 1)
    cms = (traincms, validcms)
    if logfile:
        open(logfile, 'w').close()
    for name, param in computer.named_parameters():
        logprint(logfile, name)
        logprint(logfile, param.data.shape)

    for epoch in range(starting_epoch, total_epochs):
        for i, (input, target, loss_type) in enumerate(train):
            i = starting_iter + i
            if kill_time:
                out_of_time()
            if i % val_interval == 0:
                val_cod = 0
                val_toe = 0
                for _ in range(val_batch):
                    # separate names so the training batch above is not clobbered
                    try:
                        (vinput, vtarget, vloss_type) = next(valid_iterator)
                    except StopIteration:
                        valid_iterator = iter(valid_dl)
                        (vinput, vtarget, vloss_type) = next(valid_iterator)
                    cod_loss, toe_loss = run_one_patient(computer, vinput, vtarget, optimizer,
                                                         vloss_type, real_criterion,
                                                         binary_criterion, beta, cms,
                                                         validate=True)
                    val_cod += cod_loss
                    val_toe += toe_loss
                # average over the validation batches before printing
                val_cod = val_cod / val_batch
                val_toe = val_toe / val_batch
                logprint(logfile, "validation. cod: %.10f, toe: %.10f, total: %.10f" %
                         (val_cod, val_toe, val_cod + beta * val_toe))
                logprint(logfile, "validate sen: %.6f, spe: %.6f, roc: %.6f" %
                         tuple(validcms.running_stats()))

            if i < iter_per_epoch:
                cod_loss, toe_loss = run_one_patient(computer, input, target, optimizer,
                                                     loss_type, real_criterion,
                                                     binary_criterion, beta, cms)
                total_loss = cod_loss + toe_loss
                running_cod_loss.appendleft(cod_loss)
                running_toe_loss.appendleft(toe_loss)
                if i % print_interval == 0:
                    running_cod = np.mean(running_cod_loss)
                    running_toe = np.mean(running_toe_loss)
                    logprint(logfile,
                             "batch %4d. batch cod: %.5f, toe: %.5f, total: %.5f. "
                             "running cod: %.5f, toe: %.5f, total: %.5f" %
                             (i, cod_loss, toe_loss, cod_loss + beta * toe_loss,
                              running_cod, running_toe, running_cod + beta * running_toe))
                    logprint(logfile, "train sen: %.6f, spe: %.6f, roc: %.6f" %
                             tuple(traincms.running_stats()))
            else:
                break
        if epoch % save_interval == 0:
            save_model(computer, optimizer, epoch, i, savestr)
            print("model saved for epoch", epoch, "input", i)
        starting_iter = 0
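# ConfusionMatrixStats is assumed to accumulate thresholded binary predictions
# per output channel and expose running (sensitivity, specificity, ROC) for
# the "%.6f" prints above. A minimal sketch of that assumed interface, using
# balanced accuracy as a crude stand-in for ROC AUC:
import numpy as np

class ConfusionMatrixStatsSketch:
    def __init__(self, num_channels):
        self.num_channels = num_channels
        self.tp = self.fp = self.tn = self.fn = 0

    def update(self, prediction, target, threshold=0.5):
        pred = np.asarray(prediction) > threshold
        tgt = np.asarray(target) > threshold
        self.tp += int(np.sum(pred & tgt))
        self.fp += int(np.sum(pred & ~tgt))
        self.tn += int(np.sum(~pred & ~tgt))
        self.fn += int(np.sum(~pred & tgt))

    def running_stats(self):
        sen = self.tp / max(self.tp + self.fn, 1)
        spe = self.tn / max(self.tn + self.fp, 1)
        return sen, spe, (sen + spe) / 2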
def train(computer, optimizer, real_criterion, binary_criterion, train, valid_dl,
          starting_epoch, total_epochs, starting_iter, iter_per_epoch, savestr,
          logfile=False):
    valid_iterator = iter(valid_dl)
    print_interval = 10
    val_interval = 200
    save_interval = 800
    target_dim = None
    rldmax_len = 50
    val_batch = 100
    running_loss_deque = deque(maxlen=rldmax_len)
    if logfile:
        open(logfile, 'w').close()

    for epoch in range(starting_epoch, total_epochs):
        for i, (input, target, loss_type) in enumerate(train):
            i = starting_iter + i
            if target_dim is None:
                target_dim = target.shape[2]

            if i < iter_per_epoch:
                train_story_loss = run_one_patient(computer, input, target, target_dim,
                                                   optimizer, loss_type, real_criterion,
                                                   binary_criterion)
                if train_story_loss is not None:
                    printloss = float(train_story_loss[0])
                else:
                    raise ValueError("Why would story loss be None?")
                running_loss_deque.appendleft(printloss)
                if i % print_interval == 0:
                    running_loss = np.mean(running_loss_deque)
                    logprint(logfile, "learning. count: %4d, training loss: %.10f, running loss: %.10f" %
                             (i, printloss, running_loss))

                if i % val_interval == 0:
                    printloss = 0
                    for _ in range(val_batch):
                        # we should consider running validation multiple times and average. TODO
                        # separate names avoid clobbering the training batch
                        try:
                            (vinput, vtarget, vloss_type) = next(valid_iterator)
                        except StopIteration:
                            valid_iterator = iter(valid_dl)
                            (vinput, vtarget, vloss_type) = next(valid_iterator)
                        val_loss = run_one_patient(computer, vinput, vtarget, target_dim,
                                                   optimizer, vloss_type, real_criterion,
                                                   binary_criterion, validate=True)
                        if val_loss is not None:
                            printloss += float(val_loss[0])
                        else:
                            raise ValueError("Investigate this")
                    printloss = printloss / val_batch
                    logprint(logfile, "validation. count: %4d, val loss : %.10f" %
                             (i, printloss))

                if i % save_interval == 0:
                    save_model(computer, optimizer, epoch, i, savestr)
                    print("model saved for epoch", epoch, "input", i)
            else:
                break
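# The try/except StopIteration dance for cycling the validation loader repeats
# in several of these loops; a small helper could factor it out. A sketch,
# assuming valid_dl is a re-iterable DataLoader-like object:
def next_valid_batch(valid_iterator, valid_dl):
    """Return (batch, iterator), restarting the iterator when it is drained."""
    try:
        batch = next(valid_iterator)
    except StopIteration:
        valid_iterator = iter(valid_dl)
        batch = next(valid_iterator)
    return batch, valid_iterator

# usage inside the loops above would be:
#     (vinput, vtarget, vloss_type), valid_iterator = next_valid_batch(valid_iterator, valid_dl)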
def train(computer, optimizer, real_criterion, binary_criterion, train, valid_dl,
          starting_epoch, total_epochs, starting_iter, iter_per_epoch, savestr,
          beta, logfile=False, kill_time=True):
    # valid_iterator = iter(valid_dl)
    print_interval = 100
    val_interval = 500
    save_interval = int(32768 / param_bs)
    target_dim = None
    rldmax_len = 50
    val_batch = int(val_bat_cons / param_bs)
    running_cod_loss = deque(maxlen=rldmax_len)
    running_toe_loss = deque(maxlen=rldmax_len)
    if logfile:
        open(logfile, 'w').close()
    for name, param in computer.named_parameters():
        logprint(logfile, name)
        logprint(logfile, param.data.shape)

    for epoch in range(starting_epoch, total_epochs):
        for i, (input, target, loss_type) in enumerate(train):
            i = starting_iter + i
            if kill_time:
                out_of_time()
            if target_dim is None:
                target_dim = target.shape[1]

            if i < iter_per_epoch:
                cod_loss, toe_loss = run_one_patient(computer, input, target, optimizer,
                                                     loss_type, real_criterion,
                                                     binary_criterion, beta)
                total_loss = cod_loss + toe_loss
                running_cod_loss.appendleft(cod_loss)
                running_toe_loss.appendleft(toe_loss)
                if i % print_interval == 0:
                    running_cod = np.mean(running_cod_loss)
                    running_toe = np.mean(running_toe_loss)
                    logprint(logfile,
                             "batch %4d. batch cod: %.5f, toe: %.5f, total: %.5f. "
                             "running cod: %.5f, toe: %.5f, total: %.5f" %
                             (i, cod_loss, toe_loss, cod_loss + beta * toe_loss,
                              running_cod, running_toe, running_cod + beta * running_toe))

                # validation is disabled in this variant; the block below is
                # kept commented out as in the original.
                # if i % val_interval == 0:
                #     total_cod = 0
                #     total_toe = 0
                #     total_sen = 0
                #     total_spe = 0
                #     total_prec = 0
                #     total_reca = 0
                #     total_f1 = 0
                #     total_accu = 0
                #     total_roc = 0
                #     for _ in range(val_batch):
                #         # we should consider running validation multiple times and average. TODO
                #         try:
                #             (input, target, loss_type) = next(valid_iterator)
                #         except StopIteration:
                #             valid_iterator = iter(valid_dl)
                #             (input, target, loss_type) = next(valid_iterator)
                #
                #         cod_loss, toe_loss, sen, spe, prec, reca, f1, accu, roc \
                #             = run_one_patient(computer, input, target, optimizer, loss_type,
                #                               real_criterion, binary_criterion, beta, validate=True)
                #         total_cod += cod_loss
                #         total_toe += toe_loss
                #         total_sen += sen
                #         total_spe += spe
                #         total_prec += prec
                #         total_reca += reca
                #         total_f1 += f1
                #         total_accu += accu
                #         total_roc += roc
                #     total_cod = total_cod / val_batch
                #     total_toe = total_toe / val_batch
                #     total_sen = total_sen / val_batch
                #     total_spe = total_spe / val_batch
                #     total_prec = total_prec / val_batch
                #     total_reca = total_reca / val_batch
                #     total_f1 = total_f1 / val_batch
                #     total_accu = total_accu / val_batch
                #     total_roc = total_roc / val_batch
                #     assert (total_cod > 0)
                #     assert (total_toe > 0)
                #     assert (total_sen > 0)
                #     assert (total_spe > 0)
                #     assert (total_prec > 0)
                #     assert (total_reca > 0)
                #     assert (total_f1 > 0)
                #     assert (total_accu > 0)
                #     assert (total_roc > 0)
                #     # TODO this validation is not printing correctly. Way too big.
                #     logprint(logfile, "validation. cod: %.10f, toe: %.10f, total: %.10f" %
                #              (total_cod, total_toe, total_cod + beta * total_toe))
                #     logprint(logfile, "sen: %.6f, spe: %.6f, prec: %.6f, recall: %.6f, f1: %.6f, accu: %.6f, roc: %.6f" %
                #              (total_sen, total_spe, total_prec, total_reca, total_f1, total_accu, total_roc))

                if i % save_interval == 0:
                    save_model(computer, optimizer, epoch, i, savestr)
                    print("model saved for epoch", epoch, "input", i)
            else:
                break
        starting_iter = 0
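# out_of_time is assumed to be a wall-clock guard that aborts a run once its
# time budget is spent. A sketch; the start time and budget constant are
# assumptions, not values from the original code:
import time

_RUN_STARTED_AT = time.time()
_TIME_BUDGET_SECONDS = 6 * 3600  # hypothetical budget

def out_of_time_sketch():
    if time.time() - _RUN_STARTED_AT > _TIME_BUDGET_SECONDS:
        raise RuntimeError("time budget exhausted, stopping training")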
def train(computer, optimizer, real_criterion, binary_criterion, train, valid_dl,
          starting_epoch, total_epochs, starting_iter, iter_per_epoch, savestr,
          beta, logfile=False):
    valid_iterator = iter(valid_dl)
    print_interval = 10
    val_interval = 400
    save_interval = int(8000 / param_bs)
    target_dim = None
    rldmax_len = 50
    val_batch = int(val_bat_cons / param_bs)
    running_cod_loss = deque(maxlen=rldmax_len)
    running_toe_loss = deque(maxlen=rldmax_len)
    if logfile:
        open(logfile, 'w').close()
    for name, param in computer.named_parameters():
        logprint(logfile, name)
        logprint(logfile, param.data.shape)

    for epoch in range(starting_epoch, total_epochs):
        for i, (input, target, loss_type) in enumerate(train):
            i = starting_iter + i
            out_of_time()
            if target_dim is None:
                target_dim = target.shape[1]

            if i < iter_per_epoch:
                cod_loss, toe_loss = run_one_patient(computer, input, target, optimizer,
                                                     loss_type, real_criterion,
                                                     binary_criterion, beta)
                total_loss = cod_loss + toe_loss
                running_cod_loss.appendleft(cod_loss)
                running_toe_loss.appendleft(toe_loss)
                if i % print_interval == 0:
                    running_cod = np.mean(running_cod_loss)
                    running_toe = np.mean(running_toe_loss)
                    logprint(logfile,
                             "batch %4d. batch cod: %.5f, toe: %.5f, total: %.5f. "
                             "running cod: %.5f, toe: %.5f, total: %.5f" %
                             (i, cod_loss, toe_loss, cod_loss + beta * toe_loss,
                              running_cod, running_toe, running_cod + beta * running_toe))

                if i % val_interval == 0:
                    total_cod = 0
                    total_toe = 0
                    for _ in range(val_batch):
                        # we should consider running validation multiple times and average. TODO
                        # separate names avoid clobbering the training batch
                        try:
                            (vinput, vtarget, vloss_type) = next(valid_iterator)
                        except StopIteration:
                            valid_iterator = iter(valid_dl)
                            (vinput, vtarget, vloss_type) = next(valid_iterator)
                        cod_loss, toe_loss = run_one_patient(computer, vinput, vtarget,
                                                             optimizer, vloss_type,
                                                             real_criterion, binary_criterion,
                                                             beta, validate=True)
                        total_cod += cod_loss
                        total_toe += toe_loss
                    total_cod = total_cod / val_batch
                    total_toe = total_toe / val_batch
                    # TODO this validation is not printing correctly. Way too big.
                    logprint(logfile, "validation. cod: %.10f, toe: %.10f, total: %.10f" %
                             (total_cod, total_toe, total_cod + beta * total_toe))

                if i % save_interval == 0:
                    save_model(computer, optimizer, epoch, i, savestr)
                    print("model saved for epoch", epoch, "input", i)
            else:
                break
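# save_model is assumed to checkpoint both model and optimizer state so that
# starting_epoch/starting_iter can resume a run. A sketch using torch.save;
# the saves/ directory layout and filename pattern are assumptions:
import os

import torch

def save_model_sketch(computer, optimizer, epoch, iteration, savestr):
    os.makedirs("saves", exist_ok=True)
    path = "saves/%s_epoch_%d_iter_%d.pkl" % (savestr, epoch, iteration)
    torch.save({"model_state": computer.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "epoch": epoch,
                "iteration": iteration}, path)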