def write_fastq(forward_path, reverse_path, fastq_reads):
  """Write forward and reverse reads into a pair of FASTQ files.

  Args:
    forward_path: path to which to save forward reads.
    reverse_path: path to which to save reverse reads.
    fastq_reads: iterable of Read objects.
  """
  with gfile.Open(forward_path, "w") as forward:
    with gfile.Open(reverse_path, "w") as reverse:
      for read in fastq_reads:
        comp_sequence = dna.reverse_complement(read.sequence)
        forward.write("@%s\n%s\n+\n%s\n" %
                      (read.title, read.sequence, read.quality))
        reverse.write("@%s\n%s\n+\n%s\n" %
                      (read.title, comp_sequence, read.quality[::-1]))
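
# A minimal usage sketch of write_fastq (illustrative only): the namedtuple
# below is a hypothetical stand-in for the project's Read type, and the output
# paths are placeholders. It assumes gfile and dna are imported as in the rest
# of this module.


def _example_write_fastq_usage():
  """Illustrative only: writes one read pair to a pair of FASTQ files."""
  import collections
  read_type = collections.namedtuple('Read', ['title', 'sequence', 'quality'])
  reads = [read_type(title='read_0', sequence='ACGTACGT', quality='IIIIIIII')]
  # The forward file gets the sequence as-is; the reverse file gets its
  # reverse complement with the quality string reversed to match.
  write_fastq('/tmp/example_1.fastq', '/tmp/example_2.fastq', reads)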
def history_of_iter_dir(iter_dir, can_write_cache=False):
  execution_data_iter_dir = os.path.join(
      iter_dir.replace('results', 'execution_data'), 'eval')
  if not gfile.IsDirectory(execution_data_iter_dir):
    return None
  test_acc = None
  test_iter = None
  for events_file in gfile.ListDirectory(execution_data_iter_dir):
    if not events_file.startswith('events.out'):
      continue
    for e in tf.train.summary_iterator(
        os.path.join(execution_data_iter_dir, events_file)):
      for v in e.summary.value:
        if v.tag == 'accuracy' or v.tag == 'top_1_accuracy':
          if test_iter is None or e.step > test_iter:
            test_iter = e.step
            test_acc = v.simple_value
  try:
    with gfile.Open(os.path.join(iter_dir, 'density_ratio')) as f:
      density_ratio = float(f.read())
  except Exception:
    density_ratio = 1.0
  res = IterDatum(
      iter=os.path.basename(iter_dir),
      density_ratio=density_ratio,
      test_acc=test_acc,
  )
  if can_write_cache and test_acc is not None:
    write_iter(iter_dir, density_ratio, test_acc)
    plot_cache = os.path.join(iter_dir, 'plot_cache.pkl')
    with gfile.Open(plot_cache, 'w') as f:
      f.write('')
      f.flush()
    with gfile.Open(plot_cache, 'wb') as f:
      pickle.dump(res, f)
  return res
def update_best_model(path, cur_epoch):
  """Update the records of the model with the lowest validation error.

  Args:
    path: the path to the checkpoint of the current model.
    cur_epoch: an integer of the current epoch.
  """
  cur_checkpoint = path
  cur_checkpoint_meta = six.ensure_str(cur_checkpoint) + '.meta'
  gfile.Copy(cur_train_report, best_train_report, overwrite=True)
  gfile.Copy(cur_valid_report, best_valid_report, overwrite=True)
  gfile.Copy(cur_checkpoint, best_checkpoint, overwrite=True)
  gfile.Copy(cur_checkpoint_meta, best_checkpoint_meta, overwrite=True)
  with gfile.Open(best_epoch_file, 'w') as f:
    f.write(str(cur_epoch) + '\n')
def trial_datum_of_trial(experiment_dir, trial):
  plot_cache = os.path.join(experiment_dir, trial, 'plot_cache.pkl')
  if gfile.Exists(plot_cache):
    with gfile.Open(plot_cache, 'rb') as f:
      return pickle.loads(f.read())
  iter_dirs = sorted(
      iter_dirs_of_trial_dir(os.path.join(experiment_dir, trial)),
      key=lambda x: int(iter_re.match(os.path.basename(x)).group('iter')))
  pool = mp.Pool(5)
  res = TrialDatum(
      trial=trial,
      iter_data=list(
          filter(
              lambda x: x is not None,
              pool.map(
                  iter_datum_of_iter_dir,
                  map(lambda x: (x[1], x[0] < len(iter_dirs) - 1),
                      enumerate(iter_dirs))))),
  )
  pool.close()
  return res
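
# trial_datum_of_trial assumes module-level iter_re and iter_dirs_of_trial_dir
# helpers defined elsewhere. The sketch below shows one plausible
# (hypothetical) shape for the directory-name parsing, matching names like
# 'iter_0012'; the real naming scheme may differ.


def _example_iter_index(iter_dir):
  """Illustrative only: extracts the integer iteration index from a dir name."""
  import re
  match = re.match(r'iter_?(?P<iter>\d+)$', os.path.basename(iter_dir))
  return int(match.group('iter')) if match else None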
def iter_datum_of_iter_dir(iter_dir_and_can_write_cache,
                           verbose=True,
                           ignore_cache=False):
  iter_dir, can_write_cache = iter_dir_and_can_write_cache
  if not ignore_cache:
    res = read_iter(iter_dir)
    if res:
      return IterDatum(
          iter=os.path.basename(res[0]),
          density_ratio=res[1],
          test_acc=res[2],
      )
  plot_cache = os.path.join(iter_dir, 'plot_cache.pkl')
  if gfile.Exists(plot_cache):
    if verbose:
      print('PLOT CACHE EXISTS: {}'.format(plot_cache))
    with gfile.Open(plot_cache, 'rb') as f:
      try:
        it = pickle.loads(f.read())
        write_iter(iter_dir, it.density_ratio, it.test_acc)
        return it
      except:
        gfile.Remove(plot_cache)
  execution_data_iter_dir = os.path.join(
      iter_dir.replace('results', 'execution_data'), 'eval')
  if not gfile.IsDirectory(execution_data_iter_dir):
    return None
  test_acc = None
  test_iter = None
  for events_file in gfile.ListDirectory(execution_data_iter_dir):
    if not events_file.startswith('events.out'):
      continue
    for e in tf.train.summary_iterator(
        os.path.join(execution_data_iter_dir, events_file)):
      for v in e.summary.value:
        if v.tag == 'accuracy' or v.tag == 'top_1_accuracy':
          if test_iter is None or e.step > test_iter:
            test_iter = e.step
            test_acc = v.simple_value
  if verbose:
    print(test_acc)
  try:
    with gfile.Open(os.path.join(iter_dir, 'density_ratio')) as f:
      density_ratio = float(f.read())
  except Exception:
    density_ratio = 1.0
  res = IterDatum(
      iter=os.path.basename(iter_dir),
      density_ratio=density_ratio,
      test_acc=test_acc,
  )
  if can_write_cache and test_acc is not None:
    write_iter(iter_dir, density_ratio, test_acc)
    # with gfile.Open(plot_cache, 'w') as f:
    #   f.write('')
    #   f.flush()
    # with gfile.Open(plot_cache, 'wb') as f:
    #   pickle.dump(res, f)
  return res
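
# iter_datum_of_iter_dir and history_of_iter_dir rely on read_iter/write_iter
# helpers (defined elsewhere) that persist (iter_dir, density_ratio, test_acc)
# per iteration directory. The pair below is a hypothetical, minimal
# file-backed sketch of that contract, not the project's actual cache format.


def _example_write_iter(iter_dir, density_ratio, test_acc):
  """Illustrative only: caches one iteration's values as a tab-separated line."""
  with gfile.Open(os.path.join(iter_dir, 'iter_cache.txt'), 'w') as f:
    f.write('%s\t%f\t%f\n' % (iter_dir, density_ratio, test_acc))


def _example_read_iter(iter_dir):
  """Illustrative only: returns (iter_dir, density_ratio, test_acc) or None."""
  cache_path = os.path.join(iter_dir, 'iter_cache.txt')
  if not gfile.Exists(cache_path):
    return None
  with gfile.Open(cache_path) as f:
    name, density_ratio, test_acc = f.read().strip().split('\t')
  return name, float(density_ratio), float(test_acc)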
def run_training(hps,
                 experiment_proto,
                 train_dir,
                 train_input_paths,
                 val_input_paths,
                 tuner=None,
                 master='',
                 metrics_targets=None,
                 metrics_measures=None):
  """Main training function.

  Trains the model given a directory to write to and a logfile to write to.

  Args:
    hps: tf.HParams with training parameters.
    experiment_proto: selection_pb2.Experiment proto for training.
    train_dir: str path to train directory.
    train_input_paths: List[str] giving paths to input sstables for training.
    val_input_paths: List[str] giving paths to input sstable(s) for validation.
    tuner: optional hp_tuner.HPTuner.
    master: optional string to pass to a tf.Supervisor.
    metrics_targets: String list of network targets to report metrics for.
    metrics_measures: Measurements about the performance of the network to
      report, e.g. 'auc/top_1p'.

  Returns:
    None.

  Raises:
    Error: if the hyperparameter combination in hps is infeasible and there is
      no tuner. (If the hyperparameter combination is infeasible and there is
      a tuner then the params are reported back to the tuner as infeasible.)
  """
  hps_infeasible, infeasible_reason = hps_is_infeasible(
      hps, experiment_proto.sequence_length)
  if hps_infeasible:
    if tuner:
      tuner.report_done(True, infeasible_reason)
      logger.info('report_done(infeasible=%r)', hps_infeasible)
      return
    else:
      raise Error('Hyperparams are infeasible: %s', infeasible_reason)

  logger.info('Starting training.')
  if tuner:
    logger.info('Using tuner: loaded HParams from Vizier')
  else:
    logger.info('No tuner: using default HParams')
  logger.info('experiment_proto: %s', experiment_proto)
  logger.info('train_dir: %s', train_dir)
  logger.info('train_input_paths[0]: %s', train_input_paths[0])
  logger.info('val_input_paths[0]: %s', val_input_paths[0])
  logger.info('%r', list(hps.values()))

  generationinfo.to_file(os.path.join(train_dir, 'geninfo.pbtxt'))
  with gfile.Open(os.path.join(train_dir, config.hparams_name), 'w') as f:
    f.write(str(hps.to_proto()))

  eval_size = hps.eval_size or None

  def make_subdir(subdirectory_name):
    path = os.path.join(train_dir, subdirectory_name)
    gfile.MakeDirs(path)
    return path

  logger.info('Computing preprocessing statistics')

  # TODO(shoyer): move this over into preprocessing instead?
  experiment_proto = dataset_stats.compute_experiment_statistics(
      experiment_proto,
      train_input_paths,
      os.path.join(
          hps.input_dir,
          six.ensure_str(
              config.wetlab_experiment_train_pbtxt_path[hps.val_fold]) +
          '.wstats'),
      preprocess_mode=hps.preprocess_mode,
      max_size=eval_size,
      logdir=make_subdir('compute-statistics'),
      save_stats=hps.save_stats)

  logging.info('Saving experiment proto with statistics')
  with gfile.Open(
      os.path.join(train_dir, config.wetlab_experiment_train_name), 'w') as f:
    f.write(str(experiment_proto))

  logger.debug(str(hps.to_proto()))
  logger.debug(hps.run_name)

  tr_entries = len(sstable.MergedSSTable(train_input_paths))
  logger.info('Training sstable size: %d', tr_entries)
  val_entries = len(sstable.MergedSSTable(val_input_paths))
  logger.info('Validation sstable size: %d', val_entries)

  epoch_size = hps.epoch_size or int(tr_entries * (1 + hps.ratio_random_dna))
  num_batches_per_epoch = int(float(epoch_size) / hps.mbsz)

  eval_ff.config_pandas_display(FLAGS.interactive_display)
  tr_evaluator = eval_ff.Evaluator(
      hps,
      experiment_proto,
      train_input_paths,
      make_subdir(config.experiment_training_dir),
      verbose=FLAGS.verbose_eval)
  val_evaluator = eval_ff.Evaluator(
      hps,
      experiment_proto,
      val_input_paths,
      make_subdir(config.experiment_validation_dir),
      verbose=FLAGS.verbose_eval)

  with tf.Graph().as_default():
    # we need to use the registered key 'hparams'
    tf.add_to_collection('hparams', hps)

    # TODO(shoyer): collect these into a Model class:
    dummy_inputs = data.dummy_inputs(
        experiment_proto,
        input_features=hps.input_features,
        kmer_k_max=hps.kmer_k_max,
        additional_output=six.ensure_str(hps.additional_output).split(','))
    output_layer = output_layers.create_output_layer(experiment_proto, hps)
    net = ff.FeedForward(dummy_inputs, output_layer.logit_axis, hps)

    trainer = FeedForwardTrainer(hps, net, output_layer, experiment_proto,
                                 train_input_paths)

    summary_writer = tf.summary.FileWriter(make_subdir('training'),
                                           flush_secs=30)

    # TODO(shoyer): file a bug to figure out why write_version=2 (now the
    # default) doesn't work.
    saver = tf.train.Saver(write_version=1)

    # We are always the chief since we do not do distributed training.
    # Every replica with a different task id is completely independent and all
    # must be their own chief.
    sv = tf.train.Supervisor(
        logdir=train_dir,
        is_chief=True,
        summary_writer=summary_writer,
        save_summaries_secs=10,
        save_model_secs=180,
        saver=saver)

    logger.info('Preparing session')

    train_report_dir = os.path.join(train_dir, config.experiment_training_dir)
    cur_train_report = os.path.join(train_report_dir,
                                    config.experiment_report_name)
    best_train_report = os.path.join(train_report_dir,
                                     config.experiment_best_report_name)
    valid_report_dir = os.path.join(train_dir,
                                    config.experiment_validation_dir)
    cur_valid_report = os.path.join(valid_report_dir,
                                    config.experiment_report_name)
    best_valid_report = os.path.join(valid_report_dir,
                                     config.experiment_best_report_name)
    best_checkpoint = os.path.join(train_dir, 'model.ckpt-lowest_val_loss')
    best_checkpoint_meta = best_checkpoint + '.meta'
    best_epoch_file = os.path.join(train_dir, 'best_epoch.txt')

    with sv.managed_session(master) as sess:
      logger.info('Starting queue runners')
      sv.start_queue_runners(sess)

      def save_and_evaluate():
        """Save and evaluate the current model.

        Returns:
          path: the path string to the checkpoint.
          summary_df: pandas.DataFrame storing the evaluation result on the
            validation dataset with rows for each output name and columns for
            each metric value.
        """
        logger.info('Saving model checkpoint')
        path = sv.saver.save(
            sess,
            sv.save_path,
            global_step=sv.global_step,
            write_meta_graph=True)
        tr_evaluator.run(path, eval_size)
        summary_df, _ = val_evaluator.run_and_report(
            tuner,
            path,
            eval_size,
            metrics_targets=metrics_targets,
            metrics_measures=metrics_measures)
        return path, summary_df

      def update_best_model(path, cur_epoch):
        """Update the records of the model with the lowest validation error.

        Args:
          path: the path to the checkpoint of the current model.
          cur_epoch: an integer of the current epoch.
        """
        cur_checkpoint = path
        cur_checkpoint_meta = six.ensure_str(cur_checkpoint) + '.meta'
        gfile.Copy(cur_train_report, best_train_report, overwrite=True)
        gfile.Copy(cur_valid_report, best_valid_report, overwrite=True)
        gfile.Copy(cur_checkpoint, best_checkpoint, overwrite=True)
        gfile.Copy(cur_checkpoint_meta, best_checkpoint_meta, overwrite=True)
        with gfile.Open(best_epoch_file, 'w') as f:
          f.write(str(cur_epoch) + '\n')

      def compare_with_best_model(checkpoint_path, summary_df, cur_epoch):
        logger.info('Comparing current val loss with the best model')
        if not gfile.Exists(best_train_report):
          logger.info('No best model saved. Adding current model...')
          update_best_model(checkpoint_path, cur_epoch)
        else:
          with gfile.GFile(best_valid_report) as f:
            with xarray.open_dataset(f) as best_ds:
              best_ds.load()
          cur_loss = summary_df['loss'].loc['mean']
          best_loss = best_ds['loss'].mean('output')
          logger.info('Current val loss: %f', cur_loss)
          logger.info('The best val loss: %f', best_loss)
          if cur_loss < best_loss:
            logger.info(
                'Current model has lower loss. Updating the best model.')
            update_best_model(checkpoint_path, cur_epoch)
          else:
            logger.info('The best model has lower loss.')

      logger.info('Running eval before starting training')
      save_and_evaluate()

      try:
        for cur_epoch in trainer.train(sess, hps.epochs,
                                       num_batches_per_epoch):
          checkpoint_path, val_summary_df = save_and_evaluate()
          if (cur_epoch + 1) % hps.epoch_interval_to_save_best == 0:
            compare_with_best_model(checkpoint_path, val_summary_df, cur_epoch)
          if tuner and tuner.should_trial_stop():
            break
      except eval_ff.TrainingDivergedException as error:
        logger.error('Training diverged: %s', str(error))
        infeasible = True
      else:
        infeasible = False

      logger.info('Saving final checkpoint')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

  if tuner:
    # Should be at the very end of execution, to avoid possible race
    # conditions.
    tuner.report_done(infeasible=infeasible)
    logger.info('report_done(infeasible=%r)', infeasible)

  logger.info('Done.')
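
# After training, the artifacts recorded by update_best_model can be read back
# from train_dir. A minimal sketch (the helper name is hypothetical; paths
# follow the layout defined in run_training above):


def _example_load_best_epoch(train_dir):
  """Illustrative only: returns the best epoch index, or None if not recorded.

  The corresponding checkpoint lives at
  os.path.join(train_dir, 'model.ckpt-lowest_val_loss'), with its '.meta' file
  alongside it.
  """
  best_epoch_file = os.path.join(train_dir, 'best_epoch.txt')
  if not gfile.Exists(best_epoch_file):
    return None
  with gfile.Open(best_epoch_file) as f:
    return int(f.read().strip())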