def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Plot graphs of training loss',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--mav', default=None, type=int,
        # Bug fix: the two concatenated fragments had no separating space and
        # rendered as "loss.e.g --mav" in --help output.
        help='Moving average window applied to batchlog loss. ' +
        'e.g --mav 10 visually separates loss curves')
    parser.add_argument(
        '--upper_y_limit', default=None, type=Positive(float),
        help='Upper limit of plot y(loss) axis')
    # NOTE(review): Positive(float) on the *lower* limits below rejects zero
    # and negative axis minima -- confirm that is intended.
    parser.add_argument(
        '--lower_y_limit', default=None, type=Positive(float),
        help='Lower limit of plot y(loss) axis')
    parser.add_argument(
        '--upper_x_limit', default=None, type=Positive(float),
        help='Upper limit of plot x(iterations) axis')
    parser.add_argument(
        '--lower_x_limit', default=None, type=Positive(float),
        help='Lower limit of plot x(iterations) axis')
    # Positional arguments: destination image and input directories.
    parser.add_argument(
        'output', help='Output png file')
    parser.add_argument(
        'input_directories', nargs='+',
        help='One or more directories containing {} and {} files'.format(
            BATCH_LOG_FILENAME, VAL_LOG_FILENAME))
    return parser
def get_parser():
    """Build the command-line parser for the basecalling script.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description="Basecall reads using a taiyaki model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Options shared with other taiyaki scripts (device, I/O, strand lists,
    # ...) are installed by the common helper so defaults stay consistent.
    add_common_command_args(
        parser, """alphabet device input_folder input_strand_list jobs limit output quiet recursive version""".split())
    parser.add_argument(
        '--beam', default=None, metavar=('width', 'guided'), nargs=2,
        # NOTE(review): bool('False') is True for plain argparse conversion;
        # assumes ParseToNamedTuple applies these per-field types sensibly
        # for the "guided" flag -- verify against its implementation.
        type=(int, bool), action=ParseToNamedTuple,
        help='Use beam search decoding')
    parser.add_argument(
        "--chunk_size", type=Positive(int), metavar="blocks",
        default=basecall_helpers._DEFAULT_CHUNK_SIZE,
        help="Size of signal chunks sent to GPU is chunk_size * model stride")
    parser.add_argument(
        '--fastq', default=False, action=AutoBool,
        help='Write output in fastq format (default is fasta)')
    parser.add_argument(
        "--max_concurrent_chunks", type=Positive(int), default=128,
        help="Maximum number of chunks to call at "
        "once. Lower values will consume less (GPU) RAM.")
    parser.add_argument(
        "--overlap", type=NonNegative(int), metavar="blocks",
        default=basecall_helpers._DEFAULT_OVERLAP,
        help="Overlap between signal chunks sent to GPU")
    parser.add_argument(
        '--posterior', default=True, action=AutoBool,
        help='Use posterior-viterbi decoding')
    parser.add_argument(
        "--qscore_offset", type=float, default=0.0,
        help="Offset to apply to q scores in fastq (after scale)")
    parser.add_argument(
        "--qscore_scale", type=float, default=1.0,
        help="Scaling factor to apply to q scores in fastq")
    parser.add_argument(
        '--reverse', default=False, action=AutoBool,
        help='Reverse sequences in output')
    parser.add_argument(
        '--scaling', action=FileExists, default=None,
        help='Path to TSV containing per-read scaling params')
    parser.add_argument(
        '--temperature', default=1.0, type=float,
        help='Scaling factor applied to network outputs before decoding')
    # Positional argument: the trained model checkpoint to basecall with.
    parser.add_argument(
        "model", action=FileExists,
        help="Model checkpoint file to use for basecalling")
    return parser
def get_parser():
    """Build the command-line parser for plotting reference-to-signal maps.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Plot reference-to-signal maps from mapped signal ' +
        'files. Also dump one-line summary of each read to stdout',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--output', help='Output PNG filename. ' +
        'Default: only output per-read summaries.')
    parser.add_argument(
        '--maxlegendsize', type=Positive(int), default=10,
        help='Maximum number of reads to list in the legend.')
    parser.add_argument(
        '--nreads', type=Positive(int), default=10,
        help='Max number of reads to read from each file. Not used if ' +
        'read_ids are given')
    parser.add_argument(
        '--read_ids', nargs='+', default=[],
        help='One or more read_ids. If not present, plots the first ' +
        '[--nreads] in each file')
    parser.add_argument(
        '--xmin', default=None, type=float, help='Minimum x for plot')
    parser.add_argument(
        '--xmax', default=None, type=float, help='Maximum x for plot')
    # Bug fix: --ymin/--ymax help previously said "Minimum/Maximum x"
    # (copy-paste from the x-axis options above).
    parser.add_argument(
        '--ymin', default=None, type=float, help='Minimum y for plot')
    parser.add_argument(
        '--ymax', default=None, type=float, help='Maximum y for plot')
    parser.add_argument(
        '--line_transparency', type=float, default=1.0,
        help='Transparency value for lines. Default: %(default)f')
    parser.add_argument(
        '--zero_signal_start', action='store_true',
        help='Start signal locations at zero. Default: start at ' +
        'assigned position within entire read.')
    parser.add_argument(
        '--quiet', action='store_true',
        help='Do not display status messages.')
    # Positional arguments: the mapped-signal input files to plot.
    parser.add_argument(
        'mapped_signal_files', nargs='+',
        help='Inputs: one or more mapped signal files')
    return parser
def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Split a strand list into a number of smaller strand ' +
        'lists, or alternatively do the same thing starting with a ' +
        'directory containing fast5s.')
    parser.add_argument('--maxlistsize', default=10000, type=Positive(int),
                        help='Maximum size for a strand list')
    parser.add_argument(
        # Bug fix: default was 10000 (an int, copy-pasted from --maxlistsize),
        # which would produce output files named "10000_000.txt". The help
        # text documents falling back to the input name when the option is
        # absent, i.e. the default should be None.
        '--outputbase', default=None,
        help='Strand lists will be saved as <outputbase>_000.txt etc. If ' +
        'outputbase not present then the input will be used as the base name.')
    parser.add_argument('input',
                        help='either a strand list file or a directory name')
    return parser
def get_parser():
    """Build the command-line parser for flip-flop network training.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Train a flip-flop neural network',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Shared training options come from the common helper.
    add_common_command_args(
        parser, """adam alphabet device eps limit niteration outdir overwrite quiet save_every version weight_decay""".split())
    parser.add_argument('--batch_size', default=128, metavar='chunks',
                        type=Positive(int),
                        help='Number of chunks to run in parallel')
    parser.add_argument(
        '--gradient_cap_fraction', default=0.05, metavar='f',
        type=Maybe(NonNegative(float)),
        help='Cap L2 norm of gradient so that a fraction f of gradients ' +
        'are capped. Use --gradient_cap_fraction None for no capping.')
    parser.add_argument('--lr_max', default=4.0e-3, metavar='rate',
                        type=Positive(float), help='Initial learning rate')
    parser.add_argument('--size', default=96, metavar='neurons',
                        type=Positive(int), help='Base layer size for model')
    parser.add_argument('--seed', default=None, metavar='integer',
                        type=Positive(int), help='Set random number seed')
    parser.add_argument('--stride', default=2, metavar='samples',
                        type=Positive(int), help='Stride for model')
    parser.add_argument('--winlen', default=19, type=Positive(int),
                        help='Length of window over data')
    # Positional arguments: model description plus training-data files.
    parser.add_argument('model', action=FileExists,
                        help='File to read python model description from')
    parser.add_argument('chunks', action=FileExists,
                        help='file containing chunks')
    parser.add_argument('reference', action=FileExists,
                        help='file containing fasta reference')
    return parser
def get_parser():
    """Construct the argument parser for the accuracy-histogram plot script.

    Returns:
        :argparse:`ArgumentParser` : parser accepting one combined read file
        plus optional bin count, figure title and output file name.
    """
    p = argparse.ArgumentParser(
        description='Plot an accuracy histogram from a combined read file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Positional input first, then the optional presentation knobs.
    p.add_argument('combined_read_file', action=FileExists,
                   help='Combined read file to get data from')
    p.add_argument('--bins', type=Positive(int), default=100,
                   help='Number of bins for histogram')
    p.add_argument('--title', default='', help='Figure title')
    p.add_argument('--output_name', default='basecaller_histogram.png',
                   help='Output file name')
    return p
def get_train_flipflop_parser():
    """Build the command-line parser for flip-flop network training.

    Arguments are organised into argparse groups (model, training, data,
    compute, output, modified-base, miscellaneous) so --help is structured.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Train flip-flop neural network',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Model architecture hyper-parameters.
    mdl_grp = parser.add_argument_group('Model Arguments')
    mdl_grp.add_argument(
        '--size', default=384, metavar='neurons', type=Positive(int),
        help='Base layer size for model')
    mdl_grp.add_argument(
        '--stride', default=5, metavar='samples', type=Positive(int),
        help='Stride for model')
    mdl_grp.add_argument(
        '--winlen', default=19, type=Positive(int),
        help='Length of window over data')

    # Optimiser and learning-rate schedule.
    trn_grp = parser.add_argument_group('Training Arguments')
    trn_grp.add_argument(
        '--adam', nargs=2, metavar=('beta1', 'beta2'),
        default=[0.9, 0.999], type=NonNegative(float),
        help='Parameters beta1, beta2 for Exponential Decay ' +
        'Adaptive Momentum')
    trn_grp.add_argument(
        '--eps', default=1e-6, metavar='adjustment', type=Positive(float),
        help='Small value to stabilise optimiser')
    trn_grp.add_argument(
        '--niteration', metavar='batches', type=Positive(int),
        default=150000, help='Maximum number of batches to train for')
    trn_grp.add_argument(
        '--weight_decay', default=0.01, metavar='penalty',
        type=NonNegative(float),
        help='Adam weight decay (L2 normalisation penalty)')
    trn_grp.add_argument(
        '--gradient_clip_num_mads', default=0, metavar='num_MADs',
        type=Maybe(NonNegative(float)),
        help='Clip gradients (by value) at num_MADs above the median of ' +
        'the last 1000 parameter gradient maximums. Gradient threshold ' +
        'values are computed for each parameter group independently. Use ' +
        '"--gradient_clip_num_mads None" for no clipping.')
    trn_grp.add_argument(
        '--lr_max', default=4.0e-3, metavar='rate', type=Positive(float),
        help='Max learning rate, reached at --warmup_batches iterations.')
    trn_grp.add_argument(
        '--lr_min', default=1.0e-4, metavar='rate', type=Positive(float),
        help='Min (starting and final) learning rate')
    trn_grp.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    trn_grp.add_argument(
        '--sharpen', default=(1.0, 1.0, 25000), nargs=3,
        metavar=('min', 'max', 'niter'), action=ParseToNamedTuple,
        type=(Positive(float), Positive(float), Positive(int)),
        help='Increase sharpening factor linearly from "min" to ' +
        '"max" over "niter" iterations')
    trn_grp.add_argument(
        '--warmup_batches', type=int, default=200,
        help='Over first n batches, increase learning rate like cosine.')
    trn_grp.add_argument(
        '--lr_warmup', metavar='rate', type=Positive(float),
        help='Start learning rate for warmup. Defaults to lr_min.')
    trn_grp.add_argument(
        '--min_momentum', type=Positive(float),
        help='Min momentum in cycling. default = Adam beta1, no cycling')

    # Chunk selection and filtering of the training data.
    data_grp = parser.add_argument_group('Data Arguments')
    data_grp.add_argument(
        '--filter_max_dwell', default=10.0, metavar='multiple',
        type=Maybe(Positive(float)),
        help='Drop chunks with max dwell more than multiple of median ' +
        '(over chunks)')
    data_grp.add_argument(
        '--filter_mean_dwell', default=3.0, metavar='radius',
        type=Maybe(Positive(float)),
        help='Drop chunks with mean dwell more than radius deviations ' +
        'from the median (over chunks)')
    data_grp.add_argument(
        '--filter_min_pass_fraction', default=0.5, metavar='fraction',
        type=Maybe(Positive(float)),
        help='Halt if fraction of chunks passing tests is less than this')
    data_grp.add_argument(
        '--filter_path_buffer', default=1.1, metavar='ratio',
        type=Bounded(float, lower=1.0),
        help='Drop chunks with small ratio of signal length to bases * ' +
        'model stride, which would restrict potential CTC paths. Must be ' +
        'greater than 1.0.')
    data_grp.add_argument(
        '--limit', default=None, type=Maybe(Positive(int)),
        help='Limit number of reads to process')
    data_grp.add_argument(
        '--reverse', default=False, action=AutoBool,
        help='Reverse input sequence and current')
    data_grp.add_argument(
        '--sample_nreads_before_filtering', metavar='n',
        type=NonNegative(int), default=100000,
        help='Sample n reads to decide on bounds for filtering before ' +
        'training. Set to 0 to do all.')
    data_grp.add_argument(
        '--chunk_len_min', default=3000, metavar='samples',
        type=Positive(int),
        help='Min length of each chunk in samples (chunk lengths are ' +
        'random between min and max)')
    data_grp.add_argument(
        '--chunk_len_max', default=8000, metavar='samples',
        type=Positive(int),
        help='Max length of each chunk in samples (chunk lengths are ' +
        'random between min and max)')
    data_grp.add_argument(
        '--include_reporting_strands', default=False, action=AutoBool,
        help='Include reporting strands in training. Default: Hold ' +
        'training strands out of training.')
    data_grp.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing column read_id. Filenames in ' +
        'file are ignored.')
    data_grp.add_argument(
        '--min_sub_batch_size', default=128, metavar='chunks',
        type=Positive(int),
        help='Number of chunks to run in parallel per sub-batch for ' +
        'chunk_len = chunk_len_max. Actual length of sub-batch used is ' +
        '(min_sub_batch_size * chunk_len_max / chunk_len).')
    data_grp.add_argument(
        '--reporting_percent_reads', default=1, metavar='sub_batches',
        type=Positive(float),
        help='Percent of reads to use for std loss reporting')
    data_grp.add_argument(
        '--reporting_strand_list', action=FileExists,
        help='Strand summary file containing column read_id. All other ' +
        'fields are ignored. If not provided reporting strands will be ' +
        'randomly selected.')
    data_grp.add_argument(
        '--reporting_sub_batches', default=100, metavar='sub_batches',
        type=Positive(int),
        help='Number of sub-batches to use for std loss reporting')
    data_grp.add_argument(
        '--standardize', default=True, action=AutoBool,
        help='Standardize currents for each read')
    data_grp.add_argument(
        '--sub_batches', default=1, metavar='sub_batches',
        type=Positive(int), help='Number of sub-batches per batch')

    # Device selection / distributed-training plumbing.
    cmp_grp = parser.add_argument_group('Compute Arguments')
    cmp_grp.add_argument(
        '--device', default='cpu', action=DeviceAction,
        help='Integer specifying which GPU to use, or "cpu" to use CPU only. '
        'Other accepted formats: "cuda" (use default GPU), "cuda:2" '
        'or "cuda2" (use GPU 2).')
    # Argument local_rank is used only when the script is run in multi-GPU
    # mode using torch.distributed.launch. See the README.
    cmp_grp.add_argument(
        '--local_rank', type=int, default=None, help=argparse.SUPPRESS)

    out_grp = parser.add_argument_group('Output Arguments')
    out_grp.add_argument(
        '--full_filter_status', default=False, action=AutoBool,
        help='Output full chunk filtering statistics. Default: only ' +
        'proportion of filtered chunks.')
    out_grp.add_argument(
        '--outdir', default='training',
        help='Output directory, created when run.')
    out_grp.add_argument(
        '--overwrite', default=False, action=AutoBool,
        help='Whether to overwrite any output files')
    out_grp.add_argument(
        '--quiet', default=False, action=AutoBool,
        help="Don't print progress information to stdout")
    out_grp.add_argument(
        '--save_every', metavar='x', type=Positive(int), default=2500,
        help='Save model every x batches')

    # Modified-base training options.
    mod_grp = parser.add_argument_group('Modified Base Arguments')
    mod_grp.add_argument(
        '--mod_factor', default=(8.0, 1.0, 50000), nargs=3,
        metavar=('start', 'final', 'niter'), action=ParseToNamedTuple,
        type=(Positive(float), Positive(float), Positive(int)),
        help='Relative weight applied to modified base transitions in ' +
        'loss/gradient compared to canonical transitions. Larger values ' +
        'increase the effective modified base learning rate. Scale factor ' +
        'linearly from "start" to "final" over first "niter" iterations')
    mod_grp.add_argument(
        '--mod_prior_factor', type=float,
        help='Exponential factor applied to prior mod weights estimated ' +
        'from training data. Intended to balance modified base scores. ' +
        'Default: no mod prior')
    mod_grp.add_argument(
        '--num_mod_weight_reads', type=int, default=5000,
        help='Number of reads to sample to compute the modified base prior ' +
        'weights from the training data.')

    misc_grp = parser.add_argument_group('Miscellaneous Arguments')
    misc_grp.add_argument(
        '--version', nargs=0, action=display_version_and_exit,
        metavar=__version__, help='Display version information.')

    # Positional arguments: model definition/checkpoint and mapped-read data.
    parser.add_argument(
        'model', action=FileExists,
        help='File to read python model (or checkpoint) from')
    parser.add_argument(
        'input', action=FileExists, help='file containing mapped reads')
    return parser
# Module-level parser for the flip-flop training script.
# NOTE(review): this chunk is truncated -- the final add_argument call below
# is cut off mid-statement and the remainder of the script is not visible.
parser = argparse.ArgumentParser(
    description='Train a flip-flop neural network',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Shared options (optimiser, device, filtering, logging, ...) are installed
# by the common helper so defaults stay consistent across training scripts.
add_common_command_args(
    parser, """adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell limit lr_cosine_iters niteration overwrite quiet save_every sample_nreads_before_filtering version weight_decay""".split())
parser.add_argument(
    '--chunk_len_min', default=2000, metavar='samples', type=Positive(int),
    help='Min length of each chunk in samples (chunk lengths are random between min and max)')
parser.add_argument(
    '--chunk_len_max', default=4000, metavar='samples', type=Positive(int),
    help='Max length of each chunk in samples (chunk lengths are random between min and max)')
parser.add_argument(
    '--input_strand_list', default=None, action=FileExists,
# Module-level parser for the squiggle (current-level) model trainer.
# NOTE(review): this chunk is truncated -- the final add_argument call below
# is cut off mid-statement and the remainder of the script is not visible.
parser = argparse.ArgumentParser(
    description='Train a model to predict ionic current levels from sequence',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Shared options are installed by the common helper for consistency across
# training scripts.
add_common_command_args(
    parser, """adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell limit niteration overwrite quiet save_every sample_nreads_before_filtering version weight_decay""".split())
parser.add_argument('--batch_size', default=100, metavar='chunks',
                    type=Positive(int),
                    help='Number of chunks to run in parallel')
parser.add_argument('--back_prob', default=1e-15, metavar='probability',
                    type=proportion, help='Probability of backwards move')
parser.add_argument('--depth', metavar='layers', default=4,
                    type=Positive(int),
                    help='Number of residual convolution layers')
parser.add_argument(
    '--drop_slip', default=5, type=Maybe(Positive(int)),
from taiyaki import ctc, flipflopfings, helpers
from taiyaki.cmdargs import FileExists, Positive
from taiyaki.common_cmdargs import add_common_command_args

# This is here, not in main, to allow documentation to be built.
parser = argparse.ArgumentParser(
    description='Train a flip-flop neural network',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Shared options (optimiser, device, output handling, ...) from the helper.
add_common_command_args(parser, """adam alphabet device eps limit niteration outdir overwrite quiet save_every version""".split())
parser.add_argument('--batch_size', default=128, metavar='chunks',
                    type=Positive(int),
                    help='Number of chunks to run in parallel')
parser.add_argument(
    '--lr_max', default=4.0e-3, metavar='rate',
    type=Positive(float), help='Initial learning rate')
parser.add_argument('--size', default=96, metavar='neurons',
                    type=Positive(int), help='Base layer size for model')
parser.add_argument('--seed', default=None, metavar='integer',
                    type=Positive(int), help='Set random number seed')
parser.add_argument('--stride', default=2, metavar='samples',
                    type=Positive(int), help='Stride for model')
parser.add_argument('--winlen', default=19, type=Positive(int),
                    help='Length of window over data')
# Positional arguments.
parser.add_argument('model', action=FileExists,
                    help='File to read python model description from')
parser.add_argument('chunks', action=FileExists,
                    help='file containing chunks')
# NOTE(review): chunk appears truncated here -- any further positional
# arguments (e.g. a reference file) are not visible.
def get_parser():
    """Build the command-line parser for squiggle-model training.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Train a model to predict ionic current levels ' +
        'from sequence',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Shared options from the common helper.
    add_common_command_args(
        parser, """adam device eps filter_max_dwell filter_mean_dwell limit niteration outdir overwrite quiet reverse save_every sample_nreads_before_filtering version weight_decay""".split())
    parser.add_argument('--batch_size', default=100, metavar='chunks',
                        type=Positive(int),
                        help='Number of chunks to run in parallel')
    parser.add_argument('--back_prob', default=1e-15, metavar='probability',
                        type=proportion, help='Probability of backwards move')
    parser.add_argument('--depth', metavar='layers', default=4,
                        type=Positive(int),
                        help='Number of residual convolution layers')
    parser.add_argument(
        '--drop_slip', default=5, type=Maybe(Positive(int)),
        metavar='length',
        help='Drop chunks with slips greater than given length (None = off)')
    # NOTE(review): the flip-flop trainer uses Bounded(float, lower=1.0) for
    # this option; plain float here accepts values <= 1.0 -- confirm intended.
    parser.add_argument(
        '--filter_path_buffer', default=1.1, metavar='ratio', type=float,
        help='Drop chunks with small ratio of signal length to bases * ' +
        'model stride, which would restrict potential CTC paths.')
    parser.add_argument(
        '--filter_min_pass_fraction', default=0.5, metavar='fraction',
        type=Maybe(Positive(float)),
        help='Halt if fraction of chunks passing tests is less than this')
    parser.add_argument('--full_filter_status', default=False,
                        action=AutoBool,
                        help='Output full chunk filtering statistics. ' +
                        'Default: only proportion of filtered chunks.')
    parser.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing column read_id. Filenames in ' +
        'file are ignored.')
    parser.add_argument(
        '--lr_decay', default=5000, metavar='n', type=Positive(float),
        help='Learning rate for batch i is lr_max / (1.0 + i / n)')
    parser.add_argument('--lr_max', default=1.0e-4, metavar='rate',
                        type=Positive(float),
                        help='Max (and starting) learning rate')
    parser.add_argument('--sd', default=0.5, metavar='value',
                        type=Positive(float),
                        help='Standard deviation to initialise with')
    parser.add_argument('--seed', default=None, metavar='integer',
                        type=Positive(int), help='Set random number seed')
    parser.add_argument('--size', metavar='n', default=32,
                        type=Positive(int),
                        help='Size of layers in convolution network')
    parser.add_argument('--target_len', metavar='n', default=300,
                        type=Positive(int),
                        help='Target length of sequence')
    parser.add_argument('--winlen', metavar='n', default=9,
                        type=Positive(int),
                        help='Window for convolution network')
    # Positional argument: the mapped-read training data.
    parser.add_argument('input', action=FileExists,
                        help='HDF5 file containing mapped reads')
    return parser
from taiyaki.maths import med_mad
from taiyaki.signal import Signal

# Module-level switch; presumably controls whether chunk basecalls are
# stitched together before Viterbi decoding -- its consumers are not visible
# in this chunk, so verify before relying on this description.
STITCH_BEFORE_VITERBI = False

# Parser built at module level (pattern used across these scripts so the
# documentation tooling can import it).
parser = argparse.ArgumentParser(
    description="Basecall reads using a taiyaki model",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
add_common_command_args(parser, 'device input_folder input_strand_list limit output quiet recursive version'.split())
parser.add_argument("--alphabet", default=DEFAULT_ALPHABET,
                    help="Alphabet used by basecaller")
parser.add_argument("--chunk_size", type=Positive(int),
                    default=basecall_helpers._DEFAULT_CHUNK_SIZE,
                    help="Size of signal chunks sent to GPU")
parser.add_argument("--overlap", type=NonNegative(int),
                    default=basecall_helpers._DEFAULT_OVERLAP,
                    help="Overlap between signal chunks sent to GPU")
parser.add_argument("--modified_base_output", action=FileAbsent,
                    default=None,
                    help="Output filename for modified base output.")
parser.add_argument("model", action=FileExists,
                    help="Model checkpoint file to use for basecalling")


def med_mad_norm(x, dtype='f4'):
    """ Normalise a numpy array using median and MAD """
    # Centre on the median and scale by the median absolute deviation.
    med, mad = med_mad(x)
    normed_x = (x - med) / mad
    # NOTE(review): chunk truncated here -- the return statement (presumably
    # `return normed_x`, possibly cast to `dtype`; the parameter is unused in
    # the visible portion) is not shown.
def add_common_command_args(parser, arglist):
    """Add common command-line arguments to a parser.

    Given an argparse parser object and a list of keys such as
    ['input_strand_list', 'jobs'], add these command line args to the parser.

    Note that not all command line args used in the package are included in
    this func: only those that are used by more than one script and which
    have the same defaults.

    Also note that some args are positional and some are optional.
    The optional ones are listed first below.

    Args:
        parser: argparse.ArgumentParser to extend in place.
        arglist: iterable of argument names (without leading dashes) to add.
    """

    ############################################################################
    #
    # Optional arguments
    #
    ############################################################################

    if 'adam' in arglist:
        parser.add_argument(
            '--adam', nargs=3, metavar=('rate', 'decay1', 'decay2'),
            default=(1e-3, 0.9, 0.999),
            type=(NonNegative(float), NonNegative(float), NonNegative(float)),
            action=ParseToNamedTuple,
            # Bug fix: corrected "Momementum" typo in the user-visible help
            # text (matches the spelling used elsewhere in the package).
            help='Parameters for Exponential Decay Adaptive Momentum')

    if 'chunk_logging_threshold' in arglist:
        parser.add_argument(
            '--chunk_logging_threshold', default=10.0, metavar='multiple',
            type=NonNegative(float),
            help='If loss > (threshold * smoothed loss) for a batch, then '
            'log chunks to output/chunklog.tsv. Set to zero to log all, '
            'including rejected chunks')

    if 'device' in arglist:
        parser.add_argument(
            '--device', default='cpu', action=DeviceAction,
            help='Integer specifying which GPU to use, or "cpu" to use CPU '
            'only. Other accepted formats: "cuda" (use default GPU), '
            '"cuda:2" or "cuda2" (use GPU 2).')

    if 'filter_max_dwell' in arglist:
        parser.add_argument(
            '--filter_max_dwell', default=10.0, metavar='multiple',
            type=Maybe(Positive(float)),
            help='Drop chunks with max dwell more than multiple of median '
            '(over chunks)')

    if 'filter_mean_dwell' in arglist:
        parser.add_argument(
            '--filter_mean_dwell', default=3.0, metavar='radius',
            type=Maybe(Positive(float)),
            help='Drop chunks with mean dwell more than radius deviations '
            'from the median (over chunks)')

    if 'input_strand_list' in arglist:
        parser.add_argument('--input_strand_list', default=None,
                            action=FileExists,
                            help='Strand summary file containing subset')

    if 'jobs' in arglist:
        parser.add_argument(
            '--jobs', default=1, metavar='n', type=Positive(int),
            help='Number of threads to use when processing data')

    if 'limit' in arglist:
        parser.add_argument('--limit', default=None,
                            type=Maybe(Positive(int)),
                            help='Limit number of reads to process')

    if 'lrdecay' in arglist:
        parser.add_argument(
            '--lrdecay', default=5000, metavar='n', type=Positive(float),
            help='Learning rate for batch i is adam.rate / (1.0 + i / n)')

    if 'niteration' in arglist:
        parser.add_argument('--niteration', metavar='batches',
                            type=Positive(int), default=50000,
                            help='Maximum number of batches to train for')

    if 'overwrite' in arglist:
        parser.add_argument('--overwrite', default=False, action=AutoBool,
                            help='Whether to overwrite any output files')

    if 'quiet' in arglist:
        parser.add_argument('--quiet', default=False, action=AutoBool,
                            help="Don't print progress information to stdout")

    if 'sample_nreads_before_filtering' in arglist:
        parser.add_argument(
            '--sample_nreads_before_filtering', metavar='n',
            type=NonNegative(int), default=1000,
            help='Sample n reads to decide on bounds for filtering before '
            'training. Set to 0 to do all.')

    if 'save_every' in arglist:
        parser.add_argument('--save_every', metavar='x', type=Positive(int),
                            default=5000, help='Save model every x batches')

    if 'version' in arglist:
        parser.add_argument('--version', nargs=0,
                            action=display_version_and_exit,
                            metavar=__version__,
                            help='Display version information.')

    if 'weight_decay' in arglist:
        parser.add_argument(
            '--weight_decay', default=0.0, metavar='penalty',
            type=NonNegative(float),
            help='Adam weight decay (L2 normalisation penalty)')

    ############################################################################
    #
    # Positional arguments
    #
    ############################################################################

    if 'input_folder' in arglist:
        parser.add_argument(
            'input_folder', action=FileExists,
            help='Directory containing single-read fast5 files')
    # NOTE(review): chunk ends here -- additional positional arguments may
    # follow in the original file.
# NOTE(review): this chunk starts mid-import -- the opening of the
# `from taiyaki import (...)` statement is not visible here.
mapped_signal_files, optim)
from taiyaki import __version__
from taiyaki.cmdargs import FileExists, FilesExist, Positive
from taiyaki.common_cmdargs import add_common_command_args
from taiyaki.constants import DOTROWLENGTH

# This is here, not in main, to allow documentation to be built.
parser = argparse.ArgumentParser(
    description='Train a flip-flop neural network',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
add_common_command_args(parser, """adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell limit lr_cosine_iters niteration overwrite quiet save_every sample_nreads_before_filtering version weight_decay""".split())
parser.add_argument('--chunk_len_min', default=2000, metavar='samples',
                    type=Positive(int),
                    help='Min length of each chunk in samples (chunk lengths are random between min and max)')
parser.add_argument('--chunk_len_max', default=4000, metavar='samples',
                    type=Positive(int),
                    help='Max length of each chunk in samples (chunk lengths are random between min and max)')
parser.add_argument('--input_strand_list', default=None, action=FileExists,
                    help='Strand summary file containing column read_id. Filenames in file are ignored.')
parser.add_argument('--lr_cosine_iters', default=40000, metavar='n',
                    type=Positive(float),
                    help='Learning rate decreases from max to min like cosine function over n batches')
parser.add_argument('--lr_max', default=2.0e-3, metavar='rate',
                    type=Positive(float),
                    help='Max (and starting) learning rate')
parser.add_argument('--lr_min', default=1.0e-4, metavar='rate',
                    type=Positive(float),
                    help='Min (and final) learning rate')
# NOTE(review): the concatenated help below is missing a space between
# "chunk_len_max." and "Actual" in rendered --help output (string left
# unchanged in this documentation pass).
parser.add_argument('--min_batch_size', default=64, metavar='chunks',
                    type=Positive(int),
                    help='Number of chunks to run in parallel for chunk_len = chunk_len_max.' + 'Actual batch size used is (min_batch_size / chunk_len) * chunk_len_max')
# NOTE(review): chunk truncated here -- the rest of the script is not visible.
# NOTE(review): this chunk starts mid-import and is truncated at the end.
mapped_signal_files, optim)
from taiyaki import __version__
from taiyaki.cmdargs import FileExists, Maybe, Positive, proportion
from taiyaki.common_cmdargs import add_common_command_args
from taiyaki.constants import DOTROWLENGTH
from taiyaki.squiggle_match import squiggle_match_loss, embed_sequence

# Module-level parser for the squiggle (current-level) model trainer.
parser = argparse.ArgumentParser(
    description='Train a model to predict ionic current levels from sequence',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
add_common_command_args(parser, """adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell limit niteration overwrite quiet save_every sample_nreads_before_filtering version weight_decay""".split())
parser.add_argument('--batch_size', default=100, metavar='chunks',
                    type=Positive(int),
                    help='Number of chunks to run in parallel')
parser.add_argument('--back_prob', default=1e-15, metavar='probability',
                    type=proportion, help='Probability of backwards move')
parser.add_argument('--depth', metavar='layers', default=4,
                    type=Positive(int),
                    help='Number of residual convolution layers')
parser.add_argument('--drop_slip', default=5, type=Maybe(Positive(int)),
                    metavar='length',
                    help='Drop chunks with slips greater than given length (None = off)')
parser.add_argument('--input_strand_list', default=None, action=FileExists,
                    help='Strand summary file containing column read_id. Filenames in file are ignored.')
parser.add_argument('--lr_decay', default=5000, metavar='n',
                    type=Positive(float),
                    help='Learning rate for batch i is lr_max / (1.0 + i / n)')
parser.add_argument('--lr_max', default=1.0e-4, metavar='rate',
                    type=Positive(float),
                    help='Max (and starting) learning rate')
# NOTE(review): chunk truncated inside the next call -- its remaining
# keyword arguments are not visible.
parser.add_argument('--sd', default=0.5, metavar='value',
                    type=Positive(float),
def add_common_command_args(parser, arglist):
    """Attach the package's shared command-line arguments to *parser*.

    ``arglist`` is a list of keys such as ``['input_strand_list', 'jobs']``;
    for every key present, the corresponding argument is added to *parser*.

    Only arguments used by more than one script, and with identical defaults
    everywhere, live in this function. Optional arguments are handled first,
    positional arguments afterwards.
    """
    ############################################################################
    #
    # Optional arguments
    #
    ############################################################################

    if 'adam' in arglist:
        parser.add_argument(
            '--adam', nargs=2, metavar=('beta1', 'beta2'),
            default=[0.9, 0.999], type=NonNegative(float),
            help='Parameters beta1, beta2 for Exponential Decay Adaptive Momentum')

    if 'alphabet' in arglist:
        parser.add_argument(
            '--alphabet', default=DEFAULT_ALPHABET,
            help='Canonical base alphabet')

    if 'device' in arglist:
        parser.add_argument(
            '--device', default='cpu', action=DeviceAction,
            help='Integer specifying which GPU to use, or "cpu" to use CPU only. '
                 'Other accepted formats: "cuda" (use default GPU), "cuda:2" '
                 'or "cuda2" (use GPU 2).')

    if 'eps' in arglist:
        parser.add_argument(
            '--eps', default=1e-6, metavar='adjustment', type=Positive(float),
            help='Small value to stabilise optimiser')

    if 'filter_max_dwell' in arglist:
        parser.add_argument(
            '--filter_max_dwell', default=10.0, metavar='multiple',
            type=Maybe(Positive(float)),
            help='Drop chunks with max dwell more than multiple of median (over chunks)')

    if 'filter_mean_dwell' in arglist:
        parser.add_argument(
            '--filter_mean_dwell', default=3.0, metavar='radius',
            type=Maybe(Positive(float)),
            help='Drop chunks with mean dwell more than radius deviations from the median (over chunks)')

    if 'input_strand_list' in arglist:
        parser.add_argument(
            '--input_strand_list', default=None, action=FileExists,
            help='Strand list TSV file with columns filename_fast5 or read_id or both')

    if 'jobs' in arglist:
        parser.add_argument(
            '--jobs', default=1, metavar='n', type=Positive(int),
            help='Number of threads to use when processing data')

    if 'limit' in arglist:
        parser.add_argument(
            '--limit', default=None, type=Maybe(Positive(int)),
            help='Limit number of reads to process')

    if 'niteration' in arglist:
        parser.add_argument(
            '--niteration', metavar='batches', type=Positive(int),
            default=50000, help='Maximum number of batches to train for')

    if 'outdir' in arglist:
        parser.add_argument(
            '--outdir', default='training',
            help='Output directory, created when run.')

    if 'output' in arglist:
        parser.add_argument(
            '--output', default=None, metavar='filename', action=FileAbsent,
            help='Write output to file')

    if 'overwrite' in arglist:
        parser.add_argument(
            '--overwrite', default=False, action=AutoBool,
            help='Whether to overwrite any output files')

    if 'quiet' in arglist:
        parser.add_argument(
            '--quiet', default=False, action=AutoBool,
            help="Don't print progress information to stdout")

    if 'recursive' in arglist:
        parser.add_argument(
            '--recursive', default=True, action=AutoBool,
            help='Search for fast5s recursively within ' +
                 'input_folder. Otherwise only search first level.')

    if 'sample_nreads_before_filtering' in arglist:
        parser.add_argument(
            '--sample_nreads_before_filtering', metavar='n',
            type=NonNegative(int), default=1000,
            help='Sample n reads to decide on bounds for filtering before training. Set to 0 to do all.')

    if 'save_every' in arglist:
        parser.add_argument(
            '--save_every', metavar='x', type=Positive(int), default=5000,
            help='Save model every x batches')

    if 'version' in arglist:
        parser.add_argument(
            '--version', nargs=0, action=display_version_and_exit,
            metavar=__version__, help='Display version information.')

    if 'weight_decay' in arglist:
        parser.add_argument(
            '--weight_decay', default=0.0, metavar='penalty',
            type=NonNegative(float),
            help='Adam weight decay (L2 normalisation penalty)')

    ############################################################################
    #
    # Positional arguments
    #
    ############################################################################

    if 'input_folder' in arglist:
        parser.add_argument(
            'input_folder', action=FileExists,
            help='Directory containing single or multi-read fast5 files')
#!/usr/bin/env python3 import argparse import numpy as np from taiyaki.bio import fasta_file_to_dict from taiyaki.cmdargs import (AutoBool, FileExists, Positive) from taiyaki.fileio import readtsv parser = argparse.ArgumentParser() parser.add_argument('--refbackground', default=False, action=AutoBool, help='Get background from references') parser.add_argument('--down', metavar='bases', type=Positive(int), default=15, help='number of bases down stream') parser.add_argument('--up', metavar='bases', type=Positive(int), default=15, help='number of bases up stream') parser.add_argument('references', action=FileExists, help='Fasta file containing references') parser.add_argument('coordinates', action=FileExists, help='coordinates file') bases = {b: i for i, b in enumerate('ACGT')} if __name__ == '__main__':
import argparse import matplotlib as mpl mpl.use('Agg') # So we don't need an x server import matplotlib.pyplot as plt import os from taiyaki.cmdargs import Positive parser = argparse.ArgumentParser( description='Plot graphs of training loss', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('output', help='Output png file') parser.add_argument('input_directories', nargs='+', help='One or more directories containing files called model.log') parser.add_argument('--upper_y_limit', default=None, type=Positive(float), help='Upper limit of plot y(loss) axis') if __name__=="__main__": args = parser.parse_args() plt.figure() for training_directory in args.input_directories: blocklist = [] losslist = [] filepath = training_directory + "/model.log" print("Opening", filepath) with open(filepath, "r") as f: for line in f: # The * removes error messges in the log if line.startswith('.') and not ('*' in line): splitline = line.split() try: