Esempio n. 1
0
def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Plot graphs of training loss',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Optional plotting controls.
    # Bug fix: a space was missing at the string-concatenation boundary, so
    # the help text previously rendered as "...batchlog loss.e.g --mav 10...".
    parser.add_argument(
        '--mav', default=None, type=int,
        help='Moving average window applied to batchlog loss. ' +
        'e.g --mav 10 visually separates loss curves')
    parser.add_argument(
        '--upper_y_limit', default=None, type=Positive(float),
        help='Upper limit of plot y(loss) axis')
    parser.add_argument(
        '--lower_y_limit', default=None, type=Positive(float),
        help='Lower limit of plot y(loss) axis')
    parser.add_argument(
        '--upper_x_limit', default=None, type=Positive(float),
        help='Upper limit of plot x(iterations) axis')
    parser.add_argument(
        '--lower_x_limit', default=None, type=Positive(float),
        help='Lower limit of plot x(iterations) axis')

    # Positional arguments: output image and input log directories.
    parser.add_argument(
        'output', help='Output png file')
    parser.add_argument(
        'input_directories', nargs='+',
        help='One or more directories containing {} and {} files'.format(
            BATCH_LOG_FILENAME, VAL_LOG_FILENAME))

    return parser
Esempio n. 2
0
def get_parser():
    """Build the command-line parser for basecalling with a taiyaki model.

    Returns:
        argparse.ArgumentParser: parser with basecalling options and the
        positional model checkpoint argument.
    """
    parser = argparse.ArgumentParser(
        description='Basecall reads using a taiyaki model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Arguments shared with other taiyaki scripts.
    add_common_command_args(parser, [
        'alphabet', 'device', 'input_folder', 'input_strand_list', 'jobs',
        'limit', 'output', 'quiet', 'recursive', 'version'])

    parser.add_argument('--beam',
                        nargs=2,
                        metavar=('width', 'guided'),
                        default=None,
                        type=(int, bool),
                        action=ParseToNamedTuple,
                        help='Use beam search decoding')
    parser.add_argument('--chunk_size',
                        metavar='blocks',
                        default=basecall_helpers._DEFAULT_CHUNK_SIZE,
                        type=Positive(int),
                        help='Size of signal chunks sent to GPU is '
                             'chunk_size * model stride')
    parser.add_argument('--fastq',
                        default=False,
                        action=AutoBool,
                        help='Write output in fastq format (default is fasta)')
    parser.add_argument('--max_concurrent_chunks',
                        default=128,
                        type=Positive(int),
                        help='Maximum number of chunks to call at '
                             'once. Lower values will consume less (GPU) RAM.')
    parser.add_argument('--overlap',
                        metavar='blocks',
                        default=basecall_helpers._DEFAULT_OVERLAP,
                        type=NonNegative(int),
                        help='Overlap between signal chunks sent to GPU')
    parser.add_argument('--posterior',
                        default=True,
                        action=AutoBool,
                        help='Use posterior-viterbi decoding')
    parser.add_argument('--qscore_offset',
                        default=0.0,
                        type=float,
                        help='Offset to apply to q scores in fastq '
                             '(after scale)')
    parser.add_argument('--qscore_scale',
                        default=1.0,
                        type=float,
                        help='Scaling factor to apply to q scores in fastq')
    parser.add_argument('--reverse',
                        default=False,
                        action=AutoBool,
                        help='Reverse sequences in output')
    parser.add_argument('--scaling',
                        default=None,
                        action=FileExists,
                        help='Path to TSV containing per-read scaling params')
    parser.add_argument('--temperature',
                        default=1.0,
                        type=float,
                        help='Scaling factor applied to network outputs '
                             'before decoding')

    # Positional argument: the trained model checkpoint.
    parser.add_argument('model',
                        action=FileExists,
                        help='Model checkpoint file to use for basecalling')

    return parser
Esempio n. 3
0
def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Plot reference-to-signal maps from mapped signal ' +
        'files. Also dump one-line summary of each read to stdout',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--output', help='Output PNG filename. ' +
        'Default: only output per-read summaries.')
    parser.add_argument(
        '--maxlegendsize', type=Positive(int), default=10,
        help='Maximum number of reads to list in the legend.')
    parser.add_argument(
        '--nreads', type=Positive(int), default=10,
        help='Max number of reads to read from each file. Not used if ' +
        'read_ids are given')
    parser.add_argument(
        '--read_ids', nargs='+', default=[],
        help='One or more read_ids. If not present, plots the first ' +
        '[--nreads] in each file')
    parser.add_argument(
        '--xmin', default=None, type=float,
        help='Minimum x for plot')
    parser.add_argument(
        '--xmax', default=None, type=float,
        help='Maximum x for plot')
    # Bug fix: the y-limit options previously described the x axis
    # (copy-paste from --xmin/--xmax).
    parser.add_argument(
        '--ymin', default=None, type=float,
        help='Minimum y for plot')
    parser.add_argument(
        '--ymax', default=None, type=float,
        help='Maximum y for plot')
    parser.add_argument(
        '--line_transparency', type=float, default=1.0,
        help='Transparency value for lines. Default: %(default)f')
    parser.add_argument(
        '--zero_signal_start', action='store_true',
        help='Start signal locations at zero. Default: start at ' +
        'assigned position within entire read.')
    parser.add_argument(
        '--quiet', action='store_true',
        help='Do not display status messages.')

    # Positional inputs.
    parser.add_argument(
        'mapped_signal_files', nargs='+',
        help='Inputs: one or more mapped signal files')

    return parser
Esempio n. 4
0
def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description='Split a strand list into a number of smaller strand ' +
        'lists, or alternatively do the same thing starting with a ' +
        'directory containing fast5s.')

    parser.add_argument('--maxlistsize',
                        default=10000,
                        type=Positive(int),
                        help='Maximum size for a strand list')
    # Bug fix: default was 10000 (copy-paste from --maxlistsize). A non-None
    # default made the documented fallback ("use the input as the base name")
    # unreachable and would name output files "10000_000.txt" etc.
    parser.add_argument(
        '--outputbase',
        default=None,
        help='Strand lists will be saved as <outputbase>_000.txt etc. If ' +
        'outputbase not present then the input will be used as the base name.')

    # Positional argument: source strand list or fast5 directory.
    parser.add_argument('input',
                        help='either a strand list file or a directory name')

    return parser
Esempio n. 5
0
def get_parser():
    """Build the argument parser for training a flip-flop neural network.

    Returns:
        argparse.ArgumentParser: parser with training options and the
        positional model/chunks/reference file arguments.
    """
    parser = argparse.ArgumentParser(
        description='Train a flip-flop neural network',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Arguments shared with other taiyaki training scripts.
    add_common_command_args(parser, [
        'adam', 'alphabet', 'device', 'eps', 'limit', 'niteration',
        'outdir', 'overwrite', 'quiet', 'save_every', 'version',
        'weight_decay'])

    parser.add_argument(
        '--batch_size', default=128, metavar='chunks', type=Positive(int),
        help='Number of chunks to run in parallel')
    parser.add_argument(
        '--gradient_cap_fraction', default=0.05, metavar='f',
        type=Maybe(NonNegative(float)),
        help='Cap L2 norm of gradient so that a fraction f of gradients ' +
        'are capped. Use --gradient_cap_fraction None for no capping.')
    parser.add_argument(
        '--lr_max', default=4.0e-3, metavar='rate', type=Positive(float),
        help='Initial learning rate')
    parser.add_argument(
        '--size', default=96, metavar='neurons', type=Positive(int),
        help='Base layer size for model')
    parser.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    parser.add_argument(
        '--stride', default=2, metavar='samples', type=Positive(int),
        help='Stride for model')
    parser.add_argument(
        '--winlen', default=19, type=Positive(int),
        help='Length of window over data')

    # Positional arguments: model description plus training data files.
    parser.add_argument(
        'model', action=FileExists,
        help='File to read python model description from')
    parser.add_argument(
        'chunks', action=FileExists,
        help='file containing chunks')
    parser.add_argument(
        'reference', action=FileExists,
        help='file containing fasta reference')

    return parser
def get_parser():
    """Build the argument parser for plotting an accuracy histogram.

    Returns:
        argparse.ArgumentParser: parser with the positional input file and
        histogram/figure options.
    """
    parser = argparse.ArgumentParser(
        description='Plot an accuracy histogram from a combined read file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Positional input file first, then the optional plot settings.
    parser.add_argument('combined_read_file',
                        action=FileExists,
                        help='Combined read file to get data from')
    parser.add_argument('--bins',
                        type=Positive(int),
                        default=100,
                        help='Number of bins for histogram')
    parser.add_argument('--title',
                        default='',
                        help='Figure title')
    parser.add_argument('--output_name',
                        default='basecaller_histogram.png',
                        help='Output file name')

    return parser
Esempio n. 7
0
def get_train_flipflop_parser():
    """Build the argument parser for flip-flop network training.

    Returns:
        :argparse:`ArgumentParser` : parser whose options are organised into
        model, training, data, compute, output, modified-base and
        miscellaneous argument groups, followed by the positional model and
        mapped-reads input files.
    """
    parser = argparse.ArgumentParser(
        description='Train flip-flop neural network',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    mdl_grp = parser.add_argument_group('Model Arguments')
    mdl_grp.add_argument(
        '--size', default=384, metavar='neurons',
        type=Positive(int), help='Base layer size for model')
    mdl_grp.add_argument(
        '--stride', default=5, metavar='samples',
        type=Positive(int), help='Stride for model')
    mdl_grp.add_argument(
        '--winlen', default=19, type=Positive(int),
        help='Length of window over data')

    trn_grp = parser.add_argument_group('Training Arguments')
    trn_grp.add_argument(
        '--adam', nargs=2, metavar=('beta1', 'beta2'),
        default=[0.9, 0.999], type=NonNegative(float),
        help='Parameters beta1, beta2 for Exponential Decay ' +
        'Adaptive Momentum')
    trn_grp.add_argument(
        '--eps', default=1e-6, metavar='adjustment',
        type=Positive(float), help='Small value to stabilise optimiser')
    trn_grp.add_argument(
        '--niteration', metavar='batches', type=Positive(int),
        default=150000, help='Maximum number of batches to train for')
    trn_grp.add_argument(
        '--weight_decay', default=0.01, metavar='penalty',
        type=NonNegative(float),
        help='Adam weight decay (L2 normalisation penalty)')
    trn_grp.add_argument(
        '--gradient_clip_num_mads', default=0, metavar='num_MADs',
        type=Maybe(NonNegative(float)),
        help='Clip gradients (by value) at num_MADs above the median of ' +
        'the last 1000 parameter gradient maximums. Gradient threshold ' +
        'values are computed for each parameter group independently. Use ' +
        '"--gradient_clip_num_mads None" for no clipping.')
    trn_grp.add_argument(
        '--lr_max', default=4.0e-3, metavar='rate', type=Positive(float),
        help='Max learning rate, reached at --warmup_batches iterations.')
    trn_grp.add_argument(
        '--lr_min', default=1.0e-4, metavar='rate', type=Positive(float),
        help='Min (starting and final) learning rate')
    trn_grp.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    trn_grp.add_argument(
        '--sharpen', default=(1.0, 1.0, 25000), nargs=3,
        metavar=('min', 'max', 'niter'), action=ParseToNamedTuple,
        type=(Positive(float), Positive(float), Positive(int)),
        help='Increase sharpening factor linearly from "min" to ' +
        '"max" over "niter" iterations')
    trn_grp.add_argument(
        '--warmup_batches', type=int, default=200,
        help='Over first n batches, increase learning rate like cosine.')
    trn_grp.add_argument(
        '--lr_warmup', metavar='rate', type=Positive(float),
        help='Start learning rate for warmup. Defaults to lr_min.')
    trn_grp.add_argument(
        '--min_momentum', type=Positive(float),
        help='Min momentum in cycling. default = Adam beta1, no cycling')

    data_grp = parser.add_argument_group('Data Arguments')
    data_grp.add_argument(
        '--filter_max_dwell', default=10.0, metavar='multiple',
        type=Maybe(Positive(float)),
        help='Drop chunks with max dwell more than multiple of median ' +
        '(over chunks)')
    data_grp.add_argument(
        '--filter_mean_dwell', default=3.0, metavar='radius',
        type=Maybe(Positive(float)),
        help='Drop chunks with mean dwell more than radius deviations ' +
        'from the median (over chunks)')
    data_grp.add_argument(
        '--filter_min_pass_fraction', default=0.5, metavar='fraction',
        type=Maybe(Positive(float)),
        help='Halt if fraction of chunks passing tests is less than this')
    data_grp.add_argument(
        '--filter_path_buffer', default=1.1, metavar='ratio',
        type=Bounded(float, lower=1.0),
        help='Drop chunks with small ratio of signal length to bases * ' +
        'model stride, which would restrict potential CTC paths. Must be ' +
        'greater than 1.0.')
    data_grp.add_argument(
        '--limit', default=None, type=Maybe(Positive(int)),
        help='Limit number of reads to process')
    data_grp.add_argument(
        '--reverse', default=False, action=AutoBool,
        help='Reverse input sequence and current')
    data_grp.add_argument(
        '--sample_nreads_before_filtering', metavar='n',
        type=NonNegative(int), default=100000,
        help='Sample n reads to decide on bounds for filtering before ' +
        'training. Set to 0 to do all.')
    data_grp.add_argument(
        '--chunk_len_min', default=3000, metavar='samples', type=Positive(int),
        help='Min length of each chunk in samples (chunk lengths are ' +
        'random between min and max)')
    data_grp.add_argument(
        '--chunk_len_max', default=8000, metavar='samples', type=Positive(int),
        help='Max length of each chunk in samples (chunk lengths are ' +
        'random between min and max)')
    data_grp.add_argument(
        '--include_reporting_strands', default=False, action=AutoBool,
        help='Include reporting strands in training. Default: Hold ' +
        'training strands out of training.')
    data_grp.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing column read_id. Filenames in ' +
        'file are ignored.')
    data_grp.add_argument(
        '--min_sub_batch_size', default=128, metavar='chunks',
        type=Positive(int),
        help='Number of chunks to run in parallel per sub-batch for ' +
        'chunk_len = chunk_len_max. Actual length of sub-batch used is ' +
        '(min_sub_batch_size * chunk_len_max / chunk_len).')
    data_grp.add_argument(
        '--reporting_percent_reads', default=1, metavar='sub_batches',
        type=Positive(float),
        help='Percent of reads to use for std loss reporting')
    data_grp.add_argument(
        '--reporting_strand_list', action=FileExists,
        help='Strand summary file containing column read_id. All other ' +
        'fields are ignored. If not provided reporting strands will be ' +
        'randomly selected.')
    data_grp.add_argument(
        '--reporting_sub_batches', default=100, metavar='sub_batches',
        type=Positive(int),
        help='Number of sub-batches to use for std loss reporting')
    data_grp.add_argument(
        '--standardize', default=True, action=AutoBool,
        help='Standardize currents for each read')
    data_grp.add_argument(
        '--sub_batches', default=1, metavar='sub_batches', type=Positive(int),
        help='Number of sub-batches per batch')

    cmp_grp = parser.add_argument_group('Compute Arguments')
    cmp_grp.add_argument(
        '--device', default='cpu', action=DeviceAction,
        help='Integer specifying which GPU to use, or "cpu" to use CPU only. '
        'Other accepted formats: "cuda" (use default GPU), "cuda:2" '
        'or "cuda2" (use GPU 2).')
    # Argument local_rank is used only by when the script is run in multi-GPU
    # mode using torch.distributed.launch. See the README.
    cmp_grp.add_argument(
        '--local_rank', type=int, default=None, help=argparse.SUPPRESS)

    out_grp = parser.add_argument_group('Output Arguments')
    out_grp.add_argument(
        '--full_filter_status', default=False, action=AutoBool,
        help='Output full chunk filtering statistics. Default: only ' +
        'proportion of filtered chunks.')
    out_grp.add_argument(
        '--outdir', default='training',
        help='Output directory, created when run.')
    out_grp.add_argument(
        '--overwrite', default=False, action=AutoBool,
        help='Whether to overwrite any output files')
    out_grp.add_argument(
        '--quiet', default=False, action=AutoBool,
        help="Don't print progress information to stdout")
    out_grp.add_argument(
        '--save_every', metavar='x', type=Positive(int), default=2500,
        help='Save model every x batches')

    mod_grp = parser.add_argument_group('Modified Base Arguments')
    mod_grp.add_argument(
        '--mod_factor', default=(8.0, 1.0, 50000), nargs=3,
        metavar=('start', 'final', 'niter'), action=ParseToNamedTuple,
        type=(Positive(float), Positive(float), Positive(int)),
        help='Relative weight applied to modified base transitions in ' +
        'loss/gradient compared to canonical transitions. Larger values ' +
        'increase the effective modified base learning rate. Scale factor ' +
        'linearly from "start" to "final" over first "niter" iterations')
    mod_grp.add_argument(
        '--mod_prior_factor', type=float,
        help='Exponential factor applied to prior mod weights estimated ' +
        'from training data. Intended to balance modified base scores. ' +
        'Default: no mod prior')
    mod_grp.add_argument(
        '--num_mod_weight_reads', type=int, default=5000,
        help='Number of reads to sample to compute the modified base prior ' +
        'weights from the training data.')

    misc_grp = parser.add_argument_group('Miscellaneous  Arguments')
    misc_grp.add_argument(
        '--version', nargs=0, action=display_version_and_exit,
        metavar=__version__,
        help='Display version information.')

    # Positional arguments: model description/checkpoint and mapped reads.
    parser.add_argument(
        'model', action=FileExists,
        help='File to read python model (or checkpoint) from')
    parser.add_argument(
        'input', action=FileExists,
        help='file containing mapped reads')

    return parser
Esempio n. 8
0
# Module-level construction of the flip-flop training argument parser.
parser = argparse.ArgumentParser(
    description='Train a flip-flop neural network',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Arguments shared with other taiyaki scripts; the triple-quoted string is
# just a whitespace-separated list of argument names.
add_common_command_args(
    parser,
    """adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell
                                   limit lr_cosine_iters niteration overwrite quiet save_every
                                   sample_nreads_before_filtering version weight_decay"""
    .split())

# Script-specific options.
parser.add_argument(
    '--chunk_len_min',
    default=2000,
    metavar='samples',
    type=Positive(int),
    help=
    'Min length of each chunk in samples (chunk lengths are random between min and max)'
)
parser.add_argument(
    '--chunk_len_max',
    default=4000,
    metavar='samples',
    type=Positive(int),
    help=
    'Max length of each chunk in samples (chunk lengths are random between min and max)'
)
parser.add_argument(
    '--input_strand_list',
    default=None,
    action=FileExists,
Esempio n. 9
0
# Module-level construction of the argument parser for training a model that
# predicts ionic current levels from sequence.
parser = argparse.ArgumentParser(
    description='Train a model to predict ionic current levels from sequence',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Arguments shared with other taiyaki scripts; the triple-quoted string is
# just a whitespace-separated list of argument names.
add_common_command_args(
    parser,
    """adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell
                                   limit niteration overwrite quiet save_every
                                   sample_nreads_before_filtering version weight_decay"""
    .split())

# Script-specific options.
parser.add_argument('--batch_size',
                    default=100,
                    metavar='chunks',
                    type=Positive(int),
                    help='Number of chunks to run in parallel')
parser.add_argument('--back_prob',
                    default=1e-15,
                    metavar='probability',
                    type=proportion,
                    help='Probability of backwards move')
parser.add_argument('--depth',
                    metavar='layers',
                    default=4,
                    type=Positive(int),
                    help='Number of residual convolution layers')
parser.add_argument(
    '--drop_slip',
    default=5,
    type=Maybe(Positive(int)),
Esempio n. 10
0
from taiyaki import ctc, flipflopfings, helpers
from taiyaki.cmdargs import FileExists, Positive
from taiyaki.common_cmdargs import add_common_command_args


# This is here, not in main to allow documentation to be built
parser = argparse.ArgumentParser(
    description='Train a flip-flop neural network',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Arguments shared with other taiyaki scripts.
add_common_command_args(parser, """adam alphabet device eps limit niteration
                                   outdir overwrite quiet save_every version""".split())

# Script-specific training hyper-parameters.
parser.add_argument('--batch_size', default=128, metavar='chunks',
                    type=Positive(int), help='Number of chunks to run in parallel')
parser.add_argument( '--lr_max', default=4.0e-3, metavar='rate',
                    type=Positive(float), help='Initial learning rate')
parser.add_argument('--size', default=96, metavar='neurons',
                    type=Positive(int), help='Base layer size for model')
parser.add_argument('--seed', default=None, metavar='integer', type=Positive(int),
                    help='Set random number seed')
parser.add_argument('--stride', default=2, metavar='samples', type=Positive(int),
                    help='Stride for model')
parser.add_argument('--winlen', default=19, type=Positive(int),
                    help='Length of window over data')

# Positional arguments.
parser.add_argument('model', action=FileExists,
                    help='File to read python model description from')
parser.add_argument('chunks', action=FileExists,
                    help='file containing chunks')
Esempio n. 11
0
def get_parser():
    """Build the argument parser for training a model that predicts ionic
    current levels from sequence.

    Returns:
        argparse.ArgumentParser: parser with training options and the
        positional mapped-reads input file.
    """
    parser = argparse.ArgumentParser(
        description='Train a model to predict ionic current levels ' +
        'from sequence',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Arguments shared with other taiyaki training scripts.
    add_common_command_args(parser, [
        'adam', 'device', 'eps', 'filter_max_dwell', 'filter_mean_dwell',
        'limit', 'niteration', 'outdir', 'overwrite', 'quiet', 'reverse',
        'save_every', 'sample_nreads_before_filtering', 'version',
        'weight_decay'])

    parser.add_argument(
        '--batch_size', default=100, metavar='chunks', type=Positive(int),
        help='Number of chunks to run in parallel')
    parser.add_argument(
        '--back_prob', default=1e-15, metavar='probability', type=proportion,
        help='Probability of backwards move')
    parser.add_argument(
        '--depth', default=4, metavar='layers', type=Positive(int),
        help='Number of residual convolution layers')
    parser.add_argument(
        '--drop_slip', default=5, metavar='length',
        type=Maybe(Positive(int)),
        help='Drop chunks with slips greater than given length (None = off)')
    parser.add_argument(
        '--filter_path_buffer', default=1.1, metavar='ratio', type=float,
        help='Drop chunks with small ratio of signal length to bases * ' +
        'model stride, which would restrict potential CTC paths.')
    parser.add_argument(
        '--filter_min_pass_fraction', default=0.5, metavar='fraction',
        type=Maybe(Positive(float)),
        help='Halt if fraction of chunks passing tests is less than this')
    parser.add_argument(
        '--full_filter_status', default=False, action=AutoBool,
        help='Output full chunk filtering statistics. ' +
        'Default: only proportion of filtered chunks.')
    parser.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing column read_id. Filenames in ' +
        'file are ignored.')
    parser.add_argument(
        '--lr_decay', default=5000, metavar='n', type=Positive(float),
        help='Learning rate for batch i is lr_max / (1.0 + i / n)')
    parser.add_argument(
        '--lr_max', default=1.0e-4, metavar='rate', type=Positive(float),
        help='Max (and starting) learning rate')
    parser.add_argument(
        '--sd', default=0.5, metavar='value', type=Positive(float),
        help='Standard deviation to initialise with')
    parser.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    parser.add_argument(
        '--size', default=32, metavar='n', type=Positive(int),
        help='Size of layers in convolution network')
    parser.add_argument(
        '--target_len', default=300, metavar='n', type=Positive(int),
        help='Target length of sequence')
    parser.add_argument(
        '--winlen', default=9, metavar='n', type=Positive(int),
        help='Window for convolution network')

    # Positional argument: mapped-reads training data.
    parser.add_argument(
        'input', action=FileExists,
        help='HDF5 file containing mapped reads')

    return parser
Esempio n. 12
0
from taiyaki.maths import med_mad
from taiyaki.signal import Signal


# NOTE(review): presumably controls whether chunk outputs are stitched
# together before Viterbi decoding; the flag's use is not visible in this
# chunk — confirm against the rest of the script.
STITCH_BEFORE_VITERBI = False


# Module-level construction of the basecalling argument parser.
parser = argparse.ArgumentParser(
    description="Basecall reads using a taiyaki model",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Arguments shared with other taiyaki scripts.
add_common_command_args(parser, 'device input_folder input_strand_list limit output quiet recursive version'.split())

# Script-specific options.
parser.add_argument("--alphabet", default=DEFAULT_ALPHABET,
                    help="Alphabet used by basecaller")
parser.add_argument("--chunk_size", type=Positive(int),
                    default=basecall_helpers._DEFAULT_CHUNK_SIZE,
                    help="Size of signal chunks sent to GPU")
parser.add_argument("--overlap", type=NonNegative(int),
                    default=basecall_helpers._DEFAULT_OVERLAP,
                    help="Overlap between signal chunks sent to GPU")
parser.add_argument("--modified_base_output", action=FileAbsent, default=None,
                    help="Output filename for modified base output.")
# Positional argument: the trained model checkpoint.
parser.add_argument("model", action=FileExists,
                    help="Model checkpoint file to use for basecalling")


def med_mad_norm(x, dtype='f4'):
    """ Normalise a numpy array using median and MAD """
    med, mad = med_mad(x)
    normed_x = (x - med) / mad
Esempio n. 13
0
def add_common_command_args(parser, arglist):
    """Add a chosen subset of shared command-line args to a parser.

    Args:
        parser: an :class:`argparse.ArgumentParser` (or compatible) object.
        arglist: list of argument-name keys such as
            ['input_strand_list', 'jobs']; an argument is added to the
            parser only if its name appears in this list.

    Note that not all command line args used in the package are
    included in this func: only those that are used by more than
    one script and which have the same defaults.

    Also note that some args are positional and some are optional.
    The optional ones are listed first below.
    """

    ############################################################################
    #
    # Optional arguments
    #
    ############################################################################

    if 'adam' in arglist:
        parser.add_argument(
            '--adam',
            nargs=3,
            metavar=('rate', 'decay1', 'decay2'),
            default=(1e-3, 0.9, 0.999),
            type=(NonNegative(float), NonNegative(float), NonNegative(float)),
            action=ParseToNamedTuple,
            help='Parameters for Exponential Decay Adaptive Momementum')

    if 'chunk_logging_threshold' in arglist:
        parser.add_argument(
            '--chunk_logging_threshold',
            default=10.0,
            metavar='multiple',
            type=NonNegative(float),
            help=
            'If loss > (threshold * smoothed loss) for a batch, then log chunks to '
            +
            'output/chunklog.tsv. Set to zero to log all, including rejected chunks'
        )

    if 'device' in arglist:
        parser.add_argument(
            '--device',
            default='cpu',
            action=DeviceAction,
            help=
            'Integer specifying which GPU to use, or "cpu" to use CPU only. '
            'Other accepted formats: "cuda" (use default GPU), "cuda:2" '
            'or "cuda2" (use GPU 2).')

    if 'filter_max_dwell' in arglist:
        parser.add_argument(
            '--filter_max_dwell',
            default=10.0,
            metavar='multiple',
            type=Maybe(Positive(float)),
            help=
            'Drop chunks with max dwell more than multiple of median (over chunks)'
        )

    if 'filter_mean_dwell' in arglist:
        parser.add_argument(
            '--filter_mean_dwell',
            default=3.0,
            metavar='radius',
            type=Maybe(Positive(float)),
            help=
            'Drop chunks with mean dwell more than radius deviations from the median (over chunks)'
        )

    if 'input_strand_list' in arglist:
        parser.add_argument('--input_strand_list',
                            default=None,
                            action=FileExists,
                            help='Strand summary file containing subset')

    if 'jobs' in arglist:
        parser.add_argument(
            '--jobs',
            default=1,
            metavar='n',
            type=Positive(int),
            help='Number of threads to use when processing data')

    if 'limit' in arglist:
        parser.add_argument('--limit',
                            default=None,
                            type=Maybe(Positive(int)),
                            help='Limit number of reads to process')

    if 'lrdecay' in arglist:
        parser.add_argument(
            '--lrdecay',
            default=5000,
            metavar='n',
            type=Positive(float),
            help='Learning rate for batch i is adam.rate / (1.0 + i / n)')

    if 'niteration' in arglist:
        parser.add_argument('--niteration',
                            metavar='batches',
                            type=Positive(int),
                            default=50000,
                            help='Maximum number of batches to train for')

    if 'overwrite' in arglist:
        parser.add_argument('--overwrite',
                            default=False,
                            action=AutoBool,
                            help='Whether to overwrite any output files')

    if 'quiet' in arglist:
        parser.add_argument('--quiet',
                            default=False,
                            action=AutoBool,
                            help="Don't print progress information to stdout")

    if 'sample_nreads_before_filtering' in arglist:
        parser.add_argument(
            '--sample_nreads_before_filtering',
            metavar='n',
            type=NonNegative(int),
            default=1000,
            help=
            'Sample n reads to decide on bounds for filtering before training. Set to 0 to do all.'
        )

    if 'save_every' in arglist:
        parser.add_argument('--save_every',
                            metavar='x',
                            type=Positive(int),
                            default=5000,
                            help='Save model every x batches')

    if 'version' in arglist:
        parser.add_argument('--version',
                            nargs=0,
                            action=display_version_and_exit,
                            metavar=__version__,
                            help='Display version information.')

    if 'weight_decay' in arglist:
        parser.add_argument(
            '--weight_decay',
            default=0.0,
            metavar='penalty',
            type=NonNegative(float),
            help='Adam weight decay (L2 normalisation penalty)')

    ############################################################################
    #
    # Positional arguments
    #
    ############################################################################

    if 'input_folder' in arglist:
        parser.add_argument(
            'input_folder',
            action=FileExists,
            help='Directory containing single-read fast5 files')
Esempio n. 14
0
                     mapped_signal_files, optim)
from taiyaki import __version__
from taiyaki.cmdargs import FileExists, FilesExist, Positive
from taiyaki.common_cmdargs import add_common_command_args
from taiyaki.constants import DOTROWLENGTH


# The parser lives at module level (rather than inside main) so that
# documentation tooling can import this script and inspect it.
parser = argparse.ArgumentParser(
    description='Train a flip-flop neural network',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Arguments shared with the other taiyaki training scripts.
add_common_command_args(parser, [
    'adam', 'chunk_logging_threshold', 'device', 'filter_max_dwell',
    'filter_mean_dwell', 'limit', 'lr_cosine_iters', 'niteration',
    'overwrite', 'quiet', 'save_every', 'sample_nreads_before_filtering',
    'version', 'weight_decay'])

# Script-specific arguments.
parser.add_argument(
    '--chunk_len_min', default=2000, metavar='samples', type=Positive(int),
    help='Min length of each chunk in samples (chunk lengths are random between min and max)')
parser.add_argument(
    '--chunk_len_max', default=4000, metavar='samples', type=Positive(int),
    help='Max length of each chunk in samples (chunk lengths are random between min and max)')
parser.add_argument(
    '--input_strand_list', default=None, action=FileExists,
    help='Strand summary file containing column read_id. Filenames in file are ignored.')
parser.add_argument(
    '--lr_cosine_iters', default=40000, metavar='n', type=Positive(float),
    help='Learning rate decreases from max to min like cosine function over n batches')
parser.add_argument(
    '--lr_max', default=2.0e-3, metavar='rate', type=Positive(float),
    help='Max (and starting) learning rate')
parser.add_argument(
    '--lr_min', default=1.0e-4, metavar='rate', type=Positive(float),
    help='Min (and final) learning rate')
parser.add_argument(
    '--min_batch_size', default=64, metavar='chunks', type=Positive(int),
    help='Number of chunks to run in parallel for chunk_len = chunk_len_max.'
         'Actual batch size used is (min_batch_size / chunk_len) * chunk_len_max')
Esempio n. 15
0
                     mapped_signal_files, optim)
from taiyaki import __version__
from taiyaki.cmdargs import FileExists, Maybe, Positive, proportion
from taiyaki.common_cmdargs import add_common_command_args
from taiyaki.constants import DOTROWLENGTH
from taiyaki.squiggle_match import squiggle_match_loss, embed_sequence


# Command line interface, built at module level so the parser can be
# imported and inspected without running the script.
parser = argparse.ArgumentParser(
    description='Train a model to predict ionic current levels from sequence',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Arguments shared with the other taiyaki training scripts.
add_common_command_args(parser, [
    'adam', 'chunk_logging_threshold', 'device', 'filter_max_dwell',
    'filter_mean_dwell', 'limit', 'niteration', 'overwrite', 'quiet',
    'save_every', 'sample_nreads_before_filtering', 'version',
    'weight_decay'])

# Script-specific arguments.
parser.add_argument(
    '--batch_size', default=100, metavar='chunks', type=Positive(int),
    help='Number of chunks to run in parallel')
parser.add_argument(
    '--back_prob', default=1e-15, metavar='probability', type=proportion,
    help='Probability of backwards move')
parser.add_argument(
    '--depth', default=4, metavar='layers', type=Positive(int),
    help='Number of residual convolution layers')
parser.add_argument(
    '--drop_slip', default=5, metavar='length', type=Maybe(Positive(int)),
    help='Drop chunks with slips greater than given length (None = off)')
parser.add_argument(
    '--input_strand_list', default=None, action=FileExists,
    help='Strand summary file containing column read_id. Filenames in file are ignored.')
parser.add_argument(
    '--lr_decay', default=5000, metavar='n', type=Positive(float),
    help='Learning rate for batch i is lr_max / (1.0 + i / n)')
parser.add_argument(
    '--lr_max', default=1.0e-4, metavar='rate', type=Positive(float),
    help='Max (and starting) learning rate')
parser.add_argument('--sd', default=0.5, metavar='value', type=Positive(float),
Esempio n. 16
0
def add_common_command_args(parser, arglist):
    """Register a shared set of command line arguments on *parser*.

    Given an argparse parser object and a list of argument names such
    as ``['input_strand_list', 'jobs']``, add the corresponding command
    line arguments to the parser.  Names in *arglist* with no entry
    below are silently ignored.

    Only arguments that are used by more than one script in the
    package, and with identical defaults everywhere, belong here.
    Optional arguments are registered first, positional ones last.
    """
    # Local alias: every branch below is one registration call.
    add = parser.add_argument

    # ------------------------------------------------------------------
    # Optional arguments
    # ------------------------------------------------------------------

    if 'adam' in arglist:
        add('--adam', nargs=2, metavar=('beta1', 'beta2'),
            default=[0.9, 0.999], type=NonNegative(float),
            help='Parameters beta1, beta2 for Exponential Decay '
                 'Adaptive Momentum')

    if 'alphabet' in arglist:
        add('--alphabet', default=DEFAULT_ALPHABET,
            help='Canonical base alphabet')

    if 'device' in arglist:
        add('--device', default='cpu', action=DeviceAction,
            help='Integer specifying which GPU to use, or "cpu" to use '
                 'CPU only. Other accepted formats: "cuda" (use default '
                 'GPU), "cuda:2" or "cuda2" (use GPU 2).')

    if 'eps' in arglist:
        add('--eps', default=1e-6, metavar='adjustment',
            type=Positive(float),
            help='Small value to stabilise optimiser')

    if 'filter_max_dwell' in arglist:
        add('--filter_max_dwell', default=10.0, metavar='multiple',
            type=Maybe(Positive(float)),
            help='Drop chunks with max dwell more than multiple of '
                 'median (over chunks)')

    if 'filter_mean_dwell' in arglist:
        add('--filter_mean_dwell', default=3.0, metavar='radius',
            type=Maybe(Positive(float)),
            help='Drop chunks with mean dwell more than radius '
                 'deviations from the median (over chunks)')

    if 'input_strand_list' in arglist:
        add('--input_strand_list', default=None, action=FileExists,
            help='Strand list TSV file with columns filename_fast5 '
                 'or read_id or both')

    if 'jobs' in arglist:
        add('--jobs', default=1, metavar='n', type=Positive(int),
            help='Number of threads to use when processing data')

    if 'limit' in arglist:
        add('--limit', default=None, type=Maybe(Positive(int)),
            help='Limit number of reads to process')

    if 'niteration' in arglist:
        add('--niteration', default=50000, metavar='batches',
            type=Positive(int),
            help='Maximum number of batches to train for')

    if 'outdir' in arglist:
        add('--outdir', default='training',
            help='Output directory, created when run.')

    if 'output' in arglist:
        add('--output', default=None, metavar='filename',
            action=FileAbsent, help='Write output to file')

    if 'overwrite' in arglist:
        add('--overwrite', default=False, action=AutoBool,
            help='Whether to overwrite any output files')

    if 'quiet' in arglist:
        add('--quiet', default=False, action=AutoBool,
            help="Don't print progress information to stdout")

    if 'recursive' in arglist:
        add('--recursive', default=True, action=AutoBool,
            help='Search for fast5s recursively within input_folder. '
                 'Otherwise only search first level.')

    if 'sample_nreads_before_filtering' in arglist:
        add('--sample_nreads_before_filtering', default=1000,
            metavar='n', type=NonNegative(int),
            help='Sample n reads to decide on bounds for filtering '
                 'before training. Set to 0 to do all.')

    if 'save_every' in arglist:
        add('--save_every', default=5000, metavar='x',
            type=Positive(int), help='Save model every x batches')

    if 'version' in arglist:
        # nargs=0 so the custom action consumes no value on the command line.
        add('--version', nargs=0, action=display_version_and_exit,
            metavar=__version__, help='Display version information.')

    if 'weight_decay' in arglist:
        add('--weight_decay', default=0.0, metavar='penalty',
            type=NonNegative(float),
            help='Adam weight decay (L2 normalisation penalty)')

    # ------------------------------------------------------------------
    # Positional arguments
    # ------------------------------------------------------------------

    if 'input_folder' in arglist:
        add('input_folder', action=FileExists,
            help='Directory containing single or multi-read fast5 files')
Esempio n. 17
0
#!/usr/bin/env python3
import argparse
import numpy as np

from taiyaki.bio import fasta_file_to_dict
from taiyaki.cmdargs import (AutoBool, FileExists, Positive)
from taiyaki.fileio import readtsv

# Command line interface for the context/background extraction script.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--refbackground', default=False, action=AutoBool,
    help='Get background from references')
parser.add_argument(
    '--down', default=15, metavar='bases', type=Positive(int),
    help='number of bases down stream')
parser.add_argument(
    '--up', default=15, metavar='bases', type=Positive(int),
    help='number of bases up stream')
parser.add_argument(
    'references', action=FileExists,
    help='Fasta file containing references')
parser.add_argument(
    'coordinates', action=FileExists, help='coordinates file')

# Map each canonical base to its integer code: A->0, C->1, G->2, T->3.
bases = dict(zip('ACGT', range(4)))

if __name__ == '__main__':
Esempio n. 18
0
import argparse
import matplotlib as mpl
mpl.use('Agg')  # So we don't need an x server
import matplotlib.pyplot as plt
import os
from taiyaki.cmdargs import Positive

# Command line interface for the loss-plotting script.
parser = argparse.ArgumentParser(
    description='Plot graphs of training loss',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('output', help='Output png file')
parser.add_argument(
    'input_directories', nargs='+',
    help='One or more directories containing files called model.log')
parser.add_argument(
    '--upper_y_limit', default=None, type=Positive(float),
    help='Upper limit of plot y(loss) axis')

if __name__=="__main__":
    args = parser.parse_args()
    plt.figure()
    for training_directory in args.input_directories:
        blocklist = []
        losslist = []
        filepath = training_directory + "/model.log"
        print("Opening", filepath)
        with open(filepath, "r") as f:
            for line in f:
                # The * removes error messges in the log
                if line.startswith('.') and not ('*' in line):
                    splitline = line.split()
                    try: