Example 1
def main():
    args = get_parser().parse_args()

    sys.stderr.write(
        "* Loading references (this may take a while for large genomes)\n")
    references = fasta_file_to_dict(args.reference, filter_ambig=False)

    if args.input_strand_list is None:
        strand_list = None
    else:
        strand_list = readtsv(args.input_strand_list,
                              fields=['read_id'])['read_id']
        sys.stderr.write('* Strand list contains {} reads\n'.format(
            len(strand_list)))

    sys.stderr.write("* Extracting read references using SAM alignment\n")
    with open_file_or_stdout(args.output) as fh:
        for samfile in args.input:
            for name, read_ref in get_refs(samfile,
                                           references,
                                           args.min_coverage,
                                           args.pad,
                                           strand_list=strand_list):
                if args.reverse:
                    read_ref = read_ref[::-1]
                if args.complement:
                    read_ref = complement(read_ref)
                fasta = ">{}\n{}\n".format(name, read_ref)

                fh.write(fasta)
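
Every example in this collection leans on taiyaki's fileio.readtsv. As a rough mental model only (an assumption about its behaviour, not the library's actual implementation), a readtsv-like helper can be sketched as a thin wrapper over numpy.genfromtxt that returns a structured array keyed by column name:

import numpy as np

def readtsv_sketch(fname, fields=None, delimiter='\t'):
    # Hypothetical stand-in for taiyaki.fileio.readtsv: parse a delimited
    # file with a header row into a numpy structured array.
    data = np.genfromtxt(fname, delimiter=delimiter, names=True,
                         dtype=None, encoding='utf-8')
    if fields is not None:
        # Mirror the examples' usage: insist the named columns are present.
        assert all(f in data.dtype.names for f in fields), \
            '{} is missing one of the fields {}'.format(fname, fields)
    return data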
Example 2
def get_column_from_tsv(tsv_file_name, column):
    '''Load a column from a tsv file'''

    if tsv_file_name is not None:
        data = readtsv(tsv_file_name, encoding='utf-8')
        assert column in data.dtype.names, "Strand file does not contain required field {}".format(column)
        return [x for x in data[column]]
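
For illustration, a call against a strand list might look like this (the filename is hypothetical):

# Hypothetical usage: pull the read IDs out of a strand-list TSV.
read_ids = get_column_from_tsv('strand_list.tsv', 'read_id')
print('{} reads in strand list'.format(len(read_ids)))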
Example 3
def get_per_read_params_dict_from_tsv(input_file):
    """Load per read parameter .tsv into a np array and parse into a dictionary

    Args:
        input_file (str): filename including path for the tsv file

    Returns:
        dict : dictionary with keys being UUIDs, values being named
        tuple('per_read_params', 'trim_start trim_end shift scale')"""
    try:
        per_read_params_array = readtsv(
            input_file, ['UUID', 'trim_start', 'trim_end', 'shift', 'scale'])
    except Exception as e:
        sys.stderr.write(
            'Failed to get per-read parameters from {}.\n{}\n'.format(
                input_file, repr(e)))
        return None

    per_read_params_dict = {}
    for row in per_read_params_array:
        try:
            per_read_params_dict[row[0]] = {
                'trim_start': row[1],
                'trim_end': row[2],
                'shift': row[3],
                'scale': row[4]
            }
        except Exception:
            sys.stderr.write(
                "Warning: ignoring incorrect line {} in {}\n".format(
                    row, input_file))

    return per_read_params_dict
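
A usage sketch (the filename is invented for illustration):

# Hypothetical usage: iterate over the per-read trim/scale parameters.
params = get_per_read_params_dict_from_tsv('per_read_params.tsv')
if params is not None:
    for uuid, p in params.items():
        # Each value is a dict with trim_start/trim_end/shift/scale.
        print(uuid, p['trim_start'], p['trim_end'], p['shift'], p['scale'])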
Example 4
def main():
    print("Plots output of predict_squiggle.py")
    print("Usage:")
    print(
        "plot_predict_squiggle_output.py <predict_squiggle_output_file> <output_png_file>"
    )
    if len(sys.argv) < 3:
        print("ERROR: Needs command line arguments!")
    else:
        predict_squiggle_output_file = sys.argv[1]
        plotfile = sys.argv[2]
        t = fileio.readtsv(predict_squiggle_output_file)

        plt.figure(figsize=(16, 5))
        tstart = 0
        for nrow in range(len(t)):
            i, sd, dwell = t['current'][nrow], t['sd'][nrow], t['dwell'][nrow]
            centret = tstart + dwell / 2
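            # bar(x, height, width, bottom): a band of height sd centred
            # vertically on the current level i, one dwell wide in time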
            plt.bar(centret, sd, dwell, i - sd / 2)
            plt.text(centret, i, t['base'][nrow])
            tstart += dwell
        plt.xlabel('time')
        plt.ylabel('current')
        plt.grid()
        plt.savefig(plotfile)
Example 5
def main():
    args = get_parser().parse_args()

    batchdata = {}
    valdata = {}
    for td in args.input_directories:
        batchdata[td] = fileio.readtsv(os.path.join(td, BATCH_LOG_FILENAME))
        valdata[td] = fileio.readtsv(os.path.join(td, VAL_LOG_FILENAME))
        if args.mav is not None:
            batchdata[td]['loss'] = moving_average(
                batchdata[td]['loss'], args.mav)

    # Plot validation and training loss
    plt.figure(figsize=(6, 4.8))
    colour_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    for td, colour in zip(args.input_directories, colour_cycle):
        label = os.path.basename(os.path.normpath(td))
        plt.plot(batchdata[td]['iter'], batchdata[td]['loss'],
                 color=colour, label=label + ' (training)', alpha=0.5,
                 linewidth=0.5)
        if len(valdata[td]['iter']) == 0:
            print(('No validation log data for {}. The first validation ' +
                   'run has likely not completed.').format(td))
            continue
        plt.plot(valdata[td]['iter'], valdata[td]['loss'],
                 color=colour, label=label + ' (validation)', linewidth=0.5)

    plt.grid()
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    if args.upper_y_limit is not None:
        plt.ylim(top=args.upper_y_limit)
    if args.lower_y_limit is not None:
        plt.ylim(bottom=args.lower_y_limit)
    if args.upper_x_limit is not None:
        plt.xlim(right=args.upper_x_limit)
    if args.lower_x_limit is not None:
        plt.xlim(left=args.lower_x_limit)
    leg = plt.legend(loc='upper right')
    for legobj in leg.legendHandles:
        legobj.set_linewidth(4.0)

    if args.mav is not None:
        plt.title('Moving average window = {} iterations'.format(args.mav))
    plt.tight_layout()
    plt.savefig(args.output, dpi=300)
    plt.close()
Example 6
def read_batch_log(filepath):
    t = fileio.readtsv(filepath)
    return {
        't': np.arange(len(t)),
        'training_loss': t['loss'],
        'gradientnorm': t['gradientnorm'],
        'gradientcap': t['gradientcap']
    }
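
A usage sketch (the log path is an assumption; any .tsv with loss, gradientnorm and gradientcap columns will do):

import matplotlib.pyplot as plt

# Hypothetical usage: plot the training loss curve from a batch log.
log = read_batch_log('training_dir/batch.log')
plt.plot(log['t'], log['training_loss'])
plt.xlabel('batch')
plt.ylabel('loss')
plt.savefig('batch_loss.png')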
Example 7
def get_alignment_data(alignment_file):
    """Read alignment summary generated by Guppy or Taiyaki, getting accuracy
    and length of aligned part of read for each read id

    :param alignment_file: file path pointing to either Taiyaki (.samacc)
                           or Guppy (.txt) alignment summary

    :return: tuple (read_ids, accuracies, alignment_lens)
             where read_ids is a numpy array of strs
                   accuracies is a numpy array of floats (0-1)
                   alignment_lens is a numpy array of ints
                      giving the read (i.e. 'strand') alignment length.
                      -1 is used as a marker for null in this array.

    :note: The resulting table may have more than one entry for each read id
           (because there may be more than one possible alignment)

    """
    # Delimiter None accepts space or tab - samaccs are space-separated.
    t = fileio.readtsv(alignment_file, delimiter=None)

    try:
        # Try to read the file as a Guppy alignment summary file
        read_ids = t['read_id']
        accuracies = t['alignment_accuracy']
        alignment_lens = (t['alignment_strand_end']
                        - t['alignment_strand_start'])
        print("Interpreted alignment file as Guppy output")
        accuracies[accuracies < 0] = np.nan
        return read_ids, accuracies, alignment_lens
    except ValueError:
        # Thrown if the required fields are not present in the file
        pass

    try:
        # Try to read the file as a Taiyaki alignment summary
        read_ids = t['query']
        accuracies = t['accuracy']
        # Query length in alignment not available directly in taiyaki summary
        alignment_lens = (t['reference_end']
                        - t['reference_start']
                        + t['insertion']
                        - t['deletion'])
        print("Interpreted alignment file as Taiyaki output")
        return read_ids, accuracies, alignment_lens
    except ValueError:
        pass

    columnlist = list(t.dtype.fields.keys())
    raise Exception("Alignment summary file must contain either columns " +
                    "(read_id, alignment_accuracy, alignment_strand_end, " +
                    "alignment_strand_start) or " +
                    "(query, accuracy, reference_end, reference_start, " +
                    "insertion, deletion)" +
                    ". Columns are {}".format(columnlist))
Example 8
def get_alignment_data(alignment_file):
    """Read alignment summary generated by Guppy or Taiyaki, getting accuracy
    and length of aligned part of read for each read ID

    Note:
        The resulting table may have more than one entry for each read id
        because there may be more than one possible alignment

    Args:
        alignment_file (str): file path pointing to either Taiyaki (.samacc) or
            Guppy (.txt) alignment summary.

    Returns:
        tuple of :class:`ndarray` and :class:`ndarray` and :class:`ndarray`:
            First element of tuple contains the read ID of the reads analysed,
            the second element is the corresponding accuracy of each read, and
            the third element is the alignment length (-1 means unaligned).
    """
    # Delimiter None accepts space or tab - samaccs are space-separated.
    t = fileio.readtsv(alignment_file, delimiter=None)

    try:
        # Try to read the file as a Guppy alignment summary file
        read_ids = t['read_id']
        accuracies = t['alignment_accuracy']
        alignment_lens = (t['alignment_strand_end'] -
                          t['alignment_strand_start'])
        print("Interpreted alignment file as Guppy output")
        accuracies[accuracies < 0] = np.nan
        return read_ids, accuracies, alignment_lens
    except ValueError:
        # Thrown if the required fields are not present in the file
        pass

    try:
        # Try to read the file as a Taiyaki alignment summary
        read_ids = t['query']
        accuracies = t['accuracy']
        # Query length in alignment not available directly in taiyaki summary
        alignment_lens = (t['reference_end'] - t['reference_start'] +
                          t['insertion'] - t['deletion'])
        print("Interpreted alignment file as Taiyaki output")
        return read_ids, accuracies, alignment_lens
    except ValueError:
        pass

    columnlist = list(t.dtype.fields.keys())
    raise Exception("Alignment summary file must contain either columns " +
                    "(read_ids, alignment accuracy, alignment_strand_end, " +
                    "alignment_strand_start) or " +
                    "(id, accuracy, reference_end, reference_start, " +
                    "insertion, deletion  )" +
                    ". Columns are {}".format(columnlist))
Example 9
def get_column_from_tsv(tsv_file_name, column):
    """Load a column from a csv file

    Args:
        tsv_file_name (str) : filename
        column (str) : the column we want

    Returns:
        list : the data from the column
    """
    if tsv_file_name is not None:
        data = readtsv(tsv_file_name, encoding='utf-8')
        assert column in data.dtype.names, (
            "Strand file does not contain required field {}".format(column))
        return [x for x in data[column]]
Example 10
def main():
    args = parser.parse_args()

    AccVals = readtsv(args.combined_read_file)['alignment_accuracy']

    fig, ax = plt.subplots()

    escaped_title = bytes(args.title, "utf-8").decode("unicode_escape")
    ax.set_title(escaped_title, fontsize=7)
    ax.set_xlabel('Accuracy')
    ax.set_ylabel('Reads')

    ax.minorticks_on()
    ax.grid(which='major', linestyle=':')
    ax.grid(which='minor', linestyle=':')

    plt.hist(np.array(AccVals[AccVals >= 0]), bins=args.bins)

    plt.tight_layout()

    plt.savefig(args.output_name)
Example 11
def iterate_fast5_reads(path,
                        strand_list=None,
                        limit=None,
                        verbose=0,
                        recursive=False):
    """Return iterator yielding reads in a directory of fast5 files or a single fast5 file.

    Each read is specified by a tuple (filepath, read_id)
    Files may be single or multi-read fast5s

    You may say, "why not yield an ont_fast5_api object instead of this nasty tuple?"
    I would then say, "yes, I did try that, but it led to unfathomable nastiness when
    I fed these objects in as arguments to multiple processes."

    If strand_list is given, then only return the reads specified, according to
    the following rules:

        (A) If the strand list file has a column 'read_id' and no column 'filename' or 'filename_fast5'
                    then look through all fast5 files in the path and return all reads with read_ids
                    in that column.
        (B) If the strand list file has a column 'filename' or 'filename_fast5' and no column 'read_id'
                    then look through all filenames specified and return all reads in them.
        (C) If the strand list has a column 'filename' or 'filename_fast5' _and_ a column 'read_id'
                    then loop through the rows in the strand list, returning the appropriate tuple
                    for each row. We check that each file exists and contains the read_id.

    :param path: Directory ( or filename for a single file)
    :param strand_list: Path to file containing list of files and/or read ids to iterate over.
    :param limit: Limit number of reads to consider
    :param verbose: an integer. verbose=0 prints no progress messages,
                    verbose=1 prints a message for every file read, and
                    verbose=2 prints the list of files before starting as well.
    :param recursive: Search path recursively for fast5 files.

    Example usage:

    read_iterator = iterate_fast5_reads('directory')
    for read_tuple in read_iterator:
        fname,read_id = read_tuple
        print("Filename=",fname,", read id = ",read_id)
        with fast5_interface.get_fast5_file(fname, 'r') as f5file:
            read = f5file.get_read(read_id)
            dacs = read.get_raw_data()
        print("Length of rawget_file_names data:",len(dacs))
    """
    filepaths, read_ids = None, None

    if strand_list is not None:
        strand_table = readtsv(strand_list)
        if verbose >= 2:
            print("Columns in strand list file:")
            print(strand_table.dtype.names)
        if 'filename' in strand_table.dtype.names:
            filepaths = strand_table['filename']
        elif 'filename_fast5' in strand_table.dtype.names:
            filepaths = strand_table['filename_fast5']
        if 'read_id' in strand_table.dtype.names:
            read_ids = [str(i) for i in strand_table['read_id']]
        # If we get to this point and we haven't got read ids or filenames, then
        # there is nothing in the strand list that we can use (this happens, for
        # example, when the strand list has no header line).
        if filepaths is None and read_ids is None:
            raise Exception(
                "Strand list at {} has no column that can be used: ".format(
                    strand_list) +
                "(it should contain ('filename' or 'filename_fast5') or " +
                "'read_id', or both a filename column and a read_id column)")
        # The strand list supplies filenames, not paths, so we supply the rest of the path
        if filepaths is not None:
            filepaths = [os.path.join(path, x) for x in filepaths]

    if (filepaths is not None) and (read_ids is not None):
        # This is the case (C) above. Both filenames and read_ids come from the strandlist
        # and we therefore know which read_id goes with which file
        for y in iterate_file_read_pairs(filepaths, read_ids, limit, verbose):
            yield y
        return

    if filepaths is None:
        # Filenames not supplied by strand list, so we get them from the path
        if os.path.isdir(path):
            filepaths = ont_fast5_api.conversion_tools.conversion_utils.get_fast5_file_list(
                path, recursive=recursive)
        else:
            filepaths = [path]

    for y in iterate_files_reads_unpaired(filepaths, read_ids, limit, verbose):
        yield y
Example 12
parser.add_argument('combined_read_file',
                    action=FileExists,
                    help='Combined read file to get data from')
parser.add_argument('--bins',
                    default=100,
                    type=Positive(int),
                    help='Number of bins for histogram')
parser.add_argument('--title', default='', help='Figure title')
parser.add_argument('--output_name',
                    default='basecaller_histogram.png',
                    help='Output file name')

if __name__ == "__main__":
    args = parser.parse_args()

    AccVals = readtsv(args.combined_read_file)['alignment_accuracy']

    fig, ax = plt.subplots()

    ax.set_title(args.title)
    ax.set_xlabel('Accuracy')
    ax.set_ylabel('Reads')

    ax.minorticks_on()
    ax.grid(which='major', linestyle=':')
    ax.grid(which='minor', linestyle=':')

    plt.hist(np.array(AccVals[AccVals >= 0]), bins=args.bins)

    plt.tight_layout()

    plt.savefig(args.output_name)
Example 13
def main():
    print("Plots summary of chunk log.")
    print("Usage:")
    print("plot_chunk_log.py <chunk_log_file> <output_file>")
    if len(sys.argv) < 3:
        print("ERROR: Needs command line arguments!")
    else:
        chunk_log_file = sys.argv[1]
        plotfile = sys.argv[2]
        t = fileio.readtsv(chunk_log_file)

        plt.figure(figsize=(16, 12))

        plt.subplot(2, 2, 1)
        plt.title('Mean dwells of chunks sampled to get filter params')
        f = (t['iteration'] == -1) & (t['status'] == 'pass')
        bases = t['chunk_len_bases'][f]
        samples = t['chunk_len_samples'][f]
        filter_sample_length = len(bases)
        meandwells = samples / (bases + 0.0001)
        plt.hist(meandwells, bins=100, log=True)
        plt.grid()

        # Remove the part that refers to the sampling for filter params
        t = t[filter_sample_length:]

        plt.subplot(2, 2, 2)
        plt.title('Lengths of accepted and rejected chunks')
        status_choices = np.unique(t['status'])
        # Need to do 'pass' first - otherwise it overwhelms everything
        status_choices = list(status_choices[status_choices != 'pass'])
        status_choices = ['pass'] + status_choices
        for status in status_choices:
            filt = (t['status'] == status)
            bases = t['chunk_len_bases'][filt]
            samples = t['chunk_len_samples'][filt]
            print("Status", status, "number of chunks=", len(bases))
            plt.scatter(bases, samples, label=status, s=4)

        plt.grid()
        plt.xlabel('Length in bases')
        plt.ylabel('Length in samples')
        plt.legend(loc='upper left', framealpha=0.3)

        for nplot, scale in enumerate('log linear'.split()):
            plt.subplot(2, 2, nplot + 3, xscale=scale, yscale=scale)
            plt.title('Max and mean dwells')
            status_choices = np.unique(t['status'])
            # Need to do 'pass' first - otherwise it overwhelms everything
            status_choices = list(status_choices[status_choices != 'pass'])
            status_choices = ['pass'] + status_choices
            for status in status_choices:
                filt = (t['status'] == status)
                bases = t['chunk_len_bases'][filt]
                samples = t['chunk_len_samples'][filt]
                count = len(bases)
                meandwells = samples / (bases + 0.0001)
                maxdwells = t['max_dwell'][filt]
                plt.scatter(meandwells,
                            maxdwells,
                            label=status + ' (' + str(count) + ')',
                            s=4,
                            alpha=0.5)

            plt.grid()
            plt.xlabel('Mean dwell')
            plt.ylabel('Max dwell')
            plt.legend(loc='lower right', framealpha=0.3)

        plt.savefig(plotfile)
Example 14
#!/usr/bin/env python3
import matplotlib as mpl
mpl.use('Agg')  # So we don't need an x server
import matplotlib.pyplot as plt
import sys
from taiyaki import fileio

print("Plots output of predict_squiggle.py")
print("Usage:")
print("plot_predict_squiggle_output.py <predict_squiggle_output_file> <output_png_file>")
if len(sys.argv) < 3:
    print("ERROR: Needs command line arguments!")
else:
    predict_squiggle_output_file = sys.argv[1]
    plotfile = sys.argv[2]
    t = fileio.readtsv(predict_squiggle_output_file)

    plt.figure(figsize=(16, 5))
    tstart = 0
    for nrow in range(len(t)):
        i, sd, dwell = t['current'][nrow], t['sd'][nrow], t['dwell'][nrow]
        centret = tstart + dwell / 2
        plt.bar(centret, sd, dwell, i - sd / 2)
        plt.text(centret, i, t['base'][nrow])
        tstart += dwell
    plt.xlabel('time')
    plt.ylabel('current')
    plt.grid()
    plt.savefig(plotfile)
Example 15
def iterate_fast5_reads(
        path, strand_list=None, limit=None, verbose=0, recursive=False):
    """ Iterate over reads in a directory of fast5 files or a single fast5
        file. Files may be single or multi-read fast5s.

    Args:
        path (str): Directory (or filename for a single file)
        strand_list (str or None, optional): Path to file containing list of
            files and/or read ids to iterate over (as described in notes) or
            None for all files and reads
        limit (int or None, optional): Maximum number of reads to consider or
            None for all
        verbose (int, optional): 0 prints no messages, 1 prints a message for
            every file read, 2 prints the list of files before starting as well
        recursive (bool, optional): Search path recursively for fast5 files

    Yields:
        (tuple(str, str)): filepath and read_id for each read. You may say,
            "why not yield an ont_fast_api object instead of a nasty tuple?" I
            would say: "yes, I tried that, but it led to unfathomable nastiness
            when I fed these objects in as arguments to multiple processes."

    Notes:
        If strand_list is given, then only return the reads specified, according
        to the following rules:

        (A) If the strand list file has a column 'read_id' and no column
            'filename' or 'filename_fast5' then look through all fast5 files in
            the path and return all reads with read_ids in that column.
        (B) If the strand list file has a column 'filename' or 'filename_fast5'
            and no column 'read_id' then look through all filenames specified
            and return all reads in them.
        (C) If the strand list has a column 'filename' or 'filename_fast5'
            _and_ a column 'read_id' then loop through the rows in the strand
            list, returning the appropriate tuple for each row. We check that
            each file exists and contains the read_id.

    Example:
        read_iterator = iterate_fast5_reads('directory')
        for read_tuple in read_iterator:
            fname,read_id = read_tuple
            print("Filename=",fname,", read id = ",read_id)
            with fast5_interface.get_fast5_file(fname, 'r') as f5file:
                read = f5file.get_read(read_id)
                dacs = read.get_raw_data()
            print("Length of rawget_file_names data:",len(dacs))
    """
    filepaths, read_ids = None, None

    if strand_list is not None:
        strand_table = readtsv(strand_list)
        if verbose >= 2:
            print("Columns in strand list file:")
            print(strand_table.dtype.names)
        if 'filename' in strand_table.dtype.names:
            filepaths = strand_table['filename']
        elif 'filename_fast5' in strand_table.dtype.names:
            filepaths = strand_table['filename_fast5']
        if 'read_id' in strand_table.dtype.names:
            read_ids = [str(i) for i in strand_table['read_id']]
        # If we get to this point and we haven't got read ids or filenames,
        # then there is nothing in the strand list that we can use (this
        # happens, for example, when the strand list has no header line).
        if filepaths is None and read_ids is None:
            raise Exception((
                "Strand list at {} has no column that can be used: (it " +
                "should contain ('filename' or 'filename_fast5') or " +
                "'read_id', or both a filename column and a read_id " +
                "column)").format(strand_list))
        # The strand list supplies filenames, not paths, so we supply the rest
        # of the path
        if filepaths is not None:
            filepaths = [os.path.join(path, x) for x in filepaths]

    if (filepaths is not None) and (read_ids is not None):
        # This is the case (C) above. Both filenames and read_ids come from the
        # strandlist and we therefore know which read_id goes with which file
        for y in iterate_file_read_pairs(filepaths, read_ids, limit, verbose):
            yield y
        return

    if filepaths is None:
        # Filenames not supplied by strand list, so we get them from the path
        if os.path.isdir(path):
            filepaths = get_fast5_file_list(path, recursive=recursive)
        else:
            filepaths = [path]

    for y in iterate_files_reads_unpaired(filepaths, read_ids, limit, verbose):
        yield y
Example 16
parser.add_argument('--up',  # argument name inferred from args.up below
                    type=Positive(int),
                    default=15,
                    help='number of bases upstream')
parser.add_argument('references',
                    action=FileExists,
                    help='Fasta file containing references')
parser.add_argument('coordinates', action=FileExists, help='coordinates file')

bases = {b: i for i, b in enumerate('ACGT')}

if __name__ == '__main__':
    args = parser.parse_args()
    args.up += 1

    refdict = fasta_file_to_dict(args.references)
    coordinates = readtsv(args.coordinates)

    background_counts = np.zeros(len(bases), dtype=float)
    if args.refbackground:
        for ref in refdict.values():
            refstr = ref.decode('ascii')
            background_counts += [refstr.count(b) for b in bases.keys()]

    frags = []
    for coord in coordinates:
        readname, pos = coord['filename'], coord['pos']
        readname = readname.decode('ascii')
        if pos < args.down:
            continue
        if readname not in refdict:
            continue
Example 17
import matplotlib as mpl
mpl.use('Agg')  # So we don't need an x server
import matplotlib.pyplot as plt
import numpy as np
import sys
from taiyaki import fileio

print("Plots summary of chunk log.")
print("Usage:")
print("plot_chunk_log.py <chunk_log_file> <output_file>")
if len(sys.argv) < 3:
    print("ERROR: Needs command line arguments!")
else:
    chunk_log_file = sys.argv[1]
    plotfile = sys.argv[2]
    t = fileio.readtsv(chunk_log_file)

    plt.figure(figsize=(16, 12))

    plt.subplot(2, 2, 1)
    plt.title('Mean dwells of chunks sampled to get filter params')
    f = (t['iteration'] == -1) & (t['status'] == 'pass')
    bases = t['chunk_len_bases'][f]
    samples = t['chunk_len_samples'][f]
    filter_sample_length = len(bases)
    meandwells = samples / (bases + 0.0001)
    plt.hist(meandwells, bins=100, log=True)
    plt.grid()

    # Remove the part that refers to the sampling for filter params
    t = t[filter_sample_length:]