def main():
    args = get_parser().parse_args()

    sys.stderr.write(
        "* Loading references (this may take a while for large genomes)\n")
    references = fasta_file_to_dict(args.reference, filter_ambig=False)

    if args.input_strand_list is None:
        strand_list = None
    else:
        strand_list = readtsv(args.input_strand_list,
                              fields=['read_id'])['read_id']
        sys.stderr.write('* Strand list contains {} reads\n'.format(
            len(strand_list)))

    sys.stderr.write("* Extracting read references using SAM alignment\n")
    with open_file_or_stdout(args.output) as fh:
        for samfile in args.input:
            for name, read_ref in get_refs(
                    samfile, references, args.min_coverage, args.pad,
                    strand_list=strand_list):
                if args.reverse:
                    read_ref = read_ref[::-1]
                if args.complement:
                    read_ref = complement(read_ref)
                fasta = ">{}\n{}\n".format(name, read_ref)
                fh.write(fasta)
def get_column_from_tsv(tsv_file_name, column):
    '''Load a column from a tsv file'''
    if tsv_file_name is not None:
        data = readtsv(tsv_file_name, encoding='utf-8')
        assert column in data.dtype.names, \
            "Strand file does not contain required field {}".format(column)
        return [x for x in data[column]]
def get_per_read_params_dict_from_tsv(input_file):
    """Load per-read parameter .tsv into a np array and parse into a
    dictionary

    Args:
        input_file (str): filename including path for the tsv file

    Returns:
        dict: dictionary with keys being UUIDs and values being dicts with
            keys 'trim_start', 'trim_end', 'shift' and 'scale', or None if
            the file could not be read
    """
    try:
        per_read_params_array = readtsv(
            input_file, ['UUID', 'trim_start', 'trim_end', 'shift', 'scale'])
    except Exception as e:
        sys.stderr.write(
            'Failed to get per-read parameters from {}.\n{}\n'.format(
                input_file, repr(e)))
        return None

    per_read_params_dict = {}
    for row in per_read_params_array:
        try:
            per_read_params_dict[row[0]] = {
                'trim_start': row[1],
                'trim_end': row[2],
                'shift': row[3],
                'scale': row[4]
            }
        except Exception:
            sys.stderr.write(
                "Warning: ignoring incorrect line {} in {}\n".format(
                    row, input_file))
    return per_read_params_dict
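# Hypothetical usage sketch for get_per_read_params_dict_from_tsv: the path
# 'per_read_params.tsv' is an assumed example filename, not one mandated by
# the function above.
params = get_per_read_params_dict_from_tsv('per_read_params.tsv')
if params is not None:
    for uuid, p in params.items():
        # Each value is a dict of trim and scaling parameters for one read
        print(uuid, p['trim_start'], p['trim_end'], p['shift'], p['scale'])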
def main():
    print("Plots output of predict_squiggle.py")
    print("Usage:")
    print("plot_predict_squiggle_output.py "
          "<predict_squiggle_output_file> <output_png_file>")
    if len(sys.argv) < 3:
        print("ERROR: Needs command line arguments!")
        return
    predict_squiggle_output_file = sys.argv[1]
    plotfile = sys.argv[2]
    t = fileio.readtsv(predict_squiggle_output_file)

    plt.figure(figsize=(16, 5))
    tstart = 0
    for nrow in range(len(t)):
        i, sd, dwell = t['current'][nrow], t['sd'][nrow], t['dwell'][nrow]
        centret = tstart + dwell / 2
        plt.bar(centret, sd, dwell, i - sd / 2)
        plt.text(centret, i, t['base'][nrow])
        tstart += dwell
    plt.xlabel('time')
    plt.ylabel('current')
    plt.grid()
    plt.savefig(plotfile)
def main():
    args = get_parser().parse_args()

    batchdata = {}
    valdata = {}
    for td in args.input_directories:
        batchdata[td] = fileio.readtsv(os.path.join(td, BATCH_LOG_FILENAME))
        valdata[td] = fileio.readtsv(os.path.join(td, VAL_LOG_FILENAME))
        if args.mav is not None:
            batchdata[td]['loss'] = moving_average(
                batchdata[td]['loss'], args.mav)

    # Plot validation and training loss
    plt.figure(figsize=(6, 4.8))
    colour_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    for td, colour in zip(args.input_directories, colour_cycle):
        label = os.path.basename(os.path.normpath(td))
        plt.plot(batchdata[td]['iter'], batchdata[td]['loss'], color=colour,
                 label=label + ' (training)', alpha=0.5, linewidth=0.5)
        if len(valdata[td]['iter']) == 0:
            print(('No validation log data for {}. The first validation ' +
                   'run has likely not completed.').format(td))
            continue
        plt.plot(valdata[td]['iter'], valdata[td]['loss'], color=colour,
                 label=label + ' (validation)', linewidth=0.5)

    plt.grid()
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    if args.upper_y_limit is not None:
        plt.ylim(top=args.upper_y_limit)
    if args.lower_y_limit is not None:
        plt.ylim(bottom=args.lower_y_limit)
    if args.upper_x_limit is not None:
        plt.xlim(right=args.upper_x_limit)
    if args.lower_x_limit is not None:
        plt.xlim(left=args.lower_x_limit)

    leg = plt.legend(loc='upper right')
    for legobj in leg.legendHandles:
        legobj.set_linewidth(4.0)
    if args.mav is not None:
        plt.title('Moving average window = {} iterations'.format(args.mav))
    plt.tight_layout()
    plt.savefig(args.output, dpi=300)
    plt.close()
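# moving_average is used in main() above but defined elsewhere in the
# codebase. A minimal compatible sketch, assuming a simple centred boxcar
# window; the project's own implementation may handle edges differently.
import numpy as np


def moving_average(x, window):
    """Smooth x with a centred boxcar of the given width (sketch only)."""
    kernel = np.ones(window) / window
    return np.convolve(np.asarray(x, dtype=float), kernel, mode='same')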
def read_batch_log(filepath):
    """Read a training batch log .tsv and return its columns as a dict of
    arrays, keyed 't', 'training_loss', 'gradientnorm' and 'gradientcap'."""
    t = fileio.readtsv(filepath)
    return {
        't': np.arange(len(t)),
        'training_loss': t['loss'],
        'gradientnorm': t['gradientnorm'],
        'gradientcap': t['gradientcap']
    }
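# Hypothetical usage of read_batch_log: the path 'training_dir/batch.log'
# is an assumed example; the real log filename is set by the training
# script that produced it.
import matplotlib.pyplot as plt

log = read_batch_log('training_dir/batch.log')
plt.plot(log['t'], log['training_loss'])
plt.xlabel('batch')
plt.ylabel('training loss')
plt.savefig('training_loss.png')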
def get_alignment_data(alignment_file):
    """Read alignment summary generated by Guppy or Taiyaki, getting
    accuracy and length of aligned part of read for each read id

    :param alignment_file: file path pointing to either Taiyaki (.samacc)
        or Guppy (.txt) alignment summary

    :return: tuple (read_ids, accuracies, alignment_lens) where
        read_ids is a numpy array of strs,
        accuracies is a numpy array of floats (0-1),
        alignment_lens is a numpy array of ints giving the read (i.e.
        'strand') alignment length. -1 is used as a marker for null in
        this array.

    :note: The resulting table may have more than one entry for each read
        id (because there may be more than one possible alignment)
    """
    # Delimiter None accepts space or tab - samaccs are space-separated.
    t = fileio.readtsv(alignment_file, delimiter=None)
    try:
        # Try to read the file as a Guppy alignment summary file
        read_ids = t['read_id']
        accuracies = t['alignment_accuracy']
        alignment_lens = (t['alignment_strand_end'] -
                          t['alignment_strand_start'])
        print("Interpreted alignment file as Guppy output")
        accuracies[accuracies < 0] = np.nan
        return read_ids, accuracies, alignment_lens
    except ValueError:
        # Thrown if the required fields are not present in the file
        pass
    try:
        # Try to read the file as a Taiyaki alignment summary
        read_ids = t['query']
        accuracies = t['accuracy']
        # Query length in alignment not available directly in taiyaki summary
        alignment_lens = (t['reference_end'] - t['reference_start'] +
                          t['insertion'] - t['deletion'])
        print("Interpreted alignment file as Taiyaki output")
        return read_ids, accuracies, alignment_lens
    except ValueError:
        pass
    columnlist = list(t.dtype.fields.keys())
    raise Exception("Alignment summary file must contain either columns " +
                    "(read_id, alignment_accuracy, alignment_strand_end, " +
                    "alignment_strand_start) or " +
                    "(query, accuracy, reference_end, reference_start, " +
                    "insertion, deletion). Columns are {}".format(columnlist))
def get_alignment_data(alignment_file):
    """Read alignment summary generated by Guppy or Taiyaki, getting
    accuracy and length of aligned part of read for each read ID

    Note:
        The resulting table may have more than one entry for each read id
        because there may be more than one possible alignment

    Args:
        alignment_file (str): file path pointing to either Taiyaki
            (.samacc) or Guppy (.txt) alignment summary.

    Returns:
        tuple of :class:`ndarray` and :class:`ndarray` and
        :class:`ndarray`: First element of tuple contains the read ID of
        the reads analysed, the second element is the corresponding
        accuracy of each read, and the third element is the alignment
        length (-1 means unaligned).
    """
    # Delimiter None accepts space or tab - samaccs are space-separated.
    t = fileio.readtsv(alignment_file, delimiter=None)
    try:
        # Try to read the file as a Guppy alignment summary file
        read_ids = t['read_id']
        accuracies = t['alignment_accuracy']
        alignment_lens = (t['alignment_strand_end'] -
                          t['alignment_strand_start'])
        print("Interpreted alignment file as Guppy output")
        accuracies[accuracies < 0] = np.nan
        return read_ids, accuracies, alignment_lens
    except ValueError:
        # Thrown if the required fields are not present in the file
        pass
    try:
        # Try to read the file as a Taiyaki alignment summary
        read_ids = t['query']
        accuracies = t['accuracy']
        # Query length in alignment not available directly in taiyaki summary
        alignment_lens = (t['reference_end'] - t['reference_start'] +
                          t['insertion'] - t['deletion'])
        print("Interpreted alignment file as Taiyaki output")
        return read_ids, accuracies, alignment_lens
    except ValueError:
        pass
    columnlist = list(t.dtype.fields.keys())
    raise Exception("Alignment summary file must contain either columns " +
                    "(read_id, alignment_accuracy, alignment_strand_end, " +
                    "alignment_strand_start) or " +
                    "(query, accuracy, reference_end, reference_start, " +
                    "insertion, deletion). Columns are {}".format(columnlist))
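# Hypothetical usage of get_alignment_data: 'alignment_summary.txt' is an
# assumed path. nanmedian skips the NaNs that mark negative accuracies in
# Guppy summaries.
read_ids, accuracies, alignment_lens = get_alignment_data(
    'alignment_summary.txt')
print('{} alignments, median accuracy {:.3f}'.format(
    len(read_ids), np.nanmedian(accuracies)))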
def get_column_from_tsv(tsv_file_name, column):
    """Load a column from a tsv file

    Args:
        tsv_file_name (str): filename
        column (str): the column we want

    Returns:
        list: the data from the column
    """
    if tsv_file_name is not None:
        data = readtsv(tsv_file_name, encoding='utf-8')
        assert column in data.dtype.names, (
            "Strand file does not contain required field {}".format(column))
        return [x for x in data[column]]
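# Hypothetical usage of get_column_from_tsv: 'strand_list.txt' and its
# 'read_id' column are assumed examples.
read_ids = get_column_from_tsv('strand_list.txt', 'read_id')
print('Strand list contains {} reads'.format(len(read_ids)))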
def main():
    args = parser.parse_args()
    acc_vals = readtsv(args.combined_read_file)['alignment_accuracy']

    fig, ax = plt.subplots()
    escaped_title = bytes(args.title, "utf-8").decode("unicode_escape")
    ax.set_title(escaped_title, fontsize=7)
    ax.set_xlabel('Accuracy')
    ax.set_ylabel('Reads')
    ax.minorticks_on()
    ax.grid(which='major', linestyle=':')
    ax.grid(which='minor', linestyle=':')
    plt.hist(np.array(acc_vals[acc_vals >= 0]), bins=args.bins)
    plt.tight_layout()
    plt.savefig(args.output_name)
def iterate_fast5_reads(path, strand_list=None, limit=None, verbose=0,
                        recursive=False):
    """Return iterator yielding reads in a directory of fast5 files or a
    single fast5 file.

    Each read is specified by a tuple (filepath, read_id).
    Files may be single or multi-read fast5s.

    You may say, "why not yield an ont_fast_api object instead of this
    nasty tuple?" I would then say, "yes, I did try that, but it led to
    unfathomable nastiness when I fed these objects in as arguments to
    multiple processes."

    If strand_list is given, then only return the reads specified,
    according to the following rules:

    (A) If the strand list file has a column 'read_id' and no column
        'filename' or 'filename_fast5' then look through all fast5 files
        in the path and return all reads with read_ids in that column.
    (B) If the strand list file has a column 'filename' or
        'filename_fast5' and no column 'read_id' then look through all
        filenames specified and return all reads in them.
    (C) If the strand list has a column 'filename' or 'filename_fast5'
        _and_ a column 'read_id' then loop through the rows in the strand
        list, returning the appropriate tuple for each row. We check that
        each file exists and contains the read_id.

    :param path: Directory (or filename for a single file)
    :param strand_list: Path to file containing list of files and/or read
        ids to iterate over.
    :param limit: Limit number of reads to consider
    :param verbose: An integer. verbose=0 prints no progress messages,
        verbose=1 prints a message for every file read, verbose=2 prints
        the list of files before starting as well.
    :param recursive: Search path recursively for fast5 files.

    Example usage:

        read_iterator = iterate_fast5_reads('directory')
        for fname, read_id in read_iterator:
            print("Filename =", fname, ", read id =", read_id)
            with fast5_interface.get_fast5_file(fname, 'r') as f5file:
                read = f5file.get_read(read_id)
                dacs = read.get_raw_data()
                print("Length of raw data:", len(dacs))
    """
    filepaths, read_ids = None, None
    if strand_list is not None:
        strand_table = readtsv(strand_list)
        if verbose >= 2:
            print("Columns in strand list file:")
            print(strand_table.dtype.names)
        if 'filename' in strand_table.dtype.names:
            filepaths = strand_table['filename']
        elif 'filename_fast5' in strand_table.dtype.names:
            filepaths = strand_table['filename_fast5']
        if 'read_id' in strand_table.dtype.names:
            read_ids = [str(i) for i in strand_table['read_id']]

        # If we get to this point and we haven't got read ids or filenames,
        # then there is nothing in the strand list that we can use (this
        # happens, for example, when the strand list has no header line).
        if filepaths is None and read_ids is None:
            raise Exception(
                "Strand list at {} has no column that can be used: ".format(
                    strand_list) +
                "(it should contain ('filename' or 'filename_fast5') or " +
                "'read_id', or both a filename column and a read_id column)")

        # The strand list supplies filenames, not paths, so we supply the
        # rest of the path
        if filepaths is not None:
            filepaths = [os.path.join(path, x) for x in filepaths]

    if (filepaths is not None) and (read_ids is not None):
        # This is case (C) above. Both filenames and read_ids come from the
        # strand list, so we know which read_id goes with which file.
        for y in iterate_file_read_pairs(filepaths, read_ids, limit,
                                         verbose):
            yield y
        return

    if filepaths is None:
        # Filenames not supplied by strand list, so we get them from the
        # path
        if os.path.isdir(path):
            filepaths = ont_fast5_api.conversion_tools.conversion_utils.\
                get_fast5_file_list(path, recursive=recursive)
        else:
            filepaths = [path]

    for y in iterate_files_reads_unpaired(filepaths, read_ids, limit,
                                          verbose):
        yield y
parser.add_argument('combined_read_file', action=FileExists,
                    help='Combined read file to get data from')
parser.add_argument('--bins', default=100, type=Positive(int),
                    help='Number of bins for histogram')
parser.add_argument('--title', default='', help='Figure title')
parser.add_argument('--output_name', default='basecaller_histogram.png',
                    help='Output file name')


if __name__ == "__main__":
    args = parser.parse_args()
    acc_vals = readtsv(args.combined_read_file)['alignment_accuracy']

    fig, ax = plt.subplots()
    ax.set_title(args.title)
    ax.set_xlabel('Accuracy')
    ax.set_ylabel('Reads')
    ax.minorticks_on()
    ax.grid(which='major', linestyle=':')
    ax.grid(which='minor', linestyle=':')
    plt.hist(np.array(acc_vals[acc_vals >= 0]), bins=args.bins)
    plt.tight_layout()
    plt.savefig(args.output_name)
def main():
    print("Plots summary of chunk log.")
    print("Usage:")
    print("plot_chunk_log.py <chunk_log_file> <output_file>")
    if len(sys.argv) < 3:
        print("ERROR: Needs command line arguments!")
        return
    chunk_log_file = sys.argv[1]
    plotfile = sys.argv[2]
    t = fileio.readtsv(chunk_log_file)

    plt.figure(figsize=(16, 12))

    plt.subplot(2, 2, 1)
    plt.title('Mean dwells of chunks sampled to get filter params')
    f = (t['iteration'] == -1) & (t['status'] == 'pass')
    bases = t['chunk_len_bases'][f]
    samples = t['chunk_len_samples'][f]
    filter_sample_length = len(bases)
    meandwells = samples / (bases + 0.0001)
    plt.hist(meandwells, bins=100, log=True)
    plt.grid()

    # Remove the part that refers to the sampling for filter params
    t = t[filter_sample_length:]

    plt.subplot(2, 2, 2)
    plt.title('Lengths of accepted and rejected chunks')
    status_choices = np.unique(t['status'])
    # Need to do 'pass' first - otherwise it overwhelms everything
    status_choices = list(status_choices[status_choices != 'pass'])
    status_choices = ['pass'] + status_choices
    for status in status_choices:
        filt = (t['status'] == status)
        bases = t['chunk_len_bases'][filt]
        samples = t['chunk_len_samples'][filt]
        print("Status", status, "number of chunks=", len(bases))
        plt.scatter(bases, samples, label=status, s=4)
    plt.grid()
    # Bases are plotted on the x axis and samples on the y axis
    plt.xlabel('Length in bases')
    plt.ylabel('Length in samples')
    plt.legend(loc='upper left', framealpha=0.3)

    for nplot, scale in enumerate(['log', 'linear']):
        plt.subplot(2, 2, nplot + 3, xscale=scale, yscale=scale)
        plt.title('Max and mean dwells')
        status_choices = np.unique(t['status'])
        # Need to do 'pass' first - otherwise it overwhelms everything
        status_choices = list(status_choices[status_choices != 'pass'])
        status_choices = ['pass'] + status_choices
        for status in status_choices:
            filt = (t['status'] == status)
            bases = t['chunk_len_bases'][filt]
            samples = t['chunk_len_samples'][filt]
            count = len(bases)
            meandwells = samples / (bases + 0.0001)
            maxdwells = t['max_dwell'][filt]
            plt.scatter(meandwells, maxdwells,
                        label=status + ' (' + str(count) + ')', s=4,
                        alpha=0.5)
        plt.grid()
        plt.xlabel('Mean dwell')
        plt.ylabel('Max dwell')
        plt.legend(loc='lower right', framealpha=0.3)

    plt.savefig(plotfile)
#!/usr/bin/env python3
import matplotlib as mpl
mpl.use('Agg')  # So we don't need an x server
import matplotlib.pyplot as plt
import sys

from taiyaki import fileio

print("Plots output of predict_squiggle.py")
print("Usage:")
print("plot_predict_squiggle_output.py "
      "<predict_squiggle_output_file> <output_png_file>")

if len(sys.argv) < 3:
    print("ERROR: Needs command line arguments!")
else:
    predict_squiggle_output_file = sys.argv[1]
    plotfile = sys.argv[2]
    t = fileio.readtsv(predict_squiggle_output_file)

    plt.figure(figsize=(16, 5))
    tstart = 0
    for nrow in range(len(t)):
        i, sd, dwell = t['current'][nrow], t['sd'][nrow], t['dwell'][nrow]
        centret = tstart + dwell / 2
        plt.bar(centret, sd, dwell, i - sd / 2)
        plt.text(centret, i, t['base'][nrow])
        tstart += dwell
    plt.xlabel('time')
    plt.ylabel('current')
    plt.grid()
    plt.savefig(plotfile)
def iterate_fast5_reads(
        path, strand_list=None, limit=None, verbose=0, recursive=False):
    """Iterate over reads in a directory of fast5 files or a single fast5
    file. Files may be single or multi-read fast5s.

    Args:
        path (str): Directory (or filename for a single file)
        strand_list (str or None, optional): Path to file containing list
            of files and/or read ids to iterate over (as described in
            notes) or None for all files and reads
        limit (int or None, optional): Maximum number of reads to consider
            or None for all
        verbose (int, optional): 0 prints no messages, 1 prints a message
            for every file read, 2 prints the list of files before
            starting as well
        recursive (bool, optional): Search path recursively for fast5 files

    Yields:
        tuple(str, str): filepath and read_id for each read.

        You may say, "why not yield an ont_fast_api object instead of a
        nasty tuple?" I would say: "yes, I tried that, but it led to
        unfathomable nastiness when I fed these objects in as arguments
        to multiple processes."

    Notes:
        If strand_list is given, then only return the reads specified,
        according to the following rules:

        (A) If the strand list file has a column 'read_id' and no column
            'filename' or 'filename_fast5' then look through all fast5
            files in the path and return all reads with read_ids in that
            column.
        (B) If the strand list file has a column 'filename' or
            'filename_fast5' and no column 'read_id' then look through
            all filenames specified and return all reads in them.
        (C) If the strand list has a column 'filename' or
            'filename_fast5' _and_ a column 'read_id' then loop through
            the rows in the strand list, returning the appropriate tuple
            for each row. We check that each file exists and contains
            the read_id.

    Example:
        read_iterator = iterate_fast5_reads('directory')
        for fname, read_id in read_iterator:
            print("Filename =", fname, ", read id =", read_id)
            with fast5_interface.get_fast5_file(fname, 'r') as f5file:
                read = f5file.get_read(read_id)
                dacs = read.get_raw_data()
                print("Length of raw data:", len(dacs))
    """
    filepaths, read_ids = None, None
    if strand_list is not None:
        strand_table = readtsv(strand_list)
        if verbose >= 2:
            print("Columns in strand list file:")
            print(strand_table.dtype.names)
        if 'filename' in strand_table.dtype.names:
            filepaths = strand_table['filename']
        elif 'filename_fast5' in strand_table.dtype.names:
            filepaths = strand_table['filename_fast5']
        if 'read_id' in strand_table.dtype.names:
            read_ids = [str(i) for i in strand_table['read_id']]

        # If we get to this point and we haven't got read ids or filenames,
        # then there is nothing in the strand list that we can use (this
        # happens, for example, when the strand list has no header line).
        if filepaths is None and read_ids is None:
            raise Exception((
                "Strand list at {} has no column that can be used: (it " +
                "should contain ('filename' or 'filename_fast5') or " +
                "'read_id', or both a filename column and a read_id " +
                "column)").format(strand_list))

        # The strand list supplies filenames, not paths, so we supply the
        # rest of the path
        if filepaths is not None:
            filepaths = [os.path.join(path, x) for x in filepaths]

    if (filepaths is not None) and (read_ids is not None):
        # This is case (C) above. Both filenames and read_ids come from the
        # strand list, so we know which read_id goes with which file.
        for y in iterate_file_read_pairs(filepaths, read_ids, limit,
                                         verbose):
            yield y
        return

    if filepaths is None:
        # Filenames not supplied by strand list, so we get them from the
        # path
        if os.path.isdir(path):
            filepaths = get_fast5_file_list(path, recursive=recursive)
        else:
            filepaths = [path]

    for y in iterate_files_reads_unpaired(filepaths, read_ids, limit,
                                          verbose):
        yield y
parser.add_argument('--up',
                    type=Positive(int), default=15,
                    help='number of bases up stream')
parser.add_argument('references', action=FileExists,
                    help='Fasta file containing references')
parser.add_argument('coordinates', action=FileExists,
                    help='coordinates file')

bases = {b: i for i, b in enumerate('ACGT')}


if __name__ == '__main__':
    args = parser.parse_args()
    args.up += 1

    refdict = fasta_file_to_dict(args.references)
    coordinates = readtsv(args.coordinates)

    background_counts = np.zeros(len(bases), dtype=float)
    if args.refbackground:
        for ref in refdict.values():
            refstr = ref.decode('ascii')
            background_counts += [refstr.count(b) for b in bases.keys()]

    frags = []
    for coord in coordinates:
        readname, pos = coord['filename'], coord['pos']
        readname = readname.decode('ascii')
        if pos < args.down:
            continue
        if readname not in refdict:
            continue
import matplotlib as mpl
mpl.use('Agg')  # So we don't need an x server
import matplotlib.pyplot as plt
import numpy as np
import sys

from taiyaki import fileio

print("Plots summary of chunk log.")
print("Usage:")
print("plot_chunk_log.py <chunk_log_file> <output_file>")

if len(sys.argv) < 3:
    print("ERROR: Needs command line arguments!")
else:
    chunk_log_file = sys.argv[1]
    plotfile = sys.argv[2]
    t = fileio.readtsv(chunk_log_file)

    plt.figure(figsize=(16, 12))

    plt.subplot(2, 2, 1)
    plt.title('Mean dwells of chunks sampled to get filter params')
    f = (t['iteration'] == -1) & (t['status'] == 'pass')
    bases = t['chunk_len_bases'][f]
    samples = t['chunk_len_samples'][f]
    filter_sample_length = len(bases)
    meandwells = samples / (bases + 0.0001)
    plt.hist(meandwells, bins=100, log=True)
    plt.grid()

    # Remove the part that refers to the sampling for filter params
    t = t[filter_sample_length:]