def calc_stats(file_in,
               return_type=None,
               txt_width=0,
               log=False,
               backend=None,
               format='fasta',
               minLength=0,
               **kwargs):
    """
    Summarize the lengths of the sequences (contigs) in a sequence file.

    The file is parsed with ``SeqIO.parse(handle, format)``; records shorter
    than ``minLength`` are ignored. The remaining lengths are handed to
    ``get_contig_length_stats`` to build the stats dictionary.

    Histogram options:
     * backend: when not None, ``plot_assembly`` is called and its return
       value is reused as the histogram for the text plot
     * txt_width: when greater than 0, a text histogram of that width is
       produced with ``ascii_histogram`` (from ``numpy.histogram`` when no
       backend was given)
     * log: forwarded to the plotting/histogram helpers and reflected in
       the histogram heading
     * kwargs: forwarded to ``plot_assembly`` or ``numpy.histogram``

    return_type selects the output:
       None: print the text report to STDOUT (returns None)
       'report': return the text report
       'data': return the stats dictionary (the text histogram, if any,
               is stored under the 'histogram' key)
    """
    wants_data = return_type == 'data'

    # Collect the length of every record that passes the size filter.
    with open(file_in, 'r') as handle:
        lengths = (len(rec) for rec in SeqIO.parse(handle, format))
        sizes = numpy.array([n for n in lengths if n >= minLength])

    data = get_contig_length_stats(sizes)

    # The text report is only needed when the caller is not asking for the
    # raw data dictionary.
    if not wants_data:
        report = get_contig_length_report(data)

    if backend is not None:
        h = plot_assembly(sizes, file_in, data,
                          log=log, backend=backend, **kwargs)

    if txt_width > 0:
        # Without a plotting backend there is no histogram yet: compute one.
        if backend is None:
            h = numpy.histogram(sizes, **kwargs)
        histogram_text = ascii_histogram(h, log=log, width=txt_width)

        if wants_data:
            data['histogram'] = histogram_text
        else:
            heading = ("\n\nContig length histogram (log):\n" if log
                       else "\n\nContig length histogram:\n")
            report += heading
            report += histogram_text

    if wants_data:
        return data
    if return_type is None:
        print(report)
        return None
    return report
def contig_length_stats(contig_stats,
                        return_type=None,
                        txt_width=0,
                        log=False,
                        min_length=0,
                        **kwargs):
    """
    Build a summary of a contig stats table.

    Rows whose ``length`` is below ``min_length`` are dropped first. For a
    non-empty table the 'Assembly' entry comes from ``get_N_stats`` and each
    known column that is present gets an entry from ``get_column_stats``.
    An empty table yields only ``{'count': 0}`` under 'Assembly'.

     * txt_width: when greater than 0, a text histogram of that width is
       attached to every column entry (at least 40 is recommended for a
       readable plot)
     * log: passed to ``ascii_histogram`` and recorded as the "(log)" tag
     * kwargs: forwarded to ``numpy.histogram``
     * return_types:
       None: just print text to STDOUT
       'report': return text
       'data': return dictionary of data
    """
    # Table column -> label used as the key in the report data.
    column_labels = {
        'length': 'Contig Lengths',
        'read count': 'Reads per Contig',
        'av cov': 'Mean Mapped Depth',
        'mx cov': 'Maximum Mapped Depth',
        'mn cov': 'Minimum Mapped Depth',
        'GC': 'GC Content',
    }

    report_data = {"min_length": min_length}
    contig_stats = contig_stats.loc[contig_stats.length >= min_length]

    if contig_stats.shape[0] != 0:
        report_data['Assembly'] = get_N_stats(contig_stats)
        for column, label in column_labels.items():
            if column in contig_stats.columns:
                values = contig_stats[column]
                column_data = get_column_stats(values)
                report_data[label] = column_data
                if txt_width > 0:
                    column_data['log'] = "(log)" if log else ""
                    column_data['histogram'] = ascii_histogram(
                        numpy.histogram(values, **kwargs),
                        log=log,
                        width=txt_width,
                    )
    else:
        # Nothing passed the length filter.
        report_data['Assembly'] = {'count': 0}

    if return_type == 'data':
        return report_data

    report = get_contig_stats_report(report_data)
    if return_type is not None:
        return report
    print(report)
    return None
def calc_stats(file_in,
               return_type=None,
               txt_width=0,
               log=False,
               backend=None,
               format='fasta',
               minLength=0,
               **kwargs):
    """
    Compute contig length statistics for a sequence file.

    Records are read with ``SeqIO.parse`` using ``format`` and only those
    at least ``minLength`` long are counted. The lengths are passed to
    ``get_contig_length_stats``.

     * plot histogram: ``backend`` (not None) triggers ``plot_assembly``;
       ``txt_width`` (> 0) triggers a text histogram via
       ``ascii_histogram``. Extra ``kwargs`` go to ``plot_assembly`` or,
       when no backend is given, to ``numpy.histogram``.
     * return_types:
       None: just print text to STDOUT
       'report': return text
       'data': return dictionary of data (text histogram stored under
               'histogram' when one was requested)
    """
    data_only = return_type == 'data'

    with open(file_in, 'r') as seq_handle:
        kept_lengths = []
        for record in SeqIO.parse(seq_handle, format):
            record_length = len(record)
            if record_length >= minLength:
                kept_lengths.append(record_length)

    sizes = numpy.array(kept_lengths)
    data = get_contig_length_stats(sizes)

    if not data_only:
        report = get_contig_length_report(data)

    # A plotting backend, when given, also supplies the histogram used for
    # the text rendering below.
    if backend is not None:
        h = plot_assembly(sizes, file_in, data,
                          log=log, backend=backend, **kwargs)

    if txt_width > 0:
        if backend is None:
            h = numpy.histogram(sizes, **kwargs)
        text_plot = ascii_histogram(h, log=log, width=txt_width)

        if data_only:
            data['histogram'] = text_plot
        else:
            if log:
                report += "\n\nContig length histogram (log):\n"
            else:
                report += "\n\nContig length histogram:\n"
            report += text_plot

    if data_only:
        return data
    if return_type is not None:
        return report
    print(report)
    return None
def contig_length_stats(contig_stats,
                        return_type=None,
                        txt_width=0,
                        log=False,
                        min_length=0,
                        **kwargs):
    """
    Given a contig stats table, assemble per-column summary data.

     * rows with ``length`` below ``min_length`` are discarded
     * 'Assembly' holds the result of ``get_N_stats`` (or just a zero
       count when no rows remain)
     * each recognised column present in the table is summarised with
       ``get_column_stats``
     * optionally add a text histogram per column: enabled when
       ``txt_width`` is greater than 0 (should be at least 40 for a good
       plot); ``kwargs`` are passed to ``numpy.histogram``
     * return_types:
       None: just print text to STDOUT
       'report': return text
       'data': return dictionary of data
    """
    report_data = {"min_length": min_length}
    contig_stats = contig_stats.loc[contig_stats.length >= min_length]

    has_rows = contig_stats.shape[0] != 0
    if not has_rows:
        report_data['Assembly'] = {'count': 0}
    else:
        report_data['Assembly'] = get_N_stats(contig_stats)

        labels = {
            'length': 'Contig Lengths',
            'read count': 'Reads per Contig',
            'av cov': 'Mean Mapped Depth',
            'mx cov': 'Maximum Mapped Depth',
            'mn cov': 'Minimum Mapped Depth',
            'GC': 'GC Content',
        }
        log_tag = "(log)" if log else ""
        for column, label in labels.items():
            # Only summarise columns the table actually has.
            if column not in contig_stats.columns:
                continue
            series = contig_stats[column]
            report_data[label] = get_column_stats(series)
            if txt_width > 0:
                counts_and_edges = numpy.histogram(series, **kwargs)
                report_data[label]['log'] = log_tag
                report_data[label]['histogram'] = ascii_histogram(
                    counts_and_edges, log=log, width=txt_width)

    if return_type == 'data':
        return report_data

    report = get_contig_stats_report(report_data)
    if return_type is None:
        print(report)
        return None
    return report
def main():
    """
    Sets up the command line interface

    Reads one numeric value per line from the input (a file or STDIN),
    skips lines that cannot be parsed as floats (blank lines silently,
    anything else with a warning), and writes a text histogram of the
    values to the output (a file or STDOUT).
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    # NOTE: plain 'r' is used instead of 'rU'. The 'U' flag was removed in
    # Python 3.11 (open() raises ValueError) and text mode already applies
    # universal newline handling in Python 3.
    parser.add_argument('infile',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help=("Input file (defaults to STDIN) containing "
                              "a value on each line"))
    parser.add_argument('outfile',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="File to which to write histogram")
    parser.add_argument('-b', '--bins', type=int, default=50)
    parser.add_argument('-l', '--label', default="value")
    parser.add_argument('-w', '--width', default=75, type=int)
    parser.add_argument('-L', '--log', action='store_true')
    parser.add_argument('-W', '--max-label-width', type=int, default=10)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    values = []
    for line in arguments.infile:
        stripped = line.strip()
        try:
            values.append(float(stripped))
        except ValueError:
            # Blank lines are ignored quietly; anything else is reported.
            if len(stripped) > 0:
                logging.warning("Skipping bad line:\n%s", stripped)

    logging.info("Read in %d values", len(values))

    arguments.outfile.write(
        ascii_histogram(
            histogram(values, bins=arguments.bins),
            label=arguments.label,
            width=arguments.width,
            log=arguments.log,
            maxLabelWidth=arguments.max_label_width,
        ))
def mira_stats(contigStatsFile, minLength=0, bins=20, **kwargs):
    """
    Build a text report from a mira contig stats (info) file.

    The file is read as a tab-separated table with the first column as the
    index. When ``minLength`` is positive, rows whose ``length`` is smaller
    are dropped. The report starts with the output of
    ``get_contig_length_report`` for the contig lengths, followed by one
    text histogram (``bins`` bins) for each of the length, GC, coverage and
    quality columns. Extra ``kwargs`` are passed to ``ascii_histogram``.

    Returns the report text.
    """
    histogram_columns = ('length', 'GC%', 'av.cov', 'mx.cov.', 'av.qual')

    table = pandas.read_csv(contigStatsFile, index_col=0, sep='\t')
    if minLength > 0:
        table = table[table.length >= minLength]

    length_data = get_contig_length_stats(table['length'])
    pieces = [get_contig_length_report(length_data)]

    # add histograms to report
    pieces.append('\nHistograms:\n')
    for key in histogram_columns:
        binned = numpy.histogram(table[key], bins=bins)
        pieces.append('\n')
        pieces.append(ascii_histogram(binned, label=key, **kwargs))

    return ''.join(pieces)