Ejemplo n.º 1
0
def calc_stats(file_in,
               return_type=None,
               txt_width=0,
               log=False,
               backend=None,
               format='fasta',
               minLength=0,
               **kwargs):
    """
    Summarize the sequence lengths found in a sequence file.

    The file is parsed with SeqIO.parse using ``format`` (fasta by default);
    records shorter than ``minLength`` are ignored. Length statistics come
    from get_contig_length_stats.

    Histogram options:
     * backend: when not None, plot_assembly is called (receiving
       ``log``, ``backend`` and ``kwargs``) and its return value is reused
       as the histogram for the text plot
     * txt_width: when greater than 0, an ASCII histogram of that width is
       produced; without a backend the histogram is computed with
       numpy.histogram(sizes, **kwargs)

    return_type selects the output:
     * None: print the text report to STDOUT (returns None)
     * 'data': return the dictionary of stats (the ASCII histogram, if
       requested, is stored under the 'histogram' key)
     * anything else (e.g. 'report'): return the text report
    """
    wants_data = return_type == 'data'

    # Collect the length of every record that passes the size filter
    lengths = []
    with open(file_in, 'r') as handle:
        for record in SeqIO.parse(handle, format):
            record_length = len(record)
            if record_length >= minLength:
                lengths.append(record_length)

    sizes = numpy.array(lengths)
    data = get_contig_length_stats(sizes)

    # The text report is only needed when the caller did not ask for data
    if not wants_data:
        report = get_contig_length_report(data)

    # Pick the histogram source: the plotting backend if there is one,
    # otherwise a plain numpy histogram (only needed for the text plot)
    if backend is not None:
        hist = plot_assembly(sizes,
                             file_in,
                             data,
                             log=log,
                             backend=backend,
                             **kwargs)
    elif txt_width > 0:
        hist = numpy.histogram(sizes, **kwargs)

    if txt_width > 0:
        histogram_text = ascii_histogram(hist, log=log, width=txt_width)
        if wants_data:
            data['histogram'] = histogram_text
        else:
            report += ("\n\nContig length histogram (log):\n" if log
                       else "\n\nContig length histogram:\n")
            report += histogram_text

    if wants_data:
        return data
    if return_type is None:
        print(report)
        return None
    return report
Ejemplo n.º 2
0
def contig_length_stats(contig_stats,
                        return_type=None,
                        txt_width=0,
                        log=False,
                        min_length=0,
                        **kwargs):
    """
    Build summary statistics from a table of per-contig stats.

    Rows whose ``length`` is below ``min_length`` are dropped first. If no
    rows remain, the 'Assembly' entry is just ``{'count': 0}``. Otherwise:
     * 'Assembly' holds the result of get_N_stats
     * each known column that is present in the table gets an entry
       (keyed by a human readable label) from get_column_stats
     * if txt_width is greater than 0, each of those entries also gets a
       'log' marker and an ASCII 'histogram' of that width, built from
       numpy.histogram(column, **kwargs)

    return_type selects the output:
     * None: print the text report to STDOUT (returns None)
     * 'data': return the dictionary of data
     * anything else (e.g. 'report'): return the text report
    """
    # (column name in the table, label used in the report data)
    labelled_columns = (
        ('length', 'Contig Lengths'),
        ('read count', 'Reads per Contig'),
        ('av cov', 'Mean Mapped Depth'),
        ('mx cov', 'Maximum Mapped Depth'),
        ('mn cov', 'Minimum Mapped Depth'),
        ('GC', 'GC Content'),
    )

    report_data = {"min_length": min_length}
    kept = contig_stats.loc[contig_stats.length >= min_length]

    if kept.shape[0] == 0:
        # Nothing survived the length filter
        report_data['Assembly'] = {'count': 0}
    else:
        report_data['Assembly'] = get_N_stats(kept)
        for column, label in labelled_columns:
            if column in kept.columns:
                column_stats = get_column_stats(kept[column])
                report_data[label] = column_stats
                if txt_width > 0:
                    column_stats['log'] = "(log)" if log else ""
                    counts_and_edges = numpy.histogram(kept[column], **kwargs)
                    column_stats['histogram'] = ascii_histogram(
                        counts_and_edges, log=log, width=txt_width)

    if return_type == 'data':
        return report_data

    report = get_contig_stats_report(report_data)
    if return_type is not None:
        return report
    print(report)
Ejemplo n.º 3
0
def calc_stats(file_in,
               return_type=None,
               txt_width=0,
               log=False,
               backend=None,
               format='fasta',
               minLength=0,
               **kwargs):
    """
    Compute length statistics for the sequences in a file.

    Records are read with SeqIO.parse using ``format`` (fasta by default)
    and those shorter than ``minLength`` are skipped. The stats themselves
    are produced by get_contig_length_stats.

     * backend: if not None, plot_assembly is called with the sizes, the
       file name, the stats, ``log``, ``backend`` and ``kwargs``; whatever
       it returns is used as the histogram for the text plot
     * txt_width: if greater than 0, an ASCII histogram of that width is
       generated (from numpy.histogram(sizes, **kwargs) when there is no
       backend)
     * return_types:
       None: just print text to STDOUT
       'data': return dictionary of data (text histogram under 'histogram')
       any other value (e.g. 'report'): return text
    """
    data_only = return_type == 'data'
    text_plot = txt_width > 0

    with open(file_in, 'r') as seq:
        # len() of each record, keeping only those at or above the cutoff
        all_lengths = (len(record) for record in SeqIO.parse(seq, format))
        sizes = [size for size in all_lengths if size >= minLength]

    sizes = numpy.array(sizes)
    data = get_contig_length_stats(sizes)

    if not data_only:
        report = get_contig_length_report(data)

    if backend is None:
        # No plotting backend: only compute a histogram if text needs it
        if text_plot:
            histogram_data = numpy.histogram(sizes, **kwargs)
    else:
        histogram_data = plot_assembly(sizes, file_in, data, log=log,
                                       backend=backend, **kwargs)

    if text_plot:
        histogram_text = ascii_histogram(histogram_data,
                                         log=log,
                                         width=txt_width)
        if data_only:
            data['histogram'] = histogram_text
        else:
            if log:
                title = "\n\nContig length histogram (log):\n"
            else:
                title = "\n\nContig length histogram:\n"
            report += title
            report += histogram_text

    if data_only:
        return data
    if return_type is not None:
        return report
    print(report)
Ejemplo n.º 4
0
def contig_length_stats(contig_stats, return_type=None,
                        txt_width=0,
                        log=False,
                        min_length=0,
                        **kwargs):
    """
    Summarize a contig stats table.

    Only rows with ``length`` of at least ``min_length`` are considered.
     * With no rows left, 'Assembly' is reported as ``{'count': 0}``
     * Otherwise 'Assembly' comes from get_N_stats and every recognized
       column found in the table is summarized with get_column_stats
     * If txt_width is greater than 0, an ASCII histogram of that width
       (plus a 'log' marker string) is attached to each column summary;
       ``kwargs`` are forwarded to numpy.histogram
     * return_types:
       None: just print text to STDOUT
       'data': return dictionary of data
       any other value (e.g. 'report'): return text
    """
    report_data = {"min_length": min_length}
    long_enough = contig_stats.length >= min_length
    table = contig_stats.loc[long_enough]

    def summarize(values):
        # Stats for one column, with an optional text histogram attached
        summary = get_column_stats(values)
        if txt_width > 0:
            summary['log'] = "(log)" if log else ""
            summary['histogram'] = ascii_histogram(
                numpy.histogram(values, **kwargs),
                log=log,
                width=txt_width)
        return summary

    if table.shape[0] != 0:
        report_data['Assembly'] = get_N_stats(table)
        known_columns = [
            ('length', 'Contig Lengths'),
            ('read count', 'Reads per Contig'),
            ('av cov', 'Mean Mapped Depth'),
            ('mx cov', 'Maximum Mapped Depth'),
            ('mn cov', 'Minimum Mapped Depth'),
            ('GC', 'GC Content'),
        ]
        for column, label in known_columns:
            if column not in table.columns:
                continue
            report_data[label] = summarize(table[column])
    else:
        report_data['Assembly'] = {'count': 0}

    if return_type == 'data':
        return report_data

    report = get_contig_stats_report(report_data)
    if return_type is None:
        print(report)
        return None
    return report
Ejemplo n.º 5
0
def main():
    """
    Sets up the command line interface and writes an ASCII histogram.

    Reads one value per line from the input file (STDIN by default),
    converts each line to a float, bins the values with ``histogram`` and
    writes the text rendering produced by ``ascii_histogram`` to the output
    file (STDOUT by default).

    Lines that cannot be parsed as a float are skipped; non-blank ones are
    reported with a logging warning.

    Options:
     * -b/--bins: number of bins (default 50)
     * -l/--label: label passed to ascii_histogram (default "value")
     * -w/--width: width passed to ascii_histogram (default 75)
     * -L/--log: pass log=True to ascii_histogram
     * -W/--max-label-width: maxLabelWidth for ascii_histogram (default 10)
    """
    description = __doc__

    parser = argparse.ArgumentParser(description=description)
    # Plain 'r' mode: the legacy 'U' flag was removed in Python 3.11 (open()
    # raises ValueError for it) and universal newlines are already the
    # default for text mode in Python 3.
    parser.add_argument('infile',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help=("Input file (defaults to STDIN) containing "
                              "a value on each line"))
    parser.add_argument('outfile',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="File to which to write histogram")
    parser.add_argument('-b', '--bins', type=int, default=50)
    parser.add_argument('-l', '--label', default="value")
    parser.add_argument('-w', '--width', default=75, type=int)
    parser.add_argument('-L', '--log', action='store_true')
    parser.add_argument('-W', '--max-label-width', type=int, default=10)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    values = []
    for line in arguments.infile:
        text = line.strip()
        try:
            values.append(float(text))
        except ValueError:
            # Blank lines are ignored silently; anything else is reported
            if text:
                logging.warning("Skipping bad line:\n%s", text)
    logging.info("Read in %d values", len(values))
    arguments.outfile.write(
        ascii_histogram(
            histogram(values, bins=arguments.bins),
            label=arguments.label,
            width=arguments.width,
            log=arguments.log,
            maxLabelWidth=arguments.max_label_width,
        ))
Ejemplo n.º 6
0
def mira_stats(contigStatsFile, minLength=0, bins=20, **kwargs):
    """
    Build a text report from a tab-separated contig stats file.

    The file is loaded with pandas (first column as the index). When
    minLength is greater than 0, rows with a smaller ``length`` are dropped.
    The report starts with the output of get_contig_length_report for the
    'length' column, followed by an ASCII histogram (``bins`` bins) for each
    of the columns 'length', 'GC%', 'av.cov', 'mx.cov.' and 'av.qual'.
    Extra keyword arguments are forwarded to ascii_histogram.

    Returns the report text.
    """
    histogram_columns = ('length', 'GC%', 'av.cov', 'mx.cov.', 'av.qual')

    table = pandas.read_csv(contigStatsFile, index_col=0, sep='\t')
    if minLength > 0:
        keep = table.length >= minLength
        table = table[keep]

    length_stats = get_contig_length_stats(table['length'])
    report = get_contig_length_report(length_stats)

    # add histograms to report, one per column, each preceded by a blank line
    report += '\nHistograms:\n'
    for column in histogram_columns:
        binned = numpy.histogram(table[column], bins=bins)
        report += '\n'
        report += ascii_histogram(binned, label=column, **kwargs)

    return report