Ejemplo n.º 1
0
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and is intentionally kept verbatim.
    #
    # Returns zip(header, summary), where summary is the tuple
    # (sumsize, l50, nn50, minsize, maxsize, n) and `header` is a name defined
    # outside this function.
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0",
        default=False,
        action="store_true",
        help="Print size and L50 to stdout",
    )

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Guess the file format from the first character of the first file only;
    # the same format is then applied to every file in `args`. The handle is
    # closed right after probing, and slicing (instead of indexing) keeps an
    # empty first line from raising IndexError here.
    with open(args[0]) as fp:
        probe = fp.readline()[:1]
    isFasta = probe == ">"

    ctgsizes = []
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes.extend(size for _, size in f.itersizes())
    else:
        for row in must_open(args):
            # The size is read from the last whitespace-separated column and
            # goes through float() first, so "1e3" or "12.0" are accepted.
            # Rows that are blank (IndexError), non-numeric (ValueError) or
            # infinite (OverflowError) are skipped rather than aborting.
            try:
                ctgsize = int(float(row.split()[-1]))
            except (ValueError, IndexError, OverflowError):
                continue
            ctgsizes.append(ctgsize)

    # The first element returned by calculate_A50 is not used here.
    _, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print(", ".join(args), file=sys.stderr)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)),
        file=sys.stderr,
    )
    loghistogram(ctgsizes)

    # Optional machine-readable line on stdout: files, total size, l50.
    if opts.print0:
        print("\t".join(str(x) for x in (",".join(args), sumsize, l50)))

    return zip(header, summary)
Ejemplo n.º 2
0
Archivo: base.py Proyecto: rrane/jcvi
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and is intentionally kept verbatim.
    #
    # Returns zip(header, summary), where summary is the tuple
    # (sumsize, l50, nn50, minsize, maxsize, n) and `header` is a name defined
    # outside this function.
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0", default=False, action="store_true", help="Print size and L50 to stdout [default: %default]"
    )

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Guess the file format from the first character of the first file only;
    # the same format is then applied to every file in `args`. The handle is
    # closed right after probing, and slicing (instead of indexing) keeps an
    # empty first line from raising IndexError here.
    with open(args[0]) as fp:
        probe = fp.readline()[:1]
    isFasta = probe == ">"

    ctgsizes = []
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes.extend(size for _, size in f.itersizes())
    else:
        for row in must_open(args):
            # The size is read from the last whitespace-separated column.
            # Blank rows (IndexError) and non-integer values (ValueError) are
            # skipped rather than aborting the whole run.
            try:
                ctgsize = int(row.split()[-1])
            except (ValueError, IndexError):
                continue
            ctgsizes.append(ctgsize)

    # The first element returned by calculate_A50 is not used here.
    _, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)

    # Explicit stream writes produce the same bytes as the Python 2 print
    # statement while also being valid syntax under Python 3.
    sys.stderr.write(", ".join(args) + "\n")

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    sys.stderr.write(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)) + "\n"
    )
    loghistogram(ctgsizes)

    # Optional machine-readable line on stdout: files, total size, l50.
    if opts.print0:
        sys.stdout.write(
            "\t".join(str(x) for x in (",".join(args), sumsize, l50)) + "\n"
        )

    return zip(header, summary)
Ejemplo n.º 3
0
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and is intentionally kept verbatim.
    #
    # Returns zip(header, summary), where summary is the tuple
    # (sumsize, l50, nn50, minsize, maxsize, n). `header` and `loghistogram`
    # are names defined outside this function.
    p = OptionParser(n50.__doc__)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Guess the file format from the first character of the first file only;
    # the same format is then applied to every file in `args`. The handle is
    # closed right after probing, and slicing (instead of indexing) keeps an
    # empty first line from raising IndexError here.
    with open(args[0]) as fp:
        probe = fp.readline()[:1]
    isFasta = (probe == '>')

    ctgsizes = []
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes.extend(size for _, size in f.itersizes())
    else:
        for row in must_open(args):
            # The size is read from the last whitespace-separated column.
            # Blank rows (IndexError) and non-integer values (ValueError) are
            # skipped rather than aborting the whole run.
            try:
                ctgsize = int(row.split()[-1])
            except (ValueError, IndexError):
                continue
            ctgsizes.append(ctgsize)

    # The first element returned by calculate_A50 is not used here.
    _, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)

    # Explicit stream writes produce the same bytes as the Python 2 print
    # statement while also being valid syntax under Python 3.
    sys.stderr.write(", ".join(args) + "\n")

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    sys.stderr.write(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)) + "\n"
    )
    loghistogram(ctgsizes, summary=False)

    return zip(header, summary)
Ejemplo n.º 4
0
def summary(args):
    """
    %prog summary cdhit.clstr

    Parse cdhit.clstr file to get distribution of cluster sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and is intentionally kept verbatim.
    from jcvi.graphics.histogram import loghistogram

    parser = OptionParser(summary.__doc__)
    _, positional = parser.parse_args(args)

    # Exactly one positional argument (the .clstr file) is accepted;
    # anything else prints the usage text and exits.
    if len(positional) != 1:
        sys.exit(not parser.print_help())

    clstrfile = positional[0]

    # Collect every size reported by the cluster file, then draw the
    # log-scale histogram together with its summary.
    cluster_sizes = list(ClstrFile(clstrfile).iter_sizes())
    loghistogram(cluster_sizes, summary=True)
Ejemplo n.º 5
0
def summary(args):
    """
    %prog summary cdhit.clstr

    Parse cdhit.clstr file to get distribution of cluster sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and is intentionally kept verbatim.
    from jcvi.graphics.histogram import loghistogram

    option_parser = OptionParser(summary.__doc__)
    _, remaining = option_parser.parse_args(args)

    # The command takes a single .clstr path; any other argument count
    # prints the usage text and exits.
    if len(remaining) != 1:
        sys.exit(not option_parser.print_help())

    [clstrfile] = remaining
    clusters = ClstrFile(clstrfile)

    # Materialize all sizes up front before handing them to the histogram.
    sizes = list(clusters.iter_sizes())
    loghistogram(sizes, summary=True)
Ejemplo n.º 6
0
def summary(args):
    """
    %prog summary cdhit.clstr

    Parse cdhit.clstr file to get distribution of cluster sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and is intentionally kept verbatim.
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    # Still an AssertionError (as before), now with a message naming the file.
    assert clstrfile.endswith(".clstr"), \
        "Expected a .clstr file, got: {0}".format(clstrfile)

    # One histogram entry per block yielded by read_block: the number of
    # members in that block. Reading inside `with` guarantees the file handle
    # is closed even if parsing raises.
    with open(clstrfile) as fp:
        data = [len(members) for _, members in read_block(fp, ">")]

    loghistogram(data)
Ejemplo n.º 7
0
def summary(args):
    """
    %prog summary cdhit.clstr

    Parse cdhit.clstr file to get distribution of cluster sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and is intentionally kept verbatim.
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    # Still an AssertionError (as before), now with a message naming the file.
    assert clstrfile.endswith(".clstr"), \
        "Expected a .clstr file, got: {0}".format(clstrfile)

    # The file is read inside `with` so its handle is released as soon as all
    # blocks have been consumed, including when read_block raises midway.
    # Each block contributes its member count to the histogram data.
    with open(clstrfile) as fp:
        data = [len(members) for _, members in read_block(fp, ">")]

    loghistogram(data)