Example #1
0
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args

    # Both outputs are named after the BED file with its last extension removed.
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigs = Fasta(contigfasta)
    readseqs = Fasta(readfasta)
    bed = Bed(bedfile)

    # Fixed placeholder written into every header line; nothing is computed.
    checksum = "00000000 checksum."

    # `with` guarantees both files are flushed and closed, even when a contig
    # or read lookup fails half-way through.  Plain write() calls are used
    # instead of the `print >> handle` statement so the code behaves the same
    # on Python 2 and Python 3.
    with open(idsfile, "w") as fw_ids, open(contigfile, "w") as fw:
        for ctg, reads in bed.sub_beds():
            ctgseq = contigs[ctg]
            ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum
            )

            fw_ids.write("{0}\n".format(ctg))
            fw.write("{0}\n".format(ctgline))
            fw.write("{0}\n".format(fill(ctgseq.seq)))

            for b in reads:
                read = b.accn
                readseq = readseqs[read]
                readlen = len(readseq)

                # Minus-strand placements are tagged [RC] and their read
                # range is written end-first.
                if b.strand == "-":
                    rc = " [RC]"
                    rstart, rend = readlen, 1
                else:
                    rc = ""
                    rstart, rend = 1, readlen

                readrange = "{{{0} {1}}}".format(rstart, rend)
                conrange = "<{0} {1}>".format(b.start, b.end)
                readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange
                )
                fw.write("{0}\n".format(readline))
                fw.write("{0}\n".format(fill(readseq.seq)))

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
Example #2
0
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    # (The previously imported SeqIO was never used in this function.)
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args

    # Both outputs are named after the BED file with its last extension removed.
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigs = Fasta(contigfasta)
    readseqs = Fasta(readfasta)
    bed = Bed(bedfile)

    # Fixed placeholder written into every header line; nothing is computed.
    checksum = "00000000 checksum."

    def emit(handle, text):
        # Version-neutral replacement for the `print >> handle, text` statement.
        handle.write("{0}\n".format(text))

    # `with` guarantees both files are flushed and closed, even when a contig
    # or read lookup fails half-way through.
    with open(idsfile, "w") as fw_ids, open(contigfile, "w") as fw:
        for ctg, reads in bed.sub_beds():
            ctgseq = contigs[ctg]

            emit(fw_ids, ctg)
            emit(
                fw,
                "##{0} {1} {2} bases, {3}".format(
                    ctg, len(reads), len(ctgseq), checksum
                ),
            )
            emit(fw, fill(ctgseq.seq))

            for b in reads:
                read = b.accn
                readseq = readseqs[read]
                readlen = len(readseq)
                reverse = b.strand == "-"

                # Minus-strand placements are tagged [RC] and their read
                # range is written end-first.
                rc = " [RC]" if reverse else ""
                rstart, rend = (readlen, 1) if reverse else (1, readlen)

                readrange = "{{{0} {1}}}".format(rstart, rend)
                conrange = "<{0} {1}>".format(b.start, b.end)
                emit(
                    fw,
                    "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                        read, rc, readlen, checksum, readrange, conrange
                    ),
                )
                emit(fw, fill(readseq.seq))

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
Example #3
0
def emitFragment(fw, fragID, libID, shredded_seq, fasta=False):
    """
    Print out the shredded sequence.

    With `fasta` set, the sequence is written to `fw` as a single FASTA
    record whose id is `fragID` and whose description is empty.  Otherwise it
    is rendered through `frgTemplate`, with one `DEFAULTQV` repeated per base
    as the quality string, and written to `fw` followed by a newline.
    """
    if fasta:
        record = SeqRecord(shredded_seq, id=fragID, description="")
        SeqIO.write([record], fw, "fasta")
        return

    seq = str(shredded_seq)
    slen = len(seq)
    qvs = DEFAULTQV * slen  # shredded reads have default low qv

    fragment = frgTemplate.format(
        fragID=fragID, libID=libID, seq=fill(seq), qvs=fill(qvs), slen=slen
    )
    # write() instead of the `print >> fw` statement: same bytes on Python 2,
    # and no TypeError on Python 3.
    fw.write("{0}\n".format(fragment))
Example #4
0
 def add_objective(self, edges, objective=MAXIMIZE):
     """
     Store `objective` and build the objective expression from `edges`.

     Every edge is unpacked as a 3-tuple whose last item is its weight.
     Edges with a falsy weight are left out; the rest become terms numbered
     by their 1-based position in `edges`.  The wrapped expression is kept
     in `self.sum`.
     """
     assert edges, "Edges must be non-empty"
     self.objective = objective

     terms = []
     for position, (_, _, weight) in enumerate(edges, 1):
         if not weight:
             continue
         terms.append(" + {0}x{1}".format(weight, position))

     self.sum = fill(terms, width=10)
Example #5
0
def print_objective(lp_handle, edges, objective=MAXIMIZE):
    """
    CPLEX LP format commonly contains three blocks:
    objective, constraints, vars
    spec <http://lpsolve.sourceforge.net/5.0/CPLEX-format.htm>

    This writes the objective block to `lp_handle`: first the `objective`
    value on its own line, then one " + <weight>x<n>" term per edge, where
    <n> is the 1-based position of the edge and <weight> is the third item
    of the edge tuple.  Terms are wrapped by `fill` ten at a time.
    """
    # write() instead of the `print >> lp_handle` statement: same output on
    # Python 2, and no TypeError on Python 3.
    lp_handle.write("{0}\n".format(objective))

    # Only the weight of each edge is used; the first two items are ignored.
    items = [
        " + {0}x{1}".format(weight, position)
        for position, (_, _, weight) in enumerate(edges, 1)
    ]
    lp_handle.write("{0}\n".format(fill(items, width=10)))
Example #6
0
def print_objective(lp_handle, edges, objective=MAXIMIZE):
    """
    CPLEX LP format commonly contains three blocks:
    objective, constraints, vars
    spec <http://lpsolve.sourceforge.net/5.0/CPLEX-format.htm>

    This writes the objective block to `lp_handle`: first the `objective`
    value on its own line, then one " + <weight>x<n>" term per edge, where
    <n> is the 1-based position of the edge and <weight> is the third item
    of the edge tuple.  Terms are wrapped by `fill` ten at a time.
    """
    # write() instead of the `print >> lp_handle` statement: same output on
    # Python 2, and no TypeError on Python 3.
    lp_handle.write("{0}\n".format(objective))

    # Only the weight of each edge is used; the first two items are ignored.
    items = [
        " + {0}x{1}".format(weight, position)
        for position, (_, _, weight) in enumerate(edges, 1)
    ]
    lp_handle.write("{0}\n".format(fill(items, width=10)))
Example #7
0
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    # NOTE(review): nothing in this function writes a .mates file, and the
    # --splitdir/--unpaired/--minreadno/--minctgsize values are parsed but
    # never read here -- confirm whether that is intentional.
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    import os.path as op
    from contextlib import ExitStack

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    # Keep the option flags under their own names: the per-contig statistic
    # computed in the loop below must never overwrite them.
    want_astat = opts.astat
    want_readids = opts.readids

    f = Fasta(fastafile)

    # Cut at the first dot of the *file name* only, so that a dot in the
    # directory part (e.g. "./sample.bam") no longer truncates the path.
    bamdir, bamname = op.split(bamfile)
    prefix = op.join(bamdir, bamname.split(".")[0])
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(size for _, size in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    # ExitStack closes every output that was actually opened, including when
    # an exception interrupts the conversion.
    with ExitStack() as stack:
        fw = stack.enter_context(open(acefile, "w"))
        astatfw = stack.enter_context(open(astatfile, "w")) if want_astat else None
        readsfw = stack.enter_context(open(readsfile, "w")) if want_readids else None

        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print(
                "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
                file=fw,
            )
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if want_astat:
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

            # Every consensus base gets the same default quality.
            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            # First the AF lines for all reads of the contig ...
            for a in mapped_reads:
                readname = a.qname

                if want_readids:
                    print(readname, file=readsfw)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                print("AF {0} {1} {2}".format(readname, strand, paddedstart), file=fw)

            print(file=fw)

            # ... then one RD/QA pair per read that cigar_to_seq can render.
            for a in mapped_reads:
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(
                    a.qname, alen, ninfos, ntags, fill(aseq)
                )
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
Example #8
0
File: sam.py Project: arvin580/jcvi
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    # NOTE(review): nothing in this function writes a .mates file, and the
    # --splitdir/--unpaired/--minreadno/--minctgsize values are parsed but
    # never read here -- confirm whether that is intentional.
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    import os.path as op

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    # Keep the option flags under their own names: the per-contig statistic
    # computed in the loop below must never overwrite them.
    want_astat = opts.astat
    want_readids = opts.readids

    f = Fasta(fastafile)

    # Cut at the first dot of the *file name* only, so that a dot in the
    # directory part (e.g. "./sample.bam") no longer truncates the path.
    bamdir, bamname = op.split(bamfile)
    prefix = op.join(bamdir, bamname.split(".")[0])
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(size for _, size in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
        genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    # Plain write() calls replace the `print >> handle` statement so the
    # function behaves the same on Python 2 and Python 3; try/finally makes
    # sure every output that was opened is closed again.
    fw = open(acefile, "w")
    astatfw = None
    readsfw = None
    try:
        if want_astat:
            astatfw = open(astatfile, "w")
        if want_readids:
            readsfw = open(readsfile, "w")

        fw.write("AS {0} {1}\n\n".format(ncontigs, totalreads))

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            fw.write("CO {0} {1} {2} {3} U\n".format(contig, nbases, nreads,
                    nsegments))
            fw.write("{0}\n\n".format(fill(str(cseq.seq))))

            if want_astat:
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                astatfw.write("{0}\t{1:.1f}\n".format(contig, astat_value))

            # Every consensus base gets the same default quality.
            text = fill([qual] * nbases, delimiter=" ", width=30)
            fw.write("BQ\n{0}\n\n".format(text))

            # First the AF lines for all reads of the contig ...
            for a in mapped_reads:
                readname = a.qname

                if want_readids:
                    readsfw.write("{0}\n".format(readname))

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                fw.write("AF {0} {1} {2}\n".format(readname, strand,
                        paddedstart))

            fw.write("\n")

            # ... then one RD/QA pair per read that cigar_to_seq can render.
            for a in mapped_reads:
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(a.qname, alen, ninfos,
                        ntags, fill(aseq))
                qs = "QA 1 {0} 1 {0}".format(alen)

                fw.write("{0}\n\n{1}\n\n".format(rd, qs))
    finally:
        for handle in (fw, astatfw, readsfw):
            if handle is not None:
                handle.close()
Example #9
0
def gss(args):
    """
    %prog gss fastafile plateMapping

    Generate sequence files and metadata templates suited for gss submission.
    The FASTA file is assumed to be exported from the JCVI data delivery folder
    which looks like:

    >1127963806024 /library_name=SIL1T054-B-01-120KB /clear_start=0
    /clear_end=839 /primer_id=1049000104196 /trace_id=1064147620169
    /trace_file_id=1127963805941 /clone_insert_id=1061064364776
    /direction=reverse /sequencer_run_id=1064147620155
    /sequencer_plate_barcode=B906423 /sequencer_plate_well_coordinates=C3
    /sequencer_plate_96well_quadrant=1 /sequencer_plate_96well_coordinates=B02
    /template_plate_barcode=CC0251602AB /growth_plate_barcode=BB0273005AB
    AGCTTTAGTTTCAAGGATACCTTCATTGTCATTCCCGGTTATGATGATATCATCAAGATAAACAAGAATG
    ACAATGATACCTGTTTGGTTCTGAAGTGTAAAGAGGGTATGTTCAGCTTCAGATCTTCTAAACCCTTTGT
    CTAGTAAGCTGGCACTTAGCTTCCTATACCAAACCCTTTGTGATTGCTTCAGTCCATAAATTGCCTTTTT

    Plate mapping file maps the JTC `sequencer_plate_barcode` to external IDs.
    For example:
    B906423 SIL-001
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    # NOTE: `vars` here is a name defined outside this function (it hides the
    # builtin); the templates are filled from it, and the second loop merges
    # locals() into it.  Local variable names are therefore left untouched --
    # any of them may be a template field.
    p = OptionParser(gss.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Usage errors must not exit with status 0; assuming optparse
        # semantics, print_help() returns None, so `not ...` gives 1.
        sys.exit(not p.print_help())

    fastafile, mappingfile = args
    seen = defaultdict(int)
    clone = defaultdict(set)

    plateMapping = DictFile(mappingfile)

    # write() replaces the `print >> handle` statement (identical output on
    # Python 2, no TypeError on Python 3) and `with` closes the file, which
    # the original never did before rebinding `fw`.
    with open("MetaData.txt", "w") as fw:
        fw.write(PublicationTemplate.format(**vars) + "\n")
        fw.write(LibraryTemplate.format(**vars) + "\n")
        fw.write(ContactTemplate.format(**vars) + "\n")
    logging.debug("Meta data written to `{0}`".format(fw.name))

    for rec in SeqIO.parse(fastafile, "fasta"):
        # First pass just check well number matchings and populate sequences in
        # the same clone
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        sequencer_plate_96well_quadrant = \
            a["sequencer_plate_96well_quadrant"][0]
        sequencer_plate_96well_coordinates = \
            a["sequencer_plate_96well_coordinates"][0]

        # Check the 96-well ID is correctly converted to 384-well ID
        w96 = sequencer_plate_96well_coordinates
        w96quad = int(sequencer_plate_96well_quadrant)
        w384 = sequencer_plate_well_coordinates
        assert convert_96_to_384(w96, w96quad) == w384

        plate = sequencer_plate_barcode
        assert plate in plateMapping, \
            "{0} not found in `{1}` !".format(plate, mappingfile)

        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        # Repeated IDs get their occurrence number appended.  The counter is
        # bumped exactly once per record, as in the second pass below; a
        # second increment here used to make this pass produce suffixes
        # 3, 4, ... while the written records carried 2, 3, ..., so mate
        # lists referred to IDs that were never emitted.
        if seen[gssID] > 1:
            gssID = "{0}{1}".format(gssID, seen[gssID])

        clone[cloneID].add(gssID)

    seen = defaultdict(int)
    # Outputs are opened only after the validation pass succeeded and are
    # closed even if a record fails to format.
    with open("GSS.txt", "w") as fw, open("GSS.log", "w") as fw_log:
        for rec in SeqIO.parse(fastafile, "fasta"):
            # need to populate gssID, mateID, cloneID, seq, plate, row, column
            description = rec.description
            a = parse_description(description)
            direction = a["direction"][0]
            sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
            sequencer_plate_well_coordinates = \
                a["sequencer_plate_well_coordinates"][0]
            w384 = sequencer_plate_well_coordinates

            plate = sequencer_plate_barcode
            plate = plateMapping[plate]
            d = Directions[direction]

            row = w384[0]
            column = int(w384[1:])
            seq = fill(str(rec.seq), width=70)

            cloneID = "{0}{1}".format(plate, w384)
            gssID = "{0}{1}".format(cloneID, d)
            primer = Primers[d]
            seen[gssID] += 1

            if seen[gssID] > 1:
                logging.error("duplicate key {0} found".format(gssID))
                gssID = "{0}{1}".format(gssID, seen[gssID])

            # Every other read recorded for the same clone.
            othergss = clone[cloneID] - set([gssID])
            othergss = ", ".join(sorted(othergss))
            vars.update(locals())

            fw.write(GSSTemplate.format(**vars) + "\n")

            # Write conversion logs to log file
            fw_log.write("{0}\t{1}\n".format(gssID, description))
            fw_log.write("=" * 60 + "\n")

    logging.debug("A total of {0} seqs written to `{1}`".\
            format(len(seen), fw.name))
Example #10
0
def gss(args):
    """
    %prog gss fastafile plateMapping

    Generate sequence files and metadata templates suited for gss submission.
    The FASTA file is assumed to be exported from the JCVI data delivery folder
    which looks like:

    >1127963806024 /library_name=SIL1T054-B-01-120KB /clear_start=0
    /clear_end=839 /primer_id=1049000104196 /trace_id=1064147620169
    /trace_file_id=1127963805941 /clone_insert_id=1061064364776
    /direction=reverse /sequencer_run_id=1064147620155
    /sequencer_plate_barcode=B906423 /sequencer_plate_well_coordinates=C3
    /sequencer_plate_96well_quadrant=1 /sequencer_plate_96well_coordinates=B02
    /template_plate_barcode=CC0251602AB /growth_plate_barcode=BB0273005AB
    AGCTTTAGTTTCAAGGATACCTTCATTGTCATTCCCGGTTATGATGATATCATCAAGATAAACAAGAATG
    ACAATGATACCTGTTTGGTTCTGAAGTGTAAAGAGGGTATGTTCAGCTTCAGATCTTCTAAACCCTTTGT
    CTAGTAAGCTGGCACTTAGCTTCCTATACCAAACCCTTTGTGATTGCTTCAGTCCATAAATTGCCTTTTT

    Plate mapping file maps the JTC `sequencer_plate_barcode` to external IDs.
    For example:
    B906423 SIL-001
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    # NOTE: `vars` here is a name defined outside this function (it hides the
    # builtin); the templates are filled from it, and the second loop merges
    # locals() into it.  Local variable names are therefore left untouched --
    # any of them may be a template field.
    p = OptionParser(gss.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Usage errors must not exit with status 0; assuming optparse
        # semantics, print_help() returns None, so `not ...` gives 1.
        sys.exit(not p.print_help())

    fastafile, mappingfile = args
    seen = defaultdict(int)
    clone = defaultdict(set)

    plateMapping = DictFile(mappingfile)

    # write() replaces the `print >> handle` statement (identical output on
    # Python 2, no TypeError on Python 3) and `with` closes the file, which
    # the original never did before rebinding `fw`.
    with open("MetaData.txt", "w") as fw:
        fw.write(PublicationTemplate.format(**vars) + "\n")
        fw.write(LibraryTemplate.format(**vars) + "\n")
        fw.write(ContactTemplate.format(**vars) + "\n")
    logging.debug("Meta data written to `{0}`".format(fw.name))

    for rec in SeqIO.parse(fastafile, "fasta"):
        # First pass just check well number matchings and populate sequences in
        # the same clone
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        sequencer_plate_96well_quadrant = \
            a["sequencer_plate_96well_quadrant"][0]
        sequencer_plate_96well_coordinates = \
            a["sequencer_plate_96well_coordinates"][0]

        # Check the 96-well ID is correctly converted to 384-well ID
        w96 = sequencer_plate_96well_coordinates
        w96quad = int(sequencer_plate_96well_quadrant)
        w384 = sequencer_plate_well_coordinates
        assert convert_96_to_384(w96, w96quad) == w384

        plate = sequencer_plate_barcode
        assert plate in plateMapping, \
            "{0} not found in `{1}` !".format(plate, mappingfile)

        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        # Repeated IDs get their occurrence number appended.  The counter is
        # bumped exactly once per record, as in the second pass below; a
        # second increment here used to make this pass produce suffixes
        # 3, 4, ... while the written records carried 2, 3, ..., so mate
        # lists referred to IDs that were never emitted.
        if seen[gssID] > 1:
            gssID = "{0}{1}".format(gssID, seen[gssID])

        clone[cloneID].add(gssID)

    seen = defaultdict(int)
    # Outputs are opened only after the validation pass succeeded and are
    # closed even if a record fails to format.
    with open("GSS.txt", "w") as fw, open("GSS.log", "w") as fw_log:
        for rec in SeqIO.parse(fastafile, "fasta"):
            # need to populate gssID, mateID, cloneID, seq, plate, row, column
            description = rec.description
            a = parse_description(description)
            direction = a["direction"][0]
            sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
            sequencer_plate_well_coordinates = \
                a["sequencer_plate_well_coordinates"][0]
            w384 = sequencer_plate_well_coordinates

            plate = sequencer_plate_barcode
            plate = plateMapping[plate]
            d = Directions[direction]

            row = w384[0]
            column = int(w384[1:])
            seq = fill(str(rec.seq), width=70)

            cloneID = "{0}{1}".format(plate, w384)
            gssID = "{0}{1}".format(cloneID, d)
            primer = Primers[d]
            seen[gssID] += 1

            if seen[gssID] > 1:
                logging.error("duplicate key {0} found".format(gssID))
                gssID = "{0}{1}".format(gssID, seen[gssID])

            # Every other read recorded for the same clone.
            othergss = clone[cloneID] - set([gssID])
            othergss = ", ".join(sorted(othergss))
            vars.update(locals())

            fw.write(GSSTemplate.format(**vars) + "\n")

            # Write conversion logs to log file
            fw_log.write("{0}\t{1}\n".format(gssID, description))
            fw_log.write("=" * 60 + "\n")

    logging.debug("A total of {0} seqs written to `{1}`".\
            format(len(seen), fw.name))