Beispiel #1
0
def extract_ends(rec, sites, flank, fw, maxfragsize=800):
    """
    Write the flanks that face short fragments around each site.

    For every position in `sites`, up to `flank` bases to the left and/or
    right of the site are sliced from `rec.seq`, stripped of leading and
    trailing N/n, and written to `fw` in FASTA format. A side is written only
    when the distance from the site to its neighbor on that side (the previous
    or next site, or the start/end of `rec` for the outermost sites) is at
    most `maxfragsize`. Record ids are "<rec.name>:<site>L" and
    "<rec.name>:<site>R".
    """
    last = len(sites) - 1
    total = len(rec)
    for idx, site in enumerate(sites):
        base_id = "{0}:{1}".format(rec.name, site)
        flanks = []

        # Left neighbor is the contig start for the first site, otherwise the
        # previous site.
        gap_before = site if idx == 0 else site - sites[idx - 1]
        if gap_before <= maxfragsize:
            upstream = rec.seq[max(site - flank, 0):site].strip("Nn")
            flanks.append(
                SeqRecord(upstream, id=base_id + "L", description="")
            )

        # Right neighbor is the contig end for the last site, otherwise the
        # next site.
        gap_after = total - site if idx == last else sites[idx + 1] - site
        if gap_after <= maxfragsize:
            downstream = rec.seq[site:min(site + flank, total)].strip("Nn")
            flanks.append(
                SeqRecord(downstream, id=base_id + "R", description="")
            )

        SeqIO.write(flanks, fw, "fasta")
Beispiel #2
0
def emitFragment(fw,
                 fragID,
                 libID,
                 shredded_seq,
                 clr=None,
                 qvchar="l",
                 fasta=False):
    """
    Write one shredded read to `fw`.

    With `fasta` set, the read is written as a single FASTA record whose id is
    `fragID`. Otherwise `frgTemplate` is filled in with the ids, the sequence
    text, a quality string made of `qvchar` repeated once per base, and the
    clear range, and the result is printed to `fw`. `clr` is a
    (begin, end) pair; when it is None the whole read is used.
    """
    if fasta:
        record = SeqRecord(shredded_seq, id=fragID, description="")
        SeqIO.write([record], fw, "fasta")
        return

    bases = str(shredded_seq)
    nbases = len(bases)
    quals = qvchar * nbases  # same quality character for every base

    # Fall back to the full read when no clear range was supplied.
    clr_beg, clr_end = (0, nbases) if clr is None else clr

    message = frgTemplate.format(
        fragID=fragID,
        libID=libID,
        seq=bases,
        qvs=quals,
        clr_beg=clr_beg,
        clr_end=clr_end,
    )
    print(message, file=fw)
Beispiel #3
0
def extract_full(rec, sites, flank, fw):
    """
    Write the window of `flank` bases on both sides of every site.

    For each position in `sites`, the slice of `rec.seq` from `site - flank`
    to `site + flank` (clamped to the bounds of `rec`) is stripped of leading
    and trailing N/n and written to `fw` as a FASTA record with id
    "<rec.name>:<site>".
    """
    for site in sites:
        ident = "{0}:{1}".format(rec.name, site)
        lo = max(site - flank, 0)
        hi = min(site + flank, len(rec))
        window = rec.seq[lo:hi].strip("Nn")
        SeqIO.write(
            [SeqRecord(window, id=ident, description="")], fw, "fasta"
        )
Beispiel #4
0
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is runtime output and must not be reworded casually.
    from jcvi.assembly.goldenpath import overlap

    p = OptionParser(circular.__doc__)
    p.add_option(
        "--flip",
        default=False,
        action="store_true",
        help="Reverse complement the sequence",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # Only the first record yielded by parse_fasta() is used.
    key, seq = next(parse_fasta(fastafile))
    # NOTE(review): the usage text says startpos is 1-based, but it is used
    # directly as a 0-based slice index here -- confirm which is intended.
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    # The two halves are written to scratch files in the current directory so
    # overlap() can compare them. The try/finally guarantees the scratch files
    # are cleaned up even when writing or overlap() fails.
    try:
        for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
            fw = must_open(f, "w")
            try:
                print(">{0}\n{1}".format(f, s), file=fw)
            finally:
                fw.close()

        o = overlap([aseqfile, bseqfile])
        # Rotated sequence: the first half up to o.qstop followed by the
        # second half from o.sstop on (presumably trimming the self-overlap
        # reported by overlap() -- verify against its documentation).
        seq = aseq[:o.qstop] + bseq[o.sstop:]
    finally:
        for f in (aseqfile, bseqfile):
            # A file may be missing if the failure happened before it was
            # created; do not mask the original error with a second one.
            if os.path.exists(f):
                os.remove(f)

    seq = Seq(seq)

    if opts.flip:
        seq = seq.reverse_complement()

    fw = must_open(opts.outfile, "w")
    try:
        rec = SeqRecord(seq, id=key, description="")
        SeqIO.write([rec], fw, "fasta")
    finally:
        fw.close()
Beispiel #5
0
def emitFragment(fw, fragID, libID, shredded_seq, fasta=False):
    """
    Print out the shredded sequence.

    With `fasta` set, the read is written to `fw` as a single FASTA record
    whose id is `fragID`. Otherwise `frgTemplate` is filled in with the ids,
    the wrapped sequence, a wrapped quality string of `DEFAULTQV` repeated
    once per base, and the read length, and the result is written to `fw`
    followed by a newline.
    """
    if fasta:
        s = SeqRecord(shredded_seq, id=fragID, description="")
        SeqIO.write([s], fw, "fasta")
        return

    seq = str(shredded_seq)
    slen = len(seq)
    qvs = DEFAULTQV * slen  # shredded reads have default low qv

    frg = frgTemplate.format(fragID=fragID, libID=libID,
        seq=fill(seq), qvs=fill(qvs), slen=slen)
    # Written with fw.write() rather than the Python 2 `print >> fw`
    # statement, which raises TypeError on Python 3; the trailing newline
    # reproduces what the print statement emitted.
    fw.write(frg + "\n")
Beispiel #6
0
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is runtime output and must not be reworded casually.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents",
        dest="parents",
        default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children",
        dest="children",
        default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not` turns it into a truthy value
        # and the process exits with a non-zero status on a usage error.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Collect (sequence, feature) pairs for the wanted child types.
        children = []
        # NOTE(review): the second argument (1) presumably limits the lookup
        # to direct children -- confirm against the index API.
        for c in g.children(feat.id, 1):

            if c.featuretype not in children_list:
                continue
            child = f.sequence(
                dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand))
            children.append((child, c))

        if not children:
            # sys.stderr.write() replaces the Python 2 `print >>sys.stderr`
            # statement, which raises TypeError on Python 3.
            sys.stderr.write(
                "[warning] %s has no children with type %s"
                % (feat.id, ','.join(children_list)) + "\n")
            continue
        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Description is the comma-joined values of the requested attribute
        # (empty when none was requested or the feature lacks it), with
        # double quotes removed.
        description = ",".join(feat.attributes[attr]) \
                if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        # Flush after every record so output appears as features are done.
        fw.flush()