def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Derives query/subject bed paths from `lastfile`, switches the working
    directory to `odir` (side effect for subsequent plotting steps), writes an
    .anchors file from the LAST/BLAST hits, and symlinks the subject bed into
    `odir`.

    Returns (anchorsfile, qbedfile, contig_to_beds) where contig_to_beds maps
    each query seqid to its list of bed features.

    NOTE(review): `tourfile` is accepted but not used in this block -- confirm
    whether it is required by the caller's signature only.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)  # NB: changes process-wide cwd for downstream steps
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: keep the first two dot-separated name components
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
                    + ".anchors"
    fw = open(anchorsfile, "w")
    for b in Blast(lastfile):
        # One anchor per hit: query gene, subject gene, integer score
        print >> fw, "\t".join((gene_name(b.query), gene_name(b.subject),
                                str(int(b.score))))
    fw.close()

    # Symlink sbed so relative paths keep working inside odir
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Derives query/subject bed paths from `lastfile`, switches the working
    directory to `odir` (side effect for subsequent plotting steps), writes an
    .anchors file from the LAST/BLAST hits, and symlinks the subject bed into
    `odir`.

    Returns (anchorsfile, qbedfile, contig_to_beds) where contig_to_beds maps
    each query seqid to its list of bed features.

    NOTE(review): `tourfile` is accepted but not used in this block -- confirm
    whether it is required by the caller's signature only.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)  # NB: changes process-wide cwd for downstream steps
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: keep the first two dot-separated name components
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) + ".anchors"
    fw = open(anchorsfile, "w")
    for b in Blast(lastfile):
        # One anchor per hit: query gene, subject gene, integer score
        print >> fw, "\t".join(
            (gene_name(b.query), gene_name(b.subject), str(int(b.score))))
    fw.close()

    # Symlink sbed so relative paths keep working inside odir
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.

    A "lesion" is a position where at least --mindepth reads share the same
    left end (LE) or right end (RE); each is emitted as a 1-bp bed interval
    labeled LE-<pos>-r<count> or RE-<pos>-r<count>.
    """
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        # Pile up read endpoints per seqid; a stack of >= mindepth identical
        # ends marks a candidate lesion
        left_ends = Counter([x.start for x in feats])
        right_ends = Counter([x.end for x in feats])
        selected = []
        for le, count in left_ends.items():
            if count >= mindepth:
                selected.append((seqid, le, "LE-{0}".format(le), count))
        for re, count in right_ends.items():
            if count >= mindepth:
                selected.append((seqid, re, "RE-{0}".format(re), count))
        selected.sort()

        for seqid, pos, label, count in selected:
            label = "{0}-r{1}".format(label, count)
            # Convert 1-based position to a 0-based half-open bed interval
            print >> fw, "\t".join((seqid, str(pos - 1), str(pos), label))
    # FIX: the handle was never closed before, so output could stay unflushed
    fw.close()
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.

    Writes two files next to `bedfile`: `<prefix>.contig` (contig records with
    their placed reads) and `<prefix>.ids` (one contig ID per line).
    """
    # FIX: dropped unused `SeqIO` from the fasta import
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum)

        print >> fw_ids, ctg
        print >> fw, ctgline
        print >> fw, fill(ctgseq.seq)

        for b in reads:
            read = b.accn
            strand = b.strand
            readseq = readfasta[read]
            rc = " [RC]" if strand == "-" else ""
            readlen = len(readseq)
            # Read coordinates are reported 5'->3'; swap for reverse strand
            rstart, rend = 1, readlen
            if strand == "-":
                rstart, rend = rend, rstart

            readrange = "{{{0} {1}}}".format(rstart, rend)
            conrange = "<{0} {1}>".format(b.start, b.end)
            readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange)
            print >> fw, readline
            print >> fw, fill(readseq.seq)

    # FIX: close both outputs so contents are flushed before completion is logged
    fw_ids.close()
    fw.close()

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.

    Writes two files next to `bedfile`: `<prefix>.contig` (contig records with
    their placed reads) and `<prefix>.ids` (one contig ID per line).
    """
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum)

        print >> fw_ids, ctg
        print >> fw, ctgline
        print >> fw, fill(ctgseq.seq)

        for b in reads:
            read = b.accn
            strand = b.strand
            readseq = readfasta[read]
            rc = " [RC]" if strand == "-" else ""
            readlen = len(readseq)
            # Read coordinates are reported 5'->3'; swap for reverse strand
            rstart, rend = 1, readlen
            if strand == "-":
                rstart, rend = rend, rstart

            readrange = "{{{0} {1}}}".format(rstart, rend)
            conrange = "<{0} {1}>".format(b.start, b.end)
            readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange)
            print >> fw, readline
            print >> fw, fill(readseq.seq)

    # FIX: close both outputs so contents are flushed before completion is logged
    fw_ids.close()
    fw.close()

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def write_lst(bedfile):
    """
    Write one `.lst` file per seqid into a directory named after `bedfile`.

    Each output line is the feature's accn (spaces stripped) immediately
    followed by its strand, e.g. ``gene1+``.

    Returns (pf, stanza): the directory name and a list of (seqid, fname)
    pairs in the order the seqids were seen.
    """
    pf = op.basename(bedfile).split(".")[0]
    mkdir(pf)
    bed = Bed(bedfile)
    stanza = []
    for seqid, bs in bed.sub_beds():
        fname = op.join(pf, "{0}.lst".format(seqid))
        # FIX: context manager guarantees each per-seqid file is closed as
        # soon as it is written (previously close happened after the append,
        # risking leaked/unflushed handles)
        with open(fname, "w") as fw:
            for b in bs:
                print("{0}{1}".format(b.accn.replace(" ", ""), b.strand),
                      file=fw)
        stanza.append((seqid, fname))
    return pf, stanza
def write_lst(bedfile):
    """
    Write one `.lst` file per seqid into a directory named after `bedfile`.

    Each output line is the feature's accn (spaces stripped) immediately
    followed by its strand. Returns (pf, stanza): the directory name and a
    list of (seqid, fname) pairs.
    """
    pf = op.basename(bedfile).split(".")[0]
    mkdir(pf)
    bed = Bed(bedfile)
    stanza = []
    for seqid, bs in bed.sub_beds():
        fname = op.join(pf, "{0}.lst".format(seqid))
        fw = open(fname, "w")
        for b in bs:
            # accn with internal spaces removed, then strand, no separator
            print >> fw, "{0}{1}".format(b.accn.replace(" ", ""), b.strand)
        stanza.append((seqid, fname))
        fw.close()
    return pf, stanza
def breakpoint(args): """ %prog breakpoint blastfile bedfile Identify breakpoints where collinearity ends. `blastfile` contains mapping from markers (query) to scaffolds (subject). `bedfile` contains marker locations in the related species. """ from jcvi.formats.blast import bed from jcvi.utils.range import range_interleave p = OptionParser(breakpoint.__doc__) p.add_option("--xdist", type="int", default=20, help="xdist (in related genome) cutoff [default: %default]") p.add_option("--ydist", type="int", default=200000, help="ydist (in current genome) cutoff [default: %default]") p.add_option("-n", type="int", default=5, help="number of markers in a block [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) blastfile, bedfile = args order = Bed(bedfile).order blastbedfile = bed([blastfile]) bbed = Bed(blastbedfile) key = lambda x: x[1] for scaffold, bs in bbed.sub_beds(): blocks = get_blocks(scaffold, bs, order, xdist=opts.xdist, ydist=opts.ydist, N=opts.n) sblocks = [] for block in blocks: xx, yy = zip(*block) sblocks.append((scaffold, min(yy), max(yy))) iblocks = range_interleave(sblocks) for ib in iblocks: ch, start, end = ib print "{0}\t{1}\t{2}".format(ch, start - 1, end)
def annotate(args):
    """
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric" \
                 + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \
                 + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n" \
                     + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n" \
                     + "Use if resolving ambiguities based on `overlap` length\n" \
                     + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0]

    # Make consolidated.bed (cached: skipped if already present in cwd)
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".\
            format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle (needle is run manually by the user)
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".\
                    format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".\
                    format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".\
            format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Group accns joined by ";" in the consolidated bed into connected sets
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    afh = open(abedfile, "w")
    for accn in abedline:
        print >> afh, abedline[accn]
    afh.close()

    sort([abedfile, "-i"])
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.

    Writes `<prefix>.phases` (scaffold name and phase per line),
    `<prefix>.new.agp`, then builds `final.fasta` from the new AGP.
    """
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    fwphase = open(phasefile, "w")
    newagpfile = pf + ".new.agp"
    fwagp = open(newagpfile, "w")

    scaffoldbuckets = defaultdict(list)

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        # With --prefix, bucket by the ID prefix before the last "_";
        # otherwise bucket by the AGP object name
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand)
                                       for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    for bname, scaffolds in sorted(scaffoldbuckets.items()):
        ctgorder = []
        singletons = set()
        for scaf in sorted(scaffolds):
            for node, orientation in scaf:
                ctgorder.append((node, orientation))
            if len(scaf) == 1:
                singletons.add(node)
        nscaffolds = len(scaffolds)
        nsingletons = len(singletons)
        # Phase codes: 3 = lone singleton, 2 = single resolved scaffold,
        # 1 = mixture
        if nsingletons == 1 and nscaffolds == 0:
            phase = 3
        elif nsingletons == 0 and nscaffolds == 1:
            phase = 2
        else:
            phase = 1

        msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
            format(bname, nscaffolds, nsingletons, phase)
        print >> sys.stderr, msg
        print >> fwphase, "\t".join((bname, str(phase)))

        order_to_agp(bname, ctgorder, sizes, fwagp)

    # FIX: also close the phases file (was left open before)
    fwphase.close()
    fwagp.close()
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.

    Interpolates new gene IDs for stretches tagged NEW between flanking FRAME
    genes, avoiding blacklisted IDs and big gaps. Writes `<prefix>.new.bed`.
    """
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"
    fw = open(outputbed, "w")

    # The tag is the token after the last "|" in the name field
    tagkey = lambda x: x.rsplit("|", 1)[-1]
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        sbed = list(sbed)

        # Collect indices of NEW/FRAME features in chromosome order
        ranks = []
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            tag = tagkey(nametag)
            if tag in (NEW, FRAME):
                ranks.append((i, nametag))

        # Compress consecutive same-tag runs: NEW runs keep the bed features,
        # FRAME runs are reduced to their first/last rank numbers
        blocks = []
        for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
            names = list(names)
            if tag == NEW:
                blocks.append((tag, [sbed[x[0]] for x in names]))
            else:
                start, end = names[0][-1], names[-1][-1]
                start, end = atg_name(start, retval="rank"), atg_name(end, retval="rank")
                blocks.append((tag, [start, end]))

        id_table = {}  # old to new name conversion
        for i, (tag, info) in enumerate(blocks):
            if tag != NEW:
                continue

            # Bounds come from flanking FRAME blocks; pad generously when the
            # NEW run is at either end of the chromosome
            start_id = 0 if i == 0 else blocks[i - 1][1][-1]
            end_id = start_id + 10000 if i == len(blocks) -1 \
                        else blocks[i + 1][1][0]

            r.allocate(info, chr, start_id, end_id, id_table)

        # Output new names
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            name, tag = nametag.split("|")

            if tag == NEW:
                assert name == '.'
                name = id_table[s.accn]
            elif tag == OVERLAP:
                if name in id_table:
                    name = id_table[name]

            s.extra[0] = "|".join((name, tag))
            print >> fw, s

    fw.close()
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.

    Each feature is tagged FRAME (fits the longest increasing rank order),
    RETAIN (known accn, out of frame), NEW (unknown accn), or OVERLAP
    (a better-tagged member of its overlap group wins). Output goes to stdout
    as "<bedline>\\t<name>|<tag>".
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    # Regenerate the .a/.b beds only when the input is newer
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Group accns joined by ";" into overlap sets
    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            # Skip accns whose encoded chromosome disagrees with the seqid
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        # Genes on the longest increasing rank subsequence keep their frame
        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        # Accepted names in both gene ("") and TE ("te") separators
        granks = set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, uc=opts.uc) for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn]
            # Lowest PRIORITY index wins within an overlap group
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))
def annotate(args):
    """
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric" \
                 + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \
                 + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n" \
                     + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n" \
                     + "Use if resolving ambiguities based on `overlap` length\n" \
                     + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0]

    # Make consolidated.bed (cached: skipped if already present in cwd)
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".\
            format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle (needle is run manually by the user)
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".\
                    format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".\
                    format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".\
            format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Group accns joined by ";" in the consolidated bed into connected sets
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    afh = open(abedfile, "w")
    for accn in abedline:
        print >> afh, abedline[accn]
    afh.close()

    sort([abedfile, "-i"])
def cut(args):
    """
    %prog cut agpfile bedfile

    Cut at the boundaries of the ranges in the bedfile.

    For each component mentioned in `bedfile`, its AGP line is replaced by a
    series of sub-component lines split at the cut points. Writes and reindexes
    `<agpfile>.cut.agp`, and returns its path.
    """
    p = OptionParser(cut.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    newagpfile = agpfile.replace(".agp", ".cut.agp")
    fw = open(newagpfile, "w")

    agp_fixes = defaultdict(list)
    for component, intervals in bed.sub_beds():
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end becomes the failure message);
        # possibly intended as a range check -- confirm
        assert a.component_beg, a.component_end
        # NOTE(review): `arange` is computed but never used in this block
        arange = a.component_beg, a.component_end

        # Cut points: all interval boundaries plus the two component ends
        cuts = set()
        for i in intervals:
            start, end = i.start, i.end
            end -= 1

            assert start <= end
            cuts.add(start)
            cuts.add(end)

        cuts.add(0)
        cuts.add(component_span)
        cuts = list(sorted(cuts))

        sum_of_spans = 0
        # Reusing `i`, `a`, `b` here shadows the earlier loop variables
        for i, (a, b) in enumerate(pairwise(cuts)):
            oid = object + "_{0}".format(i)
            aline = [oid, 0, 0, 0]
            cspan = b - a
            aline += ['D', component, a + 1, b, orientation]
            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            agp_fixes[component].append(aline)

        # Sanity: the pieces must add back up to the original span
        assert component_span == sum_of_spans

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.

    Interpolates new gene IDs for stretches tagged NEW between flanking FRAME
    genes, avoiding blacklisted IDs and big gaps. Writes `<prefix>.new.bed`.
    """
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    p.add_option("--extended_stride", default=False, action="store_true",
                 help="Toggle extended strides for gene numbering")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"
    fw = open(outputbed, "w")

    # The tag is the token after the last "|" in the name field
    tagkey = lambda x: x.rsplit("|", 1)[-1]
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        sbed = list(sbed)

        # Collect indices of NEW/FRAME features in chromosome order
        ranks = []
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            tag = tagkey(nametag)
            if tag in (NEW, FRAME):
                ranks.append((i, nametag))

        # Compress consecutive same-tag runs: NEW runs keep the bed features,
        # FRAME runs are reduced to their first/last rank numbers
        blocks = []
        for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
            names = list(names)
            if tag == NEW:
                blocks.append((tag, [sbed[x[0]] for x in names]))
            else:
                start, end = names[0][-1], names[-1][-1]
                start, end = atg_name(start, retval="rank"), atg_name(end, retval="rank")
                blocks.append((tag, [start, end]))

        id_table = {}  # old to new name conversion
        for i, (tag, info) in enumerate(blocks):
            if tag != NEW:
                continue

            # Bounds come from flanking FRAME blocks; pad generously when the
            # NEW run is at either end of the chromosome
            start_id = 0 if i == 0 else blocks[i - 1][1][-1]
            end_id = start_id + 10000 if i == len(blocks) -1 \
                        else blocks[i + 1][1][0]

            r.allocate(info, chr, start_id, end_id, id_table,
                       extended_stride=opts.extended_stride)

        # Output new names
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            name, tag = nametag.split("|")

            if tag == NEW:
                assert name == '.'
                name = id_table[s.accn]
            elif tag == OVERLAP:
                if name in id_table:
                    name = id_table[name]

            s.extra[0] = "|".join((name, tag))
            print >> fw, s

    fw.close()
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in componets to gaps.

    Each component's AGP line is replaced by alternating sequence ('D') and
    gap ('N') lines derived from the bed ranges. Writes and reindexes
    `<agpfile>.masked.agp`, and returns its path.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    if opts.log:
        fwlog = open(logfile, "w")

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation
        if opts.log:
            print >> fwlog, a

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end is the failure message)
        assert a.component_beg, a.component_end
        arange = a.component_beg, a.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for i in intervals:
            iv = range_intersect(arange, (i.start, i.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = a.component_beg - 1, a.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()

        sum_of_spans = 0
        # assign complements as sequence components
        # (loop reuses names `a`/`b`/`i`, shadowing the outer bindings)
        for i, (a, b) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                a, b = b, a
            if orientation not in ('+', '-'):
                orientation = '+'

            oid = object + "_{0}".format(i / 2) if opts.split else object
            aline = [oid, 0, 0, 0]
            # Even-index pairs are kept sequence; odd-index pairs become gaps
            if i % 2 == 0:
                cspan = b - a - 1
                aline += ['D', component, a + 1, b - 1, orientation]
                is_gap = False
            else:
                cspan = b - a + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan
            aline = "\t".join(str(x) for x in aline)
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        # Sanity: pieces plus gaps must add back up to the original span
        assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def cut(args):
    """
    %prog cut agpfile bedfile

    Cut at the boundaries of the ranges in the bedfile.
    Use --shrink to control the exact boundaries where you cut.

    NOTE(review): no `--shrink` option is registered in this block despite the
    docstring -- confirm whether it was removed or lives elsewhere.

    For each component mentioned in `bedfile`, its AGP line is replaced by a
    series of sub-component lines split at the cut points. Writes and reindexes
    `<agpfile>.cut.agp`, and returns its path.
    """
    p = OptionParser(cut.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    newagpfile = agpfile.replace(".agp", ".cut.agp")
    fw = open(newagpfile, "w")

    agp_fixes = defaultdict(list)
    for component, intervals in bed.sub_beds():
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end becomes the failure message)
        assert a.component_beg, a.component_end
        # NOTE(review): `arange` is computed but never used in this block
        arange = a.component_beg, a.component_end

        # Cut points: all interval boundaries plus the two component ends
        cuts = set()
        for i in intervals:
            start, end = i.start, i.end
            end -= 1

            assert start <= end
            cuts.add(start)
            cuts.add(end)

        cuts.add(0)
        cuts.add(component_span)
        cuts = list(sorted(cuts))

        sum_of_spans = 0
        # Reusing `i`, `a`, `b` here shadows the earlier loop variables
        for i, (a, b) in enumerate(pairwise(cuts)):
            oid = object + "_{0}".format(i)
            aline = [oid, 0, 0, 0]
            cspan = b - a
            aline += ['D', component, a + 1, b, orientation]
            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            agp_fixes[component].append(aline)

        # Sanity: the pieces must add back up to the original span
        assert component_span == sum_of_spans

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.

    Each component's AGP line is replaced by alternating sequence ('D') and
    gap ('N') lines derived from the bed ranges. Writes and reindexes
    `<agpfile>.masked.agp`, and returns its path.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    if opts.log:
        fwlog = open(logfile, "w")

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation
        if opts.log:
            print >> fwlog, a

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end is the failure message)
        assert a.component_beg, a.component_end
        arange = a.component_beg, a.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for i in intervals:
            iv = range_intersect(arange, (i.start, i.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = a.component_beg - 1, a.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()

        sum_of_spans = 0
        # assign complements as sequence components
        # (loop reuses names `a`/`b`/`i`, shadowing the outer bindings)
        for i, (a, b) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                a, b = b, a
            if orientation not in ('+', '-'):
                orientation = '+'

            oid = object + "_{0}".format(i / 2) if opts.split else object
            aline = [oid, 0, 0, 0]
            # Even-index pairs are kept sequence; odd-index pairs become gaps
            if i % 2 == 0:
                cspan = b - a - 1
                aline += ['D', component, a + 1, b - 1, orientation]
                is_gap = False
            else:
                cspan = b - a + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan
            aline = "\t".join(str(x) for x in aline)
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        # NOTE(review): this span sanity check is disabled in this version
        # (cf. the sibling `mask` where it is active) -- confirm intent
        #assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option("--rclip", default=1, type="int",
            help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option("--maxsize", default=1000000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.add_option("--strict", default=False, action="store_true",
            help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    # Map patchers against the alternative assembly
    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"
    fwa = open(beforebed, "w")
    fwb = open(afterbed, "w")

    key1 = lambda x: x.query
    # NOTE(review): precedence bug suspected -- the conditional is INSIDE the
    # lambda body, so with rclip == 0 this lambda returns the function `key1`
    # as the group key instead of using key1 itself; likely intended
    # `(lambda x: x.query[:-rclip]) if rclip else key1`. Confirm.
    key2 = lambda x: x.query[:-rclip] if rclip else key1
    data = BlastSlow(blastfile)

    # Pair up the L/R flank hits of each patcher
    for pe, lines in groupby(data, key=key2):
        lines = list(lines)
        if len(lines) != 2:
            continue

        a, b = lines

        aquery, bquery = a.query, b.query
        asubject, bsubject = a.subject, b.subject
        if asubject != bsubject:
            continue

        astrand, bstrand = a.orientation, b.orientation
        assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery))

        ai, ax = order[aquery]
        bi, bx = order[bquery]
        qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

        # Both flanks must agree in orientation on the subject
        if astrand == '+' and bstrand == '+':
            sstart, sstop = a.sstart, b.sstop

        elif astrand == '-' and bstrand == '-':
            sstart, sstop = b.sstart, a.sstop

        else:
            continue

        if sstart > sstop:
            continue

        # Skip oversized replacements
        if sstop > sstart + Max:
            continue

        name = aquery[:-1] + "LR"
        print >> fwa, "\t".join(str(x) for x in \
                (ax.seqid, qstart - 1, qstop, name, 1000, "+"))
        print >> fwb, "\t".join(str(x) for x in \
                (asubject, sstart - 1, sstop, name, 1000, astrand))

    fwa.close()
    fwb.close()

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            # Strict mode: keep only gap-free replacements
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math
    # Zero-pad width so generated accns sort lexicographically
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        # Complement of the patched ranges stays as backbone sequence
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1

        # Start the interleave with whichever list covers position 1
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons: backbone seqids untouched by any patch
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)
        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)