Example #1
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to the MAC genome. Output a bedfile
    with 'lesions' (stacks of broken reads) in the MAC genome.
    """
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        left_ends = Counter([x.start for x in feats])
        right_ends = Counter([x.end for x in feats])
        selected = []
        for le, count in left_ends.items():
            if count >= mindepth:
                selected.append((seqid, le, "LE-{0}".format(le), count))
        for re, count in right_ends.items():
            if count >= mindepth:
                selected.append((seqid, re, "RE-{0}".format(re), count))
        selected.sort()
        for seqid, pos, label, count in selected:
            label = "{0}-r{1}".format(label, count)
        print("\t".join((seqid, str(pos - 1), str(pos), label)), file=fw)
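
A minimal, self-contained sketch of the endpoint-stacking idea above, using made-up (start, end) read coordinates on a single seqid (threshold and numbers are illustrative only):

from collections import Counter

reads = [(100, 150), (100, 150), (100, 152), (97, 150)]
mindepth = 3
left_ends = Counter(start for start, _ in reads)
right_ends = Counter(end for _, end in reads)
print([le for le, c in left_ends.items() if c >= mindepth])   # [100]
print([re for re, c in right_ends.items() if c >= mindepth])  # [150]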
Example #2
def __init__(self, haplotype_set, maf=.1):
    self.haplotype_set = haplotype_set
    self.nind = len(haplotype_set)
    self.notmissing = sum(1 for x in haplotype_set if x)
    counter = Counter()
    for haplotypes in haplotype_set:
        counter.update(Counter(haplotypes))
    self.counter = {}
    for h, c in counter.items():
        if c >= self.notmissing * maf:
            self.counter[h] = c
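
The minor-allele-frequency filter above can be illustrated on a toy haplotype_set (one set of observed haplotype strings per individual; an empty set marks a missing individual; all values here are invented):

from collections import Counter

haplotype_set = [{"ACG"}, {"ACG", "ATG"}, set(), {"ATG"}, {"AAA"}]
notmissing = sum(1 for x in haplotype_set if x)  # 4
counter = Counter()
for haplotypes in haplotype_set:
    counter.update(Counter(haplotypes))
# With maf=.5, keep haplotypes seen in at least half the non-missing individuals
kept = {h: c for h, c in counter.items() if c >= notmissing * .5}
print(kept)  # {'ACG': 2, 'ATG': 2} -- the singleton 'AAA' is dropped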
Example #3
def calc_ldscore(a, b):
    assert len(a) == len(b), "{0}\n{1}".format(a, b)
    # Assumes markers as A/B
    c = Counter(zip(a, b))
    c_aa = c[('A', 'A')]
    c_ab = c[('A', 'B')]
    c_ba = c[('B', 'A')]
    c_bb = c[('B', 'B')]
    n = c_aa + c_ab + c_ba + c_bb
    if n == 0:
        return 0

    f = 1. / n
    x_aa = c_aa * f
    x_ab = c_ab * f
    x_ba = c_ba * f
    x_bb = c_bb * f
    p_a = x_aa + x_ab
    p_b = x_ba + x_bb
    q_a = x_aa + x_ba
    q_b = x_ab + x_bb
    D = x_aa - p_a * q_a
    denominator = p_a * p_b * q_a * q_b
    if denominator == 0:
        return 0

    r2 = D * D / denominator
    return r2
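
A quick sanity check of calc_ldscore on toy marker strings (values invented): perfect linkage gives r2 = 1, while independent markers give r2 = 0.

a = "AABBAB"
print(calc_ldscore(a, a))            # 1.0 -- a marker is in perfect LD with itself
print(calc_ldscore("AABB", "ABAB"))  # 0.0 -- all four combinations equally frequent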
Example #4
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate the current folder after a MAKER run and check for failures.
    Failed batches will be written to a directory for additional work.
    """
    from jcvi.utils.counter import Counter

    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    dslogs = [dsfile.format(x, suffix) for x in fsnames]
    all_failed = []
    for f, d in zip(fsnames, dslogs):
        dslog = DatastoreIndexFile(d)
        counter.update(dslog.scaffold_status.values())
        all_failed.extend([(f, x) for x in dslog.failed])

    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    fw = open(failed, "w")
    print("\n".join(["\t".join((f, x)) for f, x in all_failed]), file=fw)
    fw.close()

    nlines = sum(1 for x in open("FAILED"))
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {0} {1} {2}".\
                    format(genome, failed_ids, failed_fasta)
        sh(cmd)
Example #5
def contrast_stores(bed1_store_r,
                    bed2_store,
                    minreads=10,
                    minpct=.1,
                    prefix="AB"):
    for target, reads in bed1_store_r.items():
        nreads = len(reads)
        if nreads < minreads:
            continue
        good_mapping = max(minreads / 2, minpct * nreads)
        bed2_targets = Counter(bed2_store.get(r) for r in reads)
        c = dict(
            (k, v) for (k, v) in bed2_targets.items() if v >= good_mapping)
        ctag = "|".join("{0}({1})".format(k, v) for (k, v) in c.items())
        print(prefix, target, nreads, ctag, len(set(c.keys()) - set([None])))
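
To see the thresholding in action, a toy run (read and target names invented): 12 reads hit one target in the first bed; 9 of them map consistently to "tB" in the second, which clears the good_mapping cutoff.

from collections import Counter

reads = ["r%d" % i for i in range(12)]
bed2_store = {r: "tB" for r in reads[:9]}              # 9 reads agree on tB
bed2_store[reads[9]] = "tC"                            # one stray mapping
minreads, minpct = 10, .1
good_mapping = max(minreads / 2, minpct * len(reads))  # 5.0
c = Counter(bed2_store.get(r) for r in reads)          # tB: 9, None: 2, tC: 1
print({k: v for k, v in c.items() if v >= good_mapping})  # {'tB': 9}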
Example #6
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity",
                 choices=("breakpoint", "variant"),
                 default="variant",
                 help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    fw = open(novelbedfile, "w")
    for b in bed:
        accns = b.accn.split(',')
        pfs_accns = [x.split("-")[0] for x in accns]
        pfs_counts = Counter(pfs_accns)
        if len(pfs_counts) != 3:
            print(b, file=fw)
            continue

        valid += 1
        total_counts += pfs_counts
        F1_counts.append(pfs_counts[F1])

        # Collect breakpoint positions between P1 and F1
        P1_accns = [x for x in accns if x.split("-")[0] == P1]
        F1_accns = [x for x in accns if x.split("-")[0] == F1]
        if len(P1_accns) != 1:
            continue

        ri, ref = neworder[P1_accns[0]]
        # Compare each F1 breakpoint against the single P1 reference interval
        F1_beds = [neworder[x][-1] for x in F1_accns]
        bp_diff.extend(x.start - ref.start for x in F1_beds)
        bp_diff.extend(x.end - ref.end for x in F1_beds)

    print("A total of {0} sites show consistent deletions across samples.".\
                    format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".\
                    format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    fig = plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for l, h in zip(left, height):
            print("{0:>9} variants: {1}".format(l, h), file=sys.stderr)
            plt.text(l,
                     h + 5,
                     str(h),
                     color="darkslategray",
                     size=8,
                     ha="center",
                     va="bottom",
                     rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        for l, h in list(zip(left, height))[:21]:
            plt.text(l,
                     h + 50,
                     str(h),
                     color="darkslategray",
                     size=8,
                     ha="center",
                     va="bottom",
                     rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")
        # Serialize the data to a file
        fw = open("Breakpoint-offset-histogram.csv", "w")
        for k, v in sorted(bp_diff.items()):
            print("{0},{1}".format(k, v), file=fw)
        fw.close()

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)
Example #7
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth",
                 default=3,
                 type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan",
                 default=30,
                 type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split",
                 default=False,
                 action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in \
                            (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the deletions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in \
                            (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that retain read depth above the outlier cutoff
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug(
            "Bounds for depths: LB={0:.2f} (ignored)  UB={1:.2f}".format(
                lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that contain sequencing gaps on their flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        branges = [Range(x.seqid, x.start, x.end, int(x.accn.rsplit("r")[-1]), i) \
                        for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".\
                        format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in \
                            (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()
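
The uniqify step above boils down to counting identical (seqid, start, end) ranges across reads; a toy illustration (coordinates invented):

from collections import Counter

ranges = [("seq1", 500, 620)] * 4 + [("seq1", 900, 950)]
mindepth = 3
registry = Counter(ranges)
print([(r, c) for r, c in registry.items() if c >= mindepth])
# [(('seq1', 500, 620), 4)] -- the singleton range falls below mindepth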
Example #8
def pixel_stats(img):
    # p_round is a rounding helper defined elsewhere in the module; it
    # presumably coarsens each channel so near-identical colors collapse
    img = [(p_round(r), p_round(g), p_round(b)) for r, g, b in img]
    c = Counter(img)
    imgx, count = c.most_common(1)[0]
    return imgx
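
A self-contained sketch with a hypothetical stand-in for p_round (assumed here to round to the nearest 10; the real helper lives elsewhere in the module):

from collections import Counter

def p_round(x, base=10):  # hypothetical stand-in, not the original helper
    return base * int(round(x / base))

img = [(12, 200, 33), (8, 201, 29), (250, 0, 0)]
rounded = [(p_round(r), p_round(g), p_round(b)) for r, g, b in img]
print(Counter(rounded).most_common(1)[0][0])  # (10, 200, 30)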
Example #9
def tally_markers(self, markers):
    counter = Counter([x.seqid for x in markers])
    self.scaffold_1m = len([x for x in counter.values() if x == 1])
    self.scaffold_2m = len([x for x in counter.values() if x == 2])
    self.scaffold_3m = len([x for x in counter.values() if x == 3])
    self.scaffold_4m = len([x for x in counter.values() if x >= 4])
Example #10
def mlg_counts(self):
    return Counter([x.mlg for x in self.markers])
Example #11
def resolve(args):
    """
    %prog resolve matrixfile fastafile bamfolder

    Separate repeats along collapsed contigs. First scan the matrixfile for
    largely heterozygous sites. For each heterozygous site, we scan each bam to
    retrieve distinct haplotypes. The frequency of each haplotype is then
    computed; the haplotype with the highest frequency, assumed to be
    paralogous, is removed.
    """
    import pysam
    from collections import defaultdict
    from itertools import groupby

    p = OptionParser(resolve.__doc__)
    p.add_option("--missing", default=.5, help="Maximum level of missing data")
    p.add_option("--het",
                 default=.5,
                 help="Maximum level of heterozygous calls")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    matrixfile, fastafile, bamfolder = args
    #f = Fasta(fastafile)
    fp = open(matrixfile)
    for row in fp:
        if row[0] != '#':
            break
    header = row.split()
    ngenotypes = len(header) - 4
    nmissing = int(round(opts.missing * ngenotypes))
    logging.debug("A total of {0} individuals scanned".format(ngenotypes))
    logging.debug("Look for markers with < {0} missing and > {1} het".\
                    format(opts.missing, opts.het))
    bamfiles = iglob(bamfolder, "*.bam")
    logging.debug("Folder `{0}` contained {1} bam files".\
                    format(bamfolder, len(bamfiles)))

    data = []
    for row in fp:
        if row[0] == '#':
            continue
        atoms = row.split()
        seqid, pos, ref, alt = atoms[:4]
        genotypes = atoms[4:]
        c = Counter(genotypes)
        c0 = c.get('0', 0)
        c3 = c.get('3', 0)
        if c0 >= nmissing:
            continue
        hetratio = c3 * 1. / (ngenotypes - c0)
        if hetratio <= opts.het:
            continue
        pos = int(pos)
        data.append((seqid, pos, ref, alt, c, hetratio))

    data.sort()
    logging.debug("A total of {0} target markers in {1} contigs.".\
                    format(len(data), len(set(x[0] for x in data))))
    samfiles = [pysam.AlignmentFile(x, "rb") for x in bamfiles]
    samfiles = [(op.basename(x.filename).split(".")[0], x) for x in samfiles]
    samfiles.sort()
    logging.debug("BAM files grouped to {0} individuals".\
                    format(len(set(x[0] for x in samfiles))))

    fw = must_open(opts.outfile, "w")
    for seqid, d in groupby(data, lambda x: x[0]):
        d = list(d)
        nmarkers = len(d)
        logging.debug("Process contig {0} ({1} markers)".format(
            seqid, nmarkers))
        haplotype_set = []
        for pf, sf in groupby(samfiles, key=lambda x: x[0]):
            haplotypes = []
            for pfi, samfile in sf:
                reads = defaultdict(list)
                positions = []
                for s, pos, ref, alt, c, hetratio in d:
                    # `col` avoids shadowing the genotype Counter `c` above
                    for col in samfile.pileup(seqid):
                        if col.reference_pos != pos - 1:
                            continue
                        for r in col.pileups:
                            rname = r.alignment.query_name
                            rbase = r.alignment.query_sequence[
                                r.query_position]
                            reads[rname].append((pos, rbase))
                    positions.append(pos)
                for read in reads.values():
                    hap = ['-'] * nmarkers
                    for p, rbase in read:
                        hap[positions.index(p)] = rbase
                    hap = "".join(hap)
                    if "-" in hap:
                        continue
                    haplotypes.append(hap)
            haplotypes = set(haplotypes)
            haplotype_set.append(haplotypes)
        hr = HaplotypeResolver(haplotype_set)
        print(seqid, hr, file=fw)
        hr.solve(fw)
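
The marker filter keeps sites with few missing calls ('0') and a high fraction of heterozygous calls ('3', per the matrix coding above); a toy genotype row (values invented):

from collections import Counter

genotypes = ['0', '1', '3', '3', '2', '3']
c = Counter(genotypes)
c0, c3 = c.get('0', 0), c.get('3', 0)
hetratio = c3 * 1. / (len(genotypes) - c0)
print(hetratio)  # 0.6 -- passes a --het threshold of .5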
Example #12
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF file which can be fed
    into Gephi to check the topology of the best overlap graph. Mutual
    best edges are represented as thicker edges.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    p = OptionParser(graph.__doc__)
    p.add_option(
        "--query",
        default=-1,
        type="int",
        help="Search from node, -1 to select random node, 0 to disable")
    p.add_option("--contig", help="Search from contigs, use comma to separate")
    p.add_option("--largest",
                 default=0,
                 type="int",
                 help="Only show largest components")
    p.add_option("--maxsize", default=500, type="int", help="Max graph size")
    p.add_option("--nomutualbest",
                 default=False,
                 action="store_true",
                 help="Do not plot mutual best edges as heavy")
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    query = opts.query
    contig = opts.contig
    largest = opts.largest
    frgctg = opts.frgctg
    edgeweight = not opts.nomutualbest
    G = read_graph(bestedges, maxerr=opts.maxerr)

    if largest:
        H = list(nx.connected_component_subgraphs(G))
        c = min(len(H), largest)
        logging.debug("{0} components found, {1} retained".format(len(H), c))
        G = nx.Graph()
        for x in H[:c]:
            G.add_edges_from(x.edges())

    if query:
        if query == -1:
            query = choice(G.nodes())
        reads_to_ctgs = parse_ctgs(bestedges, frgctg)
        if contig:
            contigs = set(contig.split(","))
            core = [k for k, v in reads_to_ctgs.items() if v in contigs]
        else:
            ctg = reads_to_ctgs.get(query)
            core = [k for k, v in reads_to_ctgs.items() if v == ctg]
            logging.debug("Reads ({0}) extended from the same contig {1}".\
                          format(len(core), ctg))

        # Extract a local neighborhood
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=core, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges(data=edgeweight))
        G = SG

        seen = []
        for n, attrib in G.nodes_iter(data=True):
            contig = reads_to_ctgs.get(n, "na")
            attrib['label'] = contig
            seen.append(contig)
        c = Counter(seen)
        cc = ["{0}({1})".format(k, v) for k, v in c.most_common()]
        print("Contigs: {0}".format(" ".join(cc)), file=sys.stderr)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug("Graph written to `{0}` (|V|={1}, |E|={2})".\
                    format(gexf, len(G), G.size()))
Example #13
def read_graph(bestedges, maxerr=100, directed=False):
    logging.debug("Max error = {0}%".format(maxerr))
    tag = "dir." if directed else ""
    bestgraph = bestedges.split(".")[0] + ".err{0}.{1}graph".format(
        maxerr, tag)
    if need_update(bestedges, bestgraph):
        G = {} if directed else nx.Graph()
        fp = open(bestedges)
        best_store = {}
        for row in fp:
            if row[0] == '#':
                continue
            id1, lib_id, best5, o5, best3, o3, j1, j2 = row.split()
            id1, best5, best3 = int(id1), int(best5), int(best3)
            j1, j2 = float(j1), float(j2)
            if j1 <= maxerr or j2 <= maxerr:
                if not directed:
                    G.add_node(id1)
                id1p5, id1p3 = "{0}-5'".format(id1), "{0}-3'".format(id1)
                best5o5 = "{0}-{1}".format(best5, o5)
                best3o3 = "{0}-{1}".format(best3, o3)
                best_store[id1p5] = best5o5
                best_store[id1p3] = best3o3
            if best5 and j1 <= maxerr:
                if directed:
                    G[id1p5] = best5o5
                else:
                    G.add_edge(best5, id1, weight=10)
            if best3 and j2 <= maxerr:
                if directed:
                    G[id1p3] = best3o3
                else:
                    G.add_edge(id1, best3, weight=10)

        # Annotate mutual best links with a heavier edge weight. The weight
        # (11) is set close to 10 to minimize the impact on the layout
        # (Yifan Hu's multilevel)
        nmutuals = 0
        for k, v in best_store.items():
            if best_store.get(v) == k and k < v:
                if not directed:  # edge weights only exist on the nx graph
                    k, v = int(k.split("-")[0]), int(v.split("-")[0])
                    G[k][v]["weight"] = 11
                nmutuals += 1
        logging.debug("Mutual best edges: {0}".format(nmutuals))

        if directed:
            fw = open(bestgraph, "w")
            dump(G, fw)
            fw.close()
        else:
            nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

        # Compute node degree histogram and save in (degree, counts) tab file
        degrees = G.degree()
        degree_counter = Counter(degrees.values())
        degreesfile = "degrees.txt"
        fw = open(degreesfile, "w")
        for degree, count in sorted(degree_counter.items()):
            print("{0}\t{1}".format(degree, count), file=fw)
        fw.close()
        logging.debug("Node degree distribution saved to `{0}`".\
                        format(degreesfile))

        # Save high-degree (top 0.1%) nodes in a (node, degree) tab file
        percentile = sorted(degrees.values(),
                            reverse=True)[len(degrees) // 1000]
        logging.debug("Top 0.1% has degree of at least {0}".format(percentile))
        hubs = [(k, v) for k, v in degrees.items() if v >= percentile]
        hubs.sort(key=lambda x: x[1], reverse=True)  # degrees descending
        hubsfile = "hubs.txt"
        fw = open(hubsfile, "w")
        for node, degree in hubs:
            print("{0}\t{1}".format(node, degree), file=fw)
        fw.close()
        logging.debug("Hubs saved to `{0}`".format(hubsfile))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    if directed:
        G = load(open(bestgraph))
    else:
        G = nx.read_gpickle(bestgraph)
        graph_stats(G)
    return G
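
The mutual-best test above just asks whether two oriented read ends name each other; in isolation (toy read ids and orientations invented):

best_store = {"1-5'": "2-3'", "2-3'": "1-5'", "3-5'": "1-3'"}
mutuals = [(k, v) for k, v in best_store.items()
           if best_store.get(v) == k and k < v]
print(mutuals)  # [("1-5'", "2-3'")] -- reads 1 and 2 are mutual best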
Example #14
def prune(args):
    """
    %prog prune best.edges

    Prune overlap graph.
    """
    from collections import defaultdict

    p = OptionParser(prune.__doc__)
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr)
    reads_to_ctgs = parse_ctgs(bestedges, opts.frgctg)
    edges = defaultdict(int)
    r = defaultdict(int)
    for a, b, d in G.edges_iter(data=True):
        ua, ub = reads_to_ctgs.get(a), reads_to_ctgs.get(b)
        nn = (ua, ub).count(None)
        if nn == 0:
            if ua == ub:
                r["Same tigs"] += 1
            else:
                r["Diff tigs"] += 1
                if ua > ub:
                    ua, ub = ub, ua
                edges[(ua, ub)] += 1
        elif nn == 1:
            r["One null"] += 1
        else:
            assert nn == 2
            r["Two nulls"] += 1

    U = nx.Graph()
    difftigs = "diff_tigs.txt"
    neighbors = defaultdict(list)
    fw = open(difftigs, "w")
    for (ua, ub), count in edges.items():
        print("\t".join((ua, ub, str(count))), file=fw)
        U.add_edge(ua, ub, weight=count)
        neighbors[ua].append((ub, count))
        neighbors[ub].append((ua, count))
    fw.close()

    print("[Unitig edge property]", file=sys.stderr)
    for k, v in r.items():
        print(": ".join((k, str(v))), file=sys.stderr)
    print("Total: {0}".format(sum(r.values())), file=sys.stderr)

    print("[Unitig degree distribution]", file=sys.stderr)
    degrees = U.degree()
    degree_counter = Counter(degrees.values())
    for degree, count in sorted(degree_counter.items()):
        print("{0}\t{1}".format(degree, count), file=sys.stderr)

    # To find associative contigs, look for a contig that is connected to
    # exactly one other contig - and do that recursively until no more
    # contigs can be found
    associative = {}
    for ua, ubs in neighbors.items():
        if len(ubs) == 1:  # Only one neighbor
            ub, count = ubs[0]
            if count >= 2:  # Bubble
                associative[ua] = (ub, count)
    print("A total of {0} associative contigs found"\
                        .format(len(associative)), file=sys.stderr)

    # Keep only one for mutual associative
    # Values are (ub, count) tuples; unpack before the membership test, and
    # snapshot items() since we delete from the dict while iterating
    for ua, (ub, count) in list(associative.items()):
        if ub in associative and ua < ub:
            print(ua, "mutually associative with", ub, file=sys.stderr)
            del associative[ub]
    print("A total of {0} associative contigs retained"\
                        .format(len(associative)), file=sys.stderr)

    assids = "associative.ids"
    fw = open(assids, "w")
    for ua, (ub, count) in sorted(associative.items(),
                                  key=lambda x: (x[1], x[0])):
        print("\t".join((ua, ub, str(count))), file=fw)
    fw.close()
    logging.debug("Associative contigs written to `{0}`".format(assids))
Example #15
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from jcvi.utils.progressbar import ProgressBar, Percentage, Bar, ETA

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int",
                 help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int",
                 help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    p = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()
    widgets = ['Parse reads: ', Percentage(), ' ',
               Bar(marker='>', left='[', right=']'), ' ', ETA()]
    pr = ProgressBar(maxval=nreads, term_width=60, widgets=widgets).start()

    k = 0
    fw_L = open("L.fasta", "w")
    fw_R = open("R.fasta", "w")
    fastq = fastqfile.endswith(".fastq")
    it = iter_fastq(fastqfile) if fastq else \
           SeqIO.parse(must_open(fastqfile), "fasta")
    for rec in it:
        k += 1
        if k % 1000 == 0:
            pr.update(k)
        if k > nreads:
            break
        if rec is None:
            break
        s = str(rec.seq)
        for i, a in enumerate(s[:N]):
            if a in p:
                a = p[a]
                L[a][i] += 1
        for j, a in enumerate(s[-N:][::-1]):
            if a in p:
                a = p[a]
                R[a][N - 1 - j] += 1
        l4, r3 = s[:4], s[-3:]
        L4[l4] += 1
        R3[r3] += 1
        print(">{0}\n{1}".format(k, s[:N]), file=fw_L)
        print(">{0}\n{1}".format(k, s[-N:]), file=fw_R)

    fw_L.close()
    fw_R.close()

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    sh(cmd.format('L', "First_10_bases"))
    sh(cmd.format('R', "Last_10_bases"))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=',', fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=',', fmt="%d")

    fw = open("L4.common", "w")
    for p, c in L4.most_common(N):
        print("\t".join((p, str(c))), file=fw)
    fw.close()

    fw = open("R3.common", "w")
    for p, c in R3.most_common(N):
        print("\t".join((p, str(c))), file=fw)
    fw.close()
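
The L matrix above is a 4 x N positional tally; a stripped-down version of the left-end bookkeeping (toy reads, N invented):

import numpy as np

pat = "ATCG"
idx = dict((a, i) for (i, a) in enumerate(pat))
N = 4
L = np.zeros((4, N), dtype="int32")
for s in ["ATCGGG", "ATTTCC", "AAAA"]:
    for i, a in enumerate(s[:N]):
        if a in idx:
            L[idx[a]][i] += 1
print(L[idx["A"]])  # [3 1 1 1]: counts of 'A' at each of the first N positions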