Example #1
def main(args):
    # create a dataframe with all miRNA counts from all samples
    allcounts = {}
    for f in args.counts:
        fname = op.basename(f).split(args.fullext)[0]
        casecounts = {}
        for line in reader(f, header="chrom start stop name score strand count".split()):
            casecounts[line['name']] = int(line['count'])
        allcounts[fname] = casecounts
    countsdf = pd.DataFrame(allcounts)

    # create a set of unique miRNAs from all the miRNA lists
    uniquemirnas = []
    for f in args.mirnalist:
        for line in reader(f, header=['name']):
            uniquemirnas.append(line['name'])
    # use a sorted list so the row order is stable and valid for indexing
    uniquemirnas = sorted(set(uniquemirnas))
    
    # log the counts
    # countsdf = np.log(countsdf + 1)
    
    manojset = "MP1 MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    manojset = "MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    # manojset = "MP2 MP9 MP20 MP21 MP34 MP35 MP36 MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA".split()
    peterset1 = "PK11 PK21 PK24 PK31 PK41 PK42 PK51 PK52 PK54".split()
    peterset2 = "PK11 PK12 PK21 PK22 PK31 PK32 PK41 PK51 PK52 PK53".split()
    
    # write the count matrices (.loc replaces the removed pandas .ix indexer)
    countsdf.loc[uniquemirnas, manojset].to_csv(args.out, sep=",", header=True)
    countsdf.loc[uniquemirnas, peterset1].to_csv("peter1_top50.csv", sep=",", header=True)
    countsdf.loc[uniquemirnas, peterset2].to_csv("peter2_top50.csv", sep=",", header=True)
Example #2
def local_shuffle(bed, loc='500000'):
    """
    Randomize the location of each interval in `bed`, either by moving its
    start to within `loc` bp of its current location or, if `loc` is a BED
    file, to a random position within its containing interval.

    Arguments:
        bed - input bed file
        loc - shuffle intervals to within this distance (+ or -).
               If not an integer, then this should be a BED file containing
               regions such that each interval in `bed` is shuffled within
               its containing interval in `loc`
    """
    from random import randint
    if str(loc).isdigit():
        dist = abs(int(loc))
        with nopen(bed) as fh:
            for toks in (l.rstrip('\r\n').split('\t') for l in fh):
                d = randint(-dist, dist)
                toks[1:3] = [str(max(0, int(bloc) + d)) for bloc in toks[1:3]]
                print "\t".join(toks)
    else:
        # we are using dist as the windows within which to shuffle
        assert os.path.exists(loc)
        bed4 = mktemp()
        with open(bed4, 'w') as fh:
            # this step is so we don't have to track the number of columns in A
            for toks in reader(bed, header=False):
                fh.write("%s\t%s\n" % ("\t".join(toks[:3]), SEP.join(toks)))

        missing = 0
        # we first find the b-interval that contains each a-interval by
        # using bedtools intersect
        for toks in reader("|bedtools intersect -wao -a {bed4} -b {loc}"
                           .format(**locals()), header=False):
            ajoin = toks[:4]
            a = ajoin[3].split(SEP)  # extract the full interval
            b = toks[4:]

            if int(b[-1]) == 0:
                missing += 1
                continue
            assert a[0] == b[0], ('chroms dont match', a, b)

            alen = int(a[2]) - int(a[1])
            # note: the shuffled interval is not guaranteed to be completely
            # contained in b
            astart = randint(int(b[1]), int(b[2]))

            # subtract half the time.
            aend = (astart - alen) if randint(0, 1) == 0 and astart > alen \
                else (astart + alen)

            a[1], a[2] = map(str, (astart, aend) if astart < aend
                             else (aend, astart))

            print "\t".join(a)
        if missing > 0:
            print >> sys.stderr, ("found {missing} intervals in {bed} that "
                                  " were not contained in {loc}"
                                  .format(**locals()))
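A minimal usage sketch (hypothetical file names; assumes the module context above, i.e. `nopen`, `reader`, and `SEP` are defined). Shuffled intervals go to stdout:

local_shuffle("peaks.bed", loc=1000)           # jitter each interval +/- 1 kb
local_shuffle("peaks.bed", loc="domains.bed")  # re-place within containing domains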
Example #3
def multi_intersect(files, cutoff):
    """files = {sample_name:file_path}"""
    sitestmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    snames = [op.basename(f).split(".")[0].split("_")[0] for f in files]
    cmd = ("|bedtools multiinter -cluster -header "
                "-names {names} -i {files}").format(names=" ".join(snames),
                                                    files=" ".join(files))
    # apply cutoff, name peaks
    for i, l in enumerate(reader(cmd, header=True)):
        if int(l['num']) < cutoff: continue
        print >>sitestmp, "\t".join([l['chrom'], l['start'], l['end'],
                                        "peak_{i}".format(i=i)])
    sitestmp.close()
    # annotate the merged sites by intersecting with all of the files
    classtmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    annotated_peaks = sitestmp.name
    # pull out peak classes from input files
    for f in files:
        annotated_peaks = map_peak_class(f, annotated_peaks)
    for peak in reader(annotated_peaks, header=AnnotatedPeak):
        if peak.name is None: continue
        # print already appends a newline; the old trailing "\n" made blank lines
        print >>classtmp, "{chrom}\t{start}\t{stop}\t{name}".format(
                                chrom=peak.chrom, start=peak.start,
                                stop=peak.stop, name=peak.name)
    classtmp.close()
    return classtmp.name
Example #4
def main(count_files, metadata):
    pools = defaultdict(list)
    for toks in reader(metadata):
        for k, v in toks.iteritems():
            if k.startswith("Pool") and v == "TRUE":
                # get the samples
                pool_name = k.split("_")[-1]
                pools[pool_name].append(toks['alias'])
    for pool, samples in pools.iteritems():
        print >>sys.stderr, ">> processing", pool
        for strand in ["pos", "neg"]:
            files = [f for f in count_files if os.path.basename(f).split(".")[0] in samples and strand in os.path.basename(f)]
            # simplest way to join files into a dataframe
            raw_count_data = {}
            for file_path in files:
                sample = get_sample_name(file_path)
                raw_count_data[sample] = {}
                for toks in reader(file_path, header=['gene', 'site', 'count']):
                    raw_count_data[sample]["{gene}:{site}".format(gene=toks['gene'], site=toks['site'])] = int(toks['count'])
            # dataframe from dict of dicts
            count_data = pd.DataFrame(raw_count_data)
            # will need to split into multiindex here to match new count fmt
            count_data.index = pd.MultiIndex.from_tuples([x.split(":") for x in count_data.index], names=['gene','site'])
            # normalize the counts
            count_data = norm_deseq(count_data)
            # round the normalized counts up to int
            # don't want to throw out single counts at any site
            count_data = count_data.apply(np.ceil)
            # sum the rows
            count_data[pool] = count_data.sum(axis=1)
            # print results
            out_file = gzip.open("{pool}.{strand}.txt.gz".format(pool=pool, strand=strand), "wb")
            count_data[pool].astype('int').to_csv(out_file, sep="\t")
            out_file.close()
Example #5
def search(args):
    """Given fasta, gff, and bam, parses for sequence, annotates feature, and
    reports coverage.
    Args: bedgraph, fasta, gff, seq, feature, verbose.
    """
    match_seq = args.seq.upper()

    # write a temp bed of sequence match sites
    site_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    with nopen(args.fasta) as fasta:
        for chrom, seq in read_fasta(fasta):            
            if args.verbose: sys.stderr.write(">> processing %s...\n" % chrom)
            # for each sequence match
            for i, m in enumerate([s.start() for s in re.finditer(match_seq, seq)]):
                start = m
                stop = start + 2
                name = "%s_%s_%d" % (chrom, match_seq, i)
                fields = [chrom, start, stop, name]
                site_temp.write("\t".join(map(str, fields)) + "\n")
    site_temp.close()

    # convert gff to bed with gene name as bed name field
    gff_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    result_header = "chrom source feature start stop score strand frame attributes comments".split()
    # for filtering unique and storing start and stop for each gene
    genes = {}
    if args.verbose: sys.stderr.write(">> selecting %s from gff records...\n" % args.feature)
    for g in reader(args.gff, header=result_header):
        try:
            if not g['feature'] == args.feature: continue
            # regex gene name out
            gene_name = re.findall(r'Name=([\w\.]+)', g['attributes'])[0]
            # skip already seen
            if gene_name in genes: continue
            genes[gene_name] = {'start':int(g['start']), 'stop':int(g['stop']), 'strand':g['strand']}
            fields = [g['chrom'], g['start'], g['stop'], gene_name]
            gff_temp.write("\t".join(map(str, fields)) + "\n")
        except KeyError:
            if not g['chrom'].startswith("#"):
                sys.stderr.write("ERROR parsing gff!\n")
                sys.exit(1)
    gff_temp.close()

    # sort the gene bed, map and collapse genes onto site_temp, then add counts
    if args.verbose: sys.stderr.write(">> finding relative gene location per sequence match...\n")
    result_header = "chrom start stop name gene_name counts".split()
    cmd = "|sortBed -i %s | mapBed -a %s -b - -c 4 -o collapse | mapBed -a - -b %s -c 4 -o sum"\
            % (gff_temp.name, site_temp.name, args.bedgraph)
    for b in reader(cmd, header=result_header):
        # sequence position(s) relative to gene(s) it overlaps
        locs = get_locs(int(b['start']), b['gene_name'], genes)
        fields = [b['chrom'], b['start'], b['stop'], b['name'], b['gene_name'], b['counts'], locs]
        print "\t".join(map(str, fields))
Example #6
def uniprot(args):
    """Add Uniprot annotation to gene list. Args: genes, uniprotdb, column"""
    uniprot_db = {}
    uniprot_header = header(args.uniprotdb)
    for entry in reader(args.uniprotdb):
        for gene in entry['Gene names'].split():
            uniprot_db[gene] = entry
    for entry in reader(args.genes, header=False):
        uniprot_fields = []
        for gene in entry[int(args.column) - 1].split(","):
            record = uniprot_db.get(gene)
            if record:
                for h in uniprot_header:
                    uniprot_fields.append(record[h])
        print "\t".join(entry) + "\t" + "\t".join(map(str, uniprot_fields))
Example #7
def merge_beds(excl_list, genome, prefix="ex"):
    if not os.path.exists(genome):
        fgen = mktemp()
        genome = Shuffler.genome(genome, fgen)

    if len(excl_list) == 1:
        excl = excl_list[0]
    else:
        excl = mktemp()
        _run("|cut -f 1-3 %s | sort -k1,1 -k2,2n | bedtools merge -i - > %s" \
                % (" ".join(excl_list), excl))

    bases = []
    for i, f in enumerate((genome, excl)):
        n_bases = 0
        for toks in reader(f, header=False):
            try:
                if i == 0:
                    n_bases += int(toks[1])
                else:
                    n_bases += (int(toks[2]) - int(toks[1]))
            except ValueError:
                pass
        bases.append(n_bases)

    #print >>sys.stderr, "# %scluding %5g out of %5g total bases (%.3g%%) in the genome" % \
    #        (prefix, bases[1] , bases[0], 100. * bases[1] / float(bases[0]))
    return excl
Example #8
def runcontingent(path):
    from entropy import entropy
    import toolshed as ts
    it = ts.reader(path)
    iterable = (Interval(**iv) for iv in it)
    values = defaultdict(list)
    genes = set()
    by_transcript = defaultdict(list)
    by_domain = defaultdict(list)
    for iv in iterable:
        by_domain[iv.domain].append(iv)
        by_transcript[iv.transcript].append(iv)

    for domain, ivs in by_domain.items():
        if len(ivs) < 2: continue
        if sum(iv.mafs.count(',') for iv in ivs) < 3: continue
        if domain == ".": continue
        intervals = ivs[:]
        for iv in ivs:
            intervals.extend(by_transcript[iv.transcript])
        intervals = set(intervals)
        if len(intervals) < 3: continue
        pval, ratio, tbl, gene = contingent(intervals, domain, nodoms_only=False)
        ent = entropy(intervals)
        values['domain'].append(domain)
        values['pval'].append(pval)
        values['ent'].append(ent)
        values['tbl'].append(tbl)
        values['ratio'].append(ratio)
        values['num_intervals'].append(len(intervals))
        values['num_domains'].append(len(ivs))
        genes.update(gene)
        values['genes'].append(",".join(genes))
        genes = set()
    return (values['domain'], values['pval'], values['ent'], values['tbl'],
            values['ratio'], values['num_intervals'], values['num_domains'],
            values['genes'])
Example #9
def read_pvalues(bedfilename, log_pvalues, verbose):
    ''' read in p-values from a bed file score field.

    returns: list sorted by significance (most significant first)'''
    pvals = []

    if verbose:
        print >>sys.stderr, ">> reading p-values from %s .." % bedfilename

    for d in reader(bedfilename, header=['chrom','start','end','name','score','strand']):
        if log_pvalues:
            pval = float(d['score'])
        else:
            pval = -1 * log10(float(d['score']))
        pvals.append(pval)

    if verbose:
        print >>sys.stderr, ">> read %d p-values" % len(pvals)

    # sort the pvalues smallest to largest; for raw p-values this puts the
    # most significant first
    pvals.sort()

    # if pvals are log transformed, biggest (i.e. most significant) are
    # first
    if log_pvalues: pvals.reverse()

    return pvals
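For raw scores the -log10 transform flips the ordering so larger values are more significant; a quick worked check (standard library only):

from math import log10
assert round(-1 * log10(0.001), 6) == 3.0  # p = 0.001 maps to 3.0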
Example #10
def readX(fX, transpose, n=1, nan_value=0):
    """
    n == 1 means to skip first column because it's the ID
    returns ids, probe_names, X
    """
    fhX = reader(fX, header=False)
    X_probes = fhX.next()[1:]

    ids, X = [], []
    #nan = float('nan')
    for toks in fhX:
        ids.append(toks[0])
        try:
            vals = map(float, toks[n:])
        except ValueError:
            vals = [
                float(t) if not t in ("NA", "na", "") else nan_value
                for t in toks[n:]
            ]
        X.append(np.array(vals))
    X = np.array(X)
    if transpose:
        return X_probes, np.array(ids), X
        #return np.array(ids), X_probes, X.T
    else:
        return np.array(ids), X_probes, X.T
Example #11
def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Write a region bed file suitable for use in :func:`~evaluate`,
    given true regions (likely from an external program; otherwise use
    :func:`~write_modeled_regions`).

    Parameters
    ----------

    feature_iter : iterable of Features

    true_regions : file
        BED file containing true regions

    out_fh : filehandle
        where to write the data
    """
    fmt = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(fmt))

    regions = defaultdict(InterLap)

    for i, toks in enumerate(ts.reader(true_regions, header=False)):
        # see if it's a header.
        if i == 0 and not (toks[1] + toks[2]).isdigit(): continue
        chrom, start, end = toks[0], int(toks[1]), int(toks[2])
        regions[chrom].add((start, end))

    for f in feature_iter:
        truth = 'true' if (f.position, f.position) in regions[f.chrom] else 'false'
        out_fh.write(fmt.format(chrom=f.chrom, start=f.position - 1,
                    end=f.position, truth=truth, size=1))
    out_fh.flush()
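A minimal usage sketch, assuming only that features expose `chrom` and `position` (the namedtuple below is a hypothetical stand-in):

import sys
import tempfile
from collections import namedtuple

Feature = namedtuple("Feature", "chrom position")
truth = tempfile.NamedTemporaryFile(suffix=".bed", delete=False)
truth.write("chr1\t100\t200\n")
truth.close()
feats = [Feature("chr1", 150), Feature("chr1", 500)]
write_region_bed(feats, truth.name, sys.stdout)  # 150 -> true, 500 -> false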
Example #12
def main():
    p = argparse.ArgumentParser(__doc__)

    p.add_argument("-g", dest="group", help="group by the first column (usually"
                 " chromosome or probe) if this [optional]", default=False,
                 action="store_true")

    p.add_argument("--skip", dest="skip", help="Maximum number of intervening "
             "basepairs to skip before seeing a value. If this number is "
                 "exceeded, the region is ended chromosome or probe "
                 "[default: %default]", type=int, default=50000)
    p.add_argument("--min-region-size", dest="min-region", help="minimum "
            "length of the region. regions shorter than this are not printed"
                 "[default: %default] (no minimum)", type=int, default=0)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                 " large in order to seed a region. [default: %default]",
                 type=float, default=5.0)
    p.add_argument("--keep-cols", dest="keep", help="comma separated list of"
            "columns to add to the output data", default="")

    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                 "of at least this number can extend a region [default: "
                 "%default]", type=float, default=3.0)
    p.add_argument("regions")

    args = p.parse_args()

    f = reader(args.regions, header=False, sep="\t")
    keep = [int(k) for k in args.keep.strip().split(",") if k]
    report_cutoff = args.seed
    for key, region in gen_regions(f, args.skip, args.seed, args.threshold,
            args.group, keep, report_cutoff):
        print key + "\t" + "\t".join(map(str, region))
Example #13
def main(bam, output):
    sample = path.basename(bam).rsplit(".bam", 1)[0]
    plot_file = output if output else bam.rsplit(".bam", 1)[0] + "_lorenz_curve.png"

    coverages = []
    print("Calculating coverages", file=sys.stderr)
    for toks in reader("|bedtools genomecov -5 -d -ibam %s" % bam, header=['name', 'start', 'coverage']):
        coverages.append(int(toks['coverage']))

    coverages_r = IntVector(coverages)

    print("Generating Lorenz curve", file=sys.stderr)

    # Gini coefficient
    G = ineq.Gini(coverages_r)
    l = "G = %.3f" % G[0]

    grdevices.png(plot_file, width=1200, height=800)
    # draw the plot
    plot(ineq.Lc(coverages_r), xlab="Genome Fraction",
        ylab="Coverage Fraction", bty="n", lwd=1, main="Lorenz Curve of %s" % sample,
        col="black", xaxs="r", yaxs="r")

    # add the Gini coefficient to the plot
    legend('topleft', legend=l, bty='n', cex=1.3)
    grdevices.dev_off()

    print("Gini Coefficient = %f" % G[0])
Example #14
def read_king(king_file):
    pairs = {}
    import toolshed as ts
    for d in ts.reader(king_file):
        pairs[(d['ID1'], d['ID2'])] = float(d['Kinship'])
        pairs[(d['ID2'], d['ID1'])] = float(d['Kinship'])
    return pairs
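A small usage sketch (hypothetical path; KING output is expected to carry ID1, ID2 and Kinship columns, as read above):

pairs = read_king("king.kin0")
k = pairs.get(("NA12878", "NA12891"))  # symmetric: either sample order works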
Example #15
def filter(p_bed, region_bed, max_p=None, p_col_name="P.Value"):
    ph = ['p' + h for h in get_header(p_bed)]
    rh = get_header(region_bed)
    if isinstance(p_col_name, (int, long)):
        p_col_name = ph[p_col_name][1:]

    a = dict(p_bed=p_bed, region_bed=region_bed)
    a['p_bed'] = fix_header(a['p_bed'])

    yield rh + ["t-pos", "t-neg", "t-sum", "n_gt_p05", "n_gt_p1"]
    for group, plist in groupby(reader('|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % a,
            header=rh + ph), itemgetter('chrom','start','end')):
        plist = list(plist)
        plist = [x for x in plist
                 if int(x['start']) <= int(x['pstart']) <= int(x['pend'])
                 and int(x['start']) <= int(x['pend']) <= int(x['end'])]
        tscores = [float(row['pt']) for row in plist if 'pt' in row]

        if max_p:
            if any(float(row['p' + p_col_name]) > max_p for row in plist):
                continue

        ngt05  = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.05)
        ngt1  = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.1)
        tpos = sum(1 for ts in tscores if ts > 0)
        tneg = sum(1 for ts in tscores if ts < 0)
        tsum = sum(ts for ts in tscores)
        frow = [plist[0][h] for h in rh] + [str(tpos), str(tneg), str(tsum), str(ngt05), str(ngt1)]
        yield frow
Example #16
def get_vdj_regions(counter, imgt):
    p = defaultdict(list)
    s = {}
    for l in reader(imgt, header=True):
        if not l['Functionality'] == "productive": continue
        try:
            v = l["V-GENE and allele"].split()[1]
        except IndexError:
            v = "na"
        try:
            j = l["J-GENE and allele"].split()[1]
        except IndexError:
            j = "na"
        try:
            d = l["D-GENE and allele"].split()[1]
        except IndexError:
            d = "na"
        composition = "%s,%s,%s" % (v, d, j)
        protein_seq = l["AA JUNCTION"]
        p[protein_seq].append(composition)
        try:
            if len(l['Sequence']) > len(s[protein_seq]):
                s[protein_seq] = l['Sequence']
        except KeyError:
            s[protein_seq] = l['Sequence']
    return p, s
Example #17
def main(table):
    d = {}
    for toks in reader(table, header=True, sep=" "):
        row_gene = toks['Genes']
        d[row_gene] = {}
        for col_gene in toks.keys():
            # row 1, col 1 is a generic header entry
            if col_gene == "Genes": continue
            d[row_gene][col_gene] = int(toks[col_gene])

    # print node size attributes
    node_out = open("node_attrs.txt", "wb")
    print >>node_out, "source\ttotal_mutations"
    for k in d.keys():
        print >>node_out, "{gene}\t{count}".format(gene=k, count=d[k][k])
    node_out.close()

    # print network and edge attributes
    interaction_type = "pp"
    network_out = open("network.txt", "wb")
    print >>network_out, "source\tinteraction_type\ttarget\tcomutation_count"
    seen = set()
    for row_gene in d.keys():
        for col_gene, count in d[row_gene].iteritems():
            if count == 0: continue
            # double checking these were filtered out
            if row_gene == col_gene: continue
            # check to see if the interaction was already added in the opposite direction
            if "{gene2}_{gene1}".format(gene2=col_gene, gene1=row_gene) in seen: continue
            print >>network_out, "{gene1}\t{interaction}\t{gene2}\t{count}".format(gene1=row_gene, interaction=interaction_type, gene2=col_gene, count=count)
            seen.add("{gene1}_{gene2}".format(gene1=row_gene, gene2=col_gene))
    network_out.close()
Example #18
def readccrs(path, gerp, phast, cadd):
    for i, d in enumerate(ts.reader(path, header="ordered")):
        d['gerp'] = ",".join(
            map(
                str,
                gerp.values("chr" + d['chrom'], int(d['start']),
                            int(d['end']))))
        d['phast'] = ",".join(
            map(
                str,
                phast.values("chr" + d['chrom'], int(d['start']),
                             int(d['end']))))
        region = d['chrom'] + ":" + d['start'] + "-" + d['end']
        var = None
        vals = []
        caddvals = []
        for toks in (x.rstrip('\r\n').split("\t")
                     for x in ts.nopen("| tabix " + cadd +
                                       " {region}".format(region=region))
                     if x[1] != "#"):  #TODO replace w cyvcf2
            if var is None or var == toks[1]:
                vals.append(float(toks[5]))
            else:
                caddvals.append(np.mean(vals))
                vals = [float(toks[5])]  # start the new variant with its own value
            var = toks[1]
        if vals:  # flush the final variant's mean
            caddvals.append(np.mean(vals))
        d['cadd'] = ",".join(map(str, caddvals))
        if i == 0:
            print "\t".join(d.keys())
        print "\t".join(map(str, d.values()))
Example #19
def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name score strand abundance".split()):
            try:
                if int(l['abundance']) == 0: continue
                # gm[<parsed file name>][<miRNA name>] = abundance value
                gm[op.basename(f).split(".mirna_abundance", 1)[0]][l['name']] = l['abundance']
            except KeyError:
                # header failed to set l['abundance']
                pass
    # the sample names
    caselist = sorted(gm.keys())
    # collect the union of miRNA names seen (with nonzero abundance) in any sample
    completeset = []
    for case in caselist:
        completeset.extend(gm[case].keys())
    mirnas = set(completeset)
    
    # print the matrix
    print "\t".join(k for k in caselist)
    for mirna in mirnas:
        fields = [mirna]
        for c in caselist:
            try:
                fields.append(gm[c][mirna])
            except KeyError:
                # miRNA not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))
Example #20
def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name counts nonzero blength nonzerofracofb".split()):
            try:
                # gm[<parsed file name>][<peak name>] = count value
                fullname = "%s:%s:%s:%s" % (l["name"], l["chrom"], l["start"], l["stop"])
                gm[f.split(".", 1)[0]][fullname] = l["counts"]
            except KeyError:
            # header failed to set l['counts']
                pass

    # print the matrix
    caselist = sorted(gm.keys())
    # this step is unnecessary as they all have counts for the same peaks
    completeset = []
    for case in caselist:
        completeset.extend(gm[case].keys())

    peaks = set(completeset)
    print "#peak_name\t" + "\t".join(k for k in caselist)
    for peak in peaks:
        fields = [peak]
        for c in caselist:
            try:
                fields.append(gm[c][peak])
            except KeyError:
                # peak not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))
Example #21
def get_unique_protein_seqs(imgt):
    c = Counter()
    for l in reader(imgt, header=True):
        if not l['Functionality'] == "productive": continue
        if len(l['AA JUNCTION']) < 2: continue
        c.update([l['AA JUNCTION']])
    return c
Example #22
def main(shifts, sites):
    refsites = sites_to_dict(sites)
    try:
        cols = reader(shifts, header=False).next()
    except StopIteration:
        print >>sys.stderr, ">> empty file:", shifts
        sys.exit(1)
    comparisons = cols[2:]
    shifts_d = shifts_to_dict(comparisons, shifts)
    for comparison, all_sites in shifts_d.iteritems():
        lines = []
        for (site, shift) in all_sites.iteritems():
            a, b = site.split(",")
            a = refsites[a]
            b = refsites[b]
            lines.append(bed12line(a.chrom, a.start, b.stop, a.strand, shift))
        lines = sorted(lines, key=operator.itemgetter(0, 1))
        if len(lines) == 0:
            print >>sys.stderr, ">> nothing found in", comparison
            continue
        result = "{comparison}.dexseq.bed".format(**locals())
        print >>sys.stderr, ">> writing", result
        f = open(result, 'wb')
        for line in lines:
            print >>f, "\t".join(map(str, line))
        f.close()
Example #23
def plot(f, axs, shared):

    diffs = []

    xs, ys = [], []
    for d in ts.reader(f, sep=","):
        if not (d['sample_a'], d['sample_b']) in shared: continue

        x = float(d['rel'])
        y = float(d['pedigree_relatedness'])
        #if abs(x - y) > 0.25: continue
        diffs.append(x - y)
        xs.append(x)
        ys.append(y)
    """
    ax.scatter(xs, ys)
    ax.set_xlabel('relatedness by genotype')
    ax.set_ylabel('relatedness by ped file')
    ax.set_title(f)
    """
    p5, p95 = np.percentile(diffs, [2.5, 97.5])
    m, std = np.mean(diffs), np.std(diffs)

    ax2 = axs
    ax2.set_title(convert(f))
    ax2.hist(diffs, 40)
    ax2.text(0.6,
             0.8,
             "95%% range: %.3f - %.3f\nmean: %.3f std: %.3f" %
             (p5, p95, m, std),
             transform=ax2.transAxes)
    ax2.set_xlabel("genotype - expected")
    ax2.set_ylabel("count")
Example #25
def make_tree(path):
    tree = defaultdict(IntervalTree)
    prev_chrom, prev_pos = None, 0
    prev_hap = None
    added = defaultdict(int)
    for i, line in enumerate(ts.reader(path, sep=',')):
        chrom = line['chromosome']
        pos = line['position(B38)']
        hap_probs = list(line.items())[3:]
        hap_probs_np = np.array([float(v) for k, v in hap_probs])
        # index directly rather than via a list-comp variable, which in
        # python2 would leak and clobber the enumerate counter `i` used below
        hap, score = hap_probs[int(np.argmax(hap_probs_np))]
        if float(score) < 0.8: continue
        if i == 0:
            prev_hap = hap
            prev_chrom = chrom
            prev_pos = pos
        if chrom == prev_chrom and hap != prev_hap:
            tree[prev_chrom].add(int(prev_pos), int(pos), other=prev_hap)
#            print ('added {}:{}-{}'.format(prev_chrom, prev_pos, pos))
            prev_pos = pos
            prev_chrom = chrom
            prev_hap = hap
            added[chrom] = int(pos)
        elif chrom != prev_chrom:
            tree[prev_chrom].add(int(prev_pos), added[prev_chrom], other=prev_hap)
#            print ('added {}:{}-{}'.format(prev_chrom, prev_pos, added[prev_chrom]))
            prev_pos = 0
            prev_chrom = chrom
            prev_hap = hap
        else: 
            added[chrom] = int(pos)
            continue
    return tree
Example #26
def read_values(
    path='/uufs/chpc.utah.edu/common/home/u1021864/analysis/scoredregions.bed'
):
    var = defaultdict(defaultdict)
    ccrs = defaultdict(list)
    genes = defaultdict(list)
    for i, region in enumerate(ts.reader(path, header="ordered")):
        ccrs['gerp'].append(np.mean(map(float, region['GERP'].split(","))))
        ccrs['phast'].append(
            np.mean(map(float, region['phastCons'].split(","))))
        ccrs['cadd'].append(np.mean(map(float, region['CADD'].split(","))))
        length = sum(
            int(r.split("-")[1]) - int(r.split("-")[0])
            for r in region['ranges'].split(',')
        )
        ccrs['pct'].append(float(region['weighted_pct']))
        ccrs['gene'].append(region['gene'])
        ccrs['chrom'].append(region['chrom'])
        ccrs['ranges'].append(region['ranges'])
        ccrs['length'].append(length)
        if genes[region['gene']]:
            genes[region['gene']][0] += 1
            genes[region['gene']][1] += length
        else:
            genes[region['gene']] = [1, length]
    var['ccrs'] = ccrs
    var['genes'] = genes
    return var
Example #28
def main():
    args = get_args()
    
    if args.verbose:
        sys.stderr.write(">> building gene orthology cross-reference...\n")
    xref = get_xref(args.xref)
    
    if args.verbose:
        sys.stderr.write(">> building uniprot library...\n")
    uniprot = parse_uniprot_flat(args.uniprot)
    
    if args.verbose:
        sys.stderr.write(">> annotating matrisome...\n")
    
    header = nopen(args.matrisome).readline().rstrip("\r\n").split("\t")
    headerext = ['r_ENSRNOP', 'r_score', 'r_geneid', 'r_gene_description', \
                    'r_uniprot', 'r_interpro', 'r_refseqn', 'r_refseqp', \
                    'r_ensg', 'r_enst', 'r_ensp']
    header.extend(headerext)
    print "\t".join(h for h in header)
    
    for entry in reader(args.matrisome):
        
        # reset vars
        for h in headerext:
            entry[h] = ""
        
        # handle multiple entries delimited by ":"
        for entryname in entry[args.xref_col].split(":"):
            
            # looping over entire defaultdict each time
            for uid, ddict in xref.iteritems():
                
                # find a matching ortholog
                for orthoname in ddict['orthonames']:
                    if orthoname == entryname:
                        
                        # use the uid to get the rat names and scores
                        for ratname, ratscore in izip(xref[uid]['ratnames'], xref[uid]['ratscores']):
                            # print ratname
                            entry['r_ENSRNOP'] += "%s:" % ratname
                            entry['r_score'] += "%s:" % ratscore
                            
                            # for each rat ENSP, add the corresponding annotation(s)
                            for uniqueid, uniprot_entry in uniprot.iteritems():
                                for ensemblname in uniprot_entry['ensemblp']:
                                    if ensemblname == ratname:
                                        #print all of the info for this uid
                                        entry['r_geneid'] += ':'.join(t for t in uniprot[uniqueid]['geneid']) + ":"
                                        entry['r_gene_description'] += ':'.join(t for t in uniprot[uniqueid]['description']) + ":"
                                        entry['r_uniprot'] += ':'.join(t for t in uniprot[uniqueid]['uniprotid']) + ":"
                                        entry['r_interpro'] += ':'.join(t for t in uniprot[uniqueid]['interpro']) + ":"
                                        entry['r_refseqn'] += ':'.join(t for t in uniprot[uniqueid]['refseqn']) + ":"
                                        entry['r_refseqp'] += ':'.join(t for t in uniprot[uniqueid]['refseqp']) + ":"
                                        entry['r_ensg'] += ':'.join(t for t in uniprot[uniqueid]['ensemblg']) + ":"
                                        entry['r_enst'] += ':'.join(t for t in uniprot[uniqueid]['ensemblt']) + ":"
                                        entry['r_ensp'] += ':'.join(t for t in uniprot[uniqueid]['ensemblp']) + ":"
                                        
        print "\t".join(entry[h].rstrip(":") for h in header)
Example #29
def parse_txt(txt):
    """returns dictionary of miRNAs present in the network."""
    observed_mirs = set()
    for t in txt:
        for toks in reader(t, header=['name', 'chrom', 'start', 'stop', 'gene']):
            mir_name = toks['name'].split("|")[0]
            observed_mirs.add(mir_name)
    return observed_mirs
Example #30
def lamina():
    if not op.exists('lamina.bed'):
        fh = open('lamina.bed', 'w')
        fh.write("#chrom\tstart\tend\tvalue\n")
        for gff in reader('http://www.nature.com/nature/journal/v453/n7197/extref/nature06947-s2.txt', header=False):
            fh.write("\t".join([gff[0], gff[3], gff[4], gff[5]]) + "\n")
        fh.close()
    return 'lamina.bed'
Example #31
def shifts_to_dict(cols, fname):
    d = OrderedDict()
    for c in cols:
        d[c] = {}
        for l in reader(fname):
            if not l[c] == "proximal" and not l[c] == "distal": continue
            d[c][l['Sites']] = l[c]
    return d
Example #32
def bam2bedgraph(args):
    """Convert bam to bedgraph. Args: bedgraph, bam, strand"""
    cmd = "|bedtools genomecov -bg -5 -ibam %s" % (args.bam)
    if args.strand:
        cmd = "|bedtools genomecov -bg -5 -strand %s -ibam %s" % (args.strand, args.bam)
    result_header = "chrom start stop counts".split()
    for b in reader(cmd, header=result_header):
        print "\t".join(b[r] for r in result_header)
Example #33
def rad_format(fmethylated, fcounts, fout):
    if isinstance(fout, basestring):
        fout = ts.nopen(fout, "w")
    for i, (m, c) in enumerate(it.izip(ts.reader(fmethylated, header=False),
                                       ts.reader(fcounts, header=False))):
        if i == 0:
            fout.write("\t" + "\t".join(m[1:]) + "\n")
        else:
            assert m[0] == c[0]
            methyls = m[1:]
            counts = c[1:]
            pairs = "\t".join("%s %s" % (ci, mi) for mi, ci in zip(methyls, counts))
            chrom, pos = c[0].split(":")
            pos = int(pos)
            site = "%s:%i:%i" % (chrom, pos, pos + 1)
            fout.write("%s\t%s\n" % (site, pairs))
    return fout.name
Example #34
def example():
    import toolshed as ts
    from collections import namedtuple

    it = ts.reader('/uufs/chpc.utah.edu/common/home/u6000294/lustre/u6000294/pmodel/y.sort.bed.gz')
    iterable = (Interval(**iv) for iv in it)
    for gene, val in slider(iterable, size_grouper(1), FRV_inline, maf_cutoff=0.005):
        print "%s\t%.3f\t%.3f" % (gene[0].autoregs, val, IAFI_inline(gene, 65000))
Example #35
def partsort(afile, group_cols, sort_cols, sort_convertors, header=False):
    """
    the converted columns are appended to the end of the row.
    then after the sort, these are removed.
    this removes problems with floating point reprs.
    """
    the_first_line = get_header(afile)
    row_len = len(the_first_line)

    # maintain order of the sort cols, but use the appended columns for the
    # numeric ones.
    actual_sort_cols = []
    n_extra = 0

    # since we append floats to the end *and* want to maintain the
    # requested sort order, we create the `actual_sort_cols`
    for c in sort_cols:
        if not c in sort_convertors:
            actual_sort_cols.append(c)
        else:
            idx = row_len + n_extra
            actual_sort_cols.append(idx)
            n_extra += 1

    # if it was stdin, then we read one line to get the header length.
    lines = reader(afile, header=header) if afile != "-" \
            else chain([the_first_line], reader(afile, header))
    # groupby the correct columns
    for keyed, group in groupby(lines,
                                lambda toks: [toks[i] for i in group_cols]):

        # then generate the rows with the converted columns appended.
        def gen_converted_group():
            for toks in group:
                # add the converted columns onto the end.
                yield toks + [
                    fn(toks[col_idx])
                    for col_idx, fn in sort_convertors.items()
                ]

        # then iterator over the sorted cols.
        for toks in sorted(gen_converted_group(),
                           key=itemgetter(*actual_sort_cols)):
            # strip the extra columns.
            yield toks[:row_len]
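A usage sketch with hypothetical inputs: group tab-delimited rows on column 0 and sort each group numerically on column 1.

for toks in partsort("data.txt", group_cols=[0], sort_cols=[1],
                     sort_convertors={1: float}, header=False):
    print "\t".join(toks)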
Example #36
def write_result(fanno, written=[False]):
    # note: the mutable default argument deliberately persists across calls,
    # so the header is printed only once; `out` comes from the enclosing scope
    for i, d in enumerate(reader(fanno, header="ordered")):
        if i == 0 and written[0] == False:
            print >> out, "\t".join(d.keys())
            written[0] = True
        print >> out, "\t".join(d.values())
    os.unlink(fanno)
    os.unlink(fanno.replace(".anno", ""))
Example #37
def main(dexseq, pval, pval_cutoff):
    dex_runs = OrderedDict()
    for fname in dexseq:
        cols = reader(fname, header=False).next()[1:]
        try:
            a, b = sample_names(cols, "log2fold")
            strand = gstrand(fname)
        except StrandNotFound:
            print >>sys.stderr, ">> strand (pos, neg) must be in file names."
            sys.exit(1)
        except UnboundLocalError:
            print >>sys.stderr, ">> failed to get sample names for", fname
            print >>sys.stderr, ">> skipping..."
            continue
        log2fold = cols[-1]
        assert a != b
        run_id = "{a}_to_{b}.{strand}".format(**locals())
        dex_runs[run_id] = {}
        for group in grouper(reader(fname, header=True), "geneID"):
            results = OrderedDict()
            for site in group:
                try:
                    # p-value threshold filtering
                    if float(site[pval]) > pval_cutoff: continue
                except ValueError:
                    continue
                # fold change should be recorded from dexseq
                assert site[log2fold] != "NA"
                site_id = int(site['exonID'].rsplit(".")[-1])
                results[site_id] = {'fc':float(site[log2fold]), 'name':site['exonID'].lstrip('E')}
            if len(results) < 2: continue
            # iterating over the pairs involved in switching event
            for (aid, ad), (bid, bd) in pairs(results):
                # the direction of change
                direction = shift(aid, ad['fc'], bid, bd['fc'])
                comp = "{aname},{bname}".format(aname=ad['name'], bname=bd['name'])
                # complex name to ease creating multiindex dataframe
                dex_runs[run_id]["{gene}:{comp}".format(gene=site['geneID'], comp=comp)] = direction
    try:
        df = pd.DataFrame(dex_runs)
        # create multiindex via split
        df.index = pd.MultiIndex.from_tuples([x.split(":") for x in df.index], names=['Gene','Sites'])
        df.to_csv(sys.stdout, sep="\t", na_rep="na")
    except Exception:
        # empty dataframe
        print >>sys.stderr, "No significant sites were found."
Example #38
def read_regions(fregions):
    tree = defaultdict(InterLap)
    for i, toks in enumerate(ts.reader(fregions, header=False)):
        if i == 0 and not (toks[1] + toks[2]).isdigit(): continue
        tree[toks[0]].add((int(toks[1]), int(toks[2]), toks))
    sys.stderr.write("# read %i regions from %s\n" \
            % (sum(len(v) for v in tree.values()), fregions))
    return tree
Example #39
def cross_ref(kgxref, table_id, table_symbol):
    """Returns dictionary of knownGene cross-reference table by the table
    identifier, ie. refseq.
    """
    xref = {}
    for x in reader(kgxref):
        xref[x[table_id]] = x[table_symbol]
    return xref
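For example, against UCSC's kgXref table ("kgID" and "geneSymbol" are standard kgXref column names; the path is hypothetical):

symbol_by_kgid = cross_ref("kgXref.txt", "kgID", "geneSymbol")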
Example #40
def write_result(fanno, written=[False]):
    for i, d in enumerate(reader(fanno, header="ordered")):
        if i == 0 and written[0] == False:
            print >>out, "\t".join(d.keys())
            written[0] = True
        print >>out, "\t".join(x if x else "NA" for x in d.values())
    os.unlink(fanno)
    os.unlink(fanno.replace(".anno", ""))
Example #41
def write_result(fanno, written=[False]):
    # python 3 rendition: the mechanical `... >> out` conversion was invalid;
    # print takes the handle via file=out
    for i, d in enumerate(reader(fanno, header="ordered")):
        if i == 0 and written[0] == False:
            print("\t".join(list(d.keys())), file=out)
            written[0] = True
        print("\t".join(x if x else "NA"
                        for x in list(d.values())), file=out)
    os.unlink(fanno)
    os.unlink(fanno.replace(".anno", ""))
Example #44
def bediter(fname, col_num):
    for i, l in enumerate(reader(fname, header=False)):
        if l[0][0] == "#": continue
        try:
            yield  {"chrom": l[0], "start": int(l[1]), "end": int(l[2]),
                "p": float(l[col_num])} # "stuff": l[3:][:]}
        except:
            print >>sys.stderr, l
            if i != 0:
                raise
Example #45
def main(regions,
         bams,
         reads=None,
         flags="-F%i" % (0x100 | 0x4 | 0x200 | 0x400),
         pad=100):
    r2 = open(tempfile.mktemp(), 'w')
    for toks in reader(regions, header=False):
        if toks[0][0] == "@" or not (toks[1] + toks[2]).isdigit(): continue
        toks[1] = str(max(0, int(toks[1]) - pad))
        toks[2] = str(int(toks[2]) + pad)
        print >> r2, "\t".join(toks)
    r2.flush()
    regions = r2.name
    print reads
    if reads.isdigit():
        reads = int(reads)
    elif reads != "bam":
        reads = int(
            nopen(
                "|bioawk -c fastx 'END { print NR }' %s" % reads).next()) * 2.0

    counts = {}
    colors = cycle('rgbkmy')
    bam_reads = {}

    counts = dict(pmap(count_both, ((bam, regions, flags) for bam in bams)))

    for bam in bams:
        nreads = count_bam(bam, flags) if reads == "bam" else reads
        bam_reads[bam] = nreads
        symbol = 'o' if len(set(counts[bam][0])) < 3 else '.'
        pl.plot(counts[bam][0] / float(nreads),
                counts[bam][1] / float(nreads),
                '%s%s' % (colors.next(), symbol),
                label=name(bam))

    pl.xlabel('off target')
    pl.ylabel('on target')
    pl.legend(loc='lower right')
    pl.xlim(xmin=0)
    pl.ylim(ymin=0)
    pl.show()
    os.unlink(r2.name)

    out = sys.stdout
    print >> out, "qual\tmethod\toff\ton"

    for qual in range(0, 256):
        for b in bams:
            print >> out, "{qual}\t{bam}\t{off}\t{on}".format(
                qual=qual,
                bam=name(b),
                off=counts[b][0][qual] / bam_reads[bam],
                on=counts[b][1][qual] / bam_reads[bam])
    print >> sys.stderr, "wrote", out.name
Example #46
def load_background_file(freq_background_filename):
    ''' load genome nuc frequencies from pre-computed background file '''
    result = defaultdict(dict)

    for row in reader(freq_background_filename):
        region_size = int(row['region.size'])
        nuc = row['nuc']
        freq = float(row['freq'])
        result[region_size][nuc] = freq

    return result
Example #47
    def _set_structure(self, structure):
        """
        here, we want to intersect the query and subject bed files with the
        structure.bed file and give each set of intervals in query and bed
        that fall within (or have any overlap with) a unique, fake chromosome
        so that all shuffling is within that chromosome.
        in order to do this, we also have to create a fake genome file that
        contains the lengths of those chromosomes.
        """
        if structure in (None, ""): return
        self.chrom = True  # has to be by chromosome.

        n_query_before = sum(1 for _ in nopen(self.query))
        n_subject_before = sum(1 for _ in nopen(self.subject))

        new_genome = open(mktemp(suffix='.fake_genome'), 'w')
        structure = "<(cut -f 1-3 %s)" % structure
        seen_segs = {}
        for bed in ('query', 'subject', 'exclude', 'include'):
            bed_path = getattr(self, "_" + bed, getattr(self, bed))
            if not bed_path: continue
            new_fh = open(mktemp(suffix='%s.fake' % bed), 'w')
            for toks in reader("|bedtools intersect -wo -a %s -b '%s' \
                    | sort -k4,4 -k5,5g" % (structure, bed_path),
                               header=False):
                gtoks, btoks = toks[:3], toks[3:-1]  # drop the bp overlap
                new_chrom = "_".join(gtoks)

                gtoks[1:] = map(int, gtoks[1:])
                btoks[1:3] = map(int, btoks[1:3])

                glen = gtoks[2] - gtoks[1]  # fake chrom length.
                if new_chrom.startswith('chr'): new_chrom = new_chrom[3:]
                if not new_chrom in seen_segs:
                    # save it in the genome file.
                    print >> new_genome, "\t".join((new_chrom, str(glen)))
                seen_segs[new_chrom] = True

                # with partial overlap, we'll have a negative start or an
                # end outside the genome... for now, just truncate.

                # adjust the interval to its location the new chrom.
                btoks[0] = new_chrom
                btoks[1] = max(0,
                               btoks[1] - gtoks[1])  # don't let it go below 0
                # chop to end of fake chrom.
                btoks[2] = min(btoks[2] - gtoks[1], glen - 1)
                assert 0 <= btoks[1] <= btoks[2] < glen
                btoks[1:3] = map(str, btoks[1:3])
                print >> new_fh, "\t".join(btoks)
            new_fh.close()
            setattr(self, bed, new_fh.name)
        new_genome.close()
        self.genome_file = new_genome.name
Example #48
def _get_genotypes(vcf, min_qual, min_genotype_qual, min_samples, as_vcf):

    fh = ts.nopen(vcf)
    if as_vcf:
        for header in fh:
            print(header.rstrip("\r\n"))
            if header.startswith("#CHROM"):
                header = header.split("\t")
                header[0] = "CHROM"
                break
        else:
            raise ValueError("no #CHROM header line found in %s" % vcf)
        vcf_iter = ts.reader(chain(["\t".join(header)], fh), header="ordered")
    else:
        vcf_iter = ts.reader(vcf,
                             skip_while=lambda l: l[0] != "#CHROM",
                             header="ordered")

    for i, variant in enumerate(vcf_iter):
        yield _get_genotype(i, variant, min_qual, min_genotype_qual,
                            min_samples)
Example #49
def bediter(fnames, col_num, delta=None):
    """
    iterate over a bed file. turn col_num into a float
    and the start, stop column into an int and yield a dict
    for each row.
    """
    last_chrom = chr(0)
    last_start = -1
    if isinstance(fnames, basestring):
        fnames = [fnames]
    for fname in fnames:
        for i, l in enumerate(ts.reader(fname, header=False)):
            if l[0][0] == "#": continue
            if i == 0:  # allow skipping header
                try:
                    float(l[col_num])
                except ValueError:
                    continue
            chrom = l[0]
            start = int(float(l[1]))
            if chrom == last_chrom:
                assert start >= last_start, ("error at line: %i, %s" %
                                             (i, "\t".join(l)),
                                             "file is not sorted")
            else:
                assert last_chrom < chrom, (
                    "error at line: %i, %s "
                    " with file: %s" % (i, "\t".join(l), fname),
                    "chromosomes must be sorted as characters", last_chrom,
                    "is not < ", chrom)
                last_chrom = chrom

            last_start = start

            p = float(l[col_num])
            if delta is not None:
                if p > 1 - delta:
                    p -= delta  # the stouffer correction doesn't like values == 1
                if p < delta:
                    p = delta  # the stouffer correction doesn't like values == 0

            v = {
                "chrom": l[0],
                "start": start,
                "end": int(float(l[2])),
                "p": p
            }  # "stuff": l[3:][:]}
            if v['end'] - v['start'] > 100000:
                print(
                    "warning! large interval at %s will increase memory use." %
                    v)
            yield v
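A usage sketch, assuming a position-sorted BED with the p-value in the fourth column (0-based col_num=3; the path is hypothetical):

for row in bediter("pvals.sorted.bed", 3):
    print row['chrom'], row['start'], row['p']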
Example #50
def target_size_from_mips(mips, pad=0):
    tmp = open(mktemp(suffix=".bed"), "w")
    for k in mips: # ext/lig_probe_start/stop
        for (chrom, pos), d in mips[k].items():
            posns = [int(d[p]) for p in "ext_probe_start ext_probe_stop lig_probe_start lig_probe_stop".split()]
            tmp.write("%s\t%i\t%i\n" % \
                       (chrom, max(0, min(posns) - pad), max(posns) + pad))
    tmp.close()
    size = 0
    for toks in ts.reader("|tail -n+2 %s | sort -k1,1 -k2,2n | bedtools merge -i stdin" % tmp.name,
            header=False):
        size += int(toks[2]) - int(toks[1])
    return size
Example #51
def feature_gen(fname, row_handler=row_handler, feature_class=ClusterFeature, sep="\t",
        rho_min=0.3, skip_first_row=True, weights=None):
    """

    Parameters
    ----------
    fname : str
        file name containing methylation data

    row_handler: function
        function that takes a list of values for each line in `fname` and
        returns a tuple of chrom, start, end, values. e.g.
        def row_handler(tokens):
            chrom, pos = tokens[0].split(":")
            return (chrom, int(pos) - 1, int(pos), map(float, tokens[1:]))

    feature_class: class
        a class derived from `ClusterFeature` that accepts
        chrom, start, end, values and has those attributes and fulfills
        the requirements of aclust.aclust.

    rho_min: float
        the minimum spearman's r between 2 sets of values for them to be
        considered as correlated

    weights: str, optional
        a file parallel to `fname` giving a weight for each value; parsed
        with the same `row_handler`
    """
    if weights is not None:
        weights = reader(weights, header=False, sep=sep)
    for i, toks in enumerate(reader(fname, header=False, sep=sep)):
        if i == 0 and skip_first_row:
            if weights is not None: next(weights)
            continue
        vals = row_handler(toks)
        if weights is not None:
            chrom, start, end, weight_vals = row_handler(next(weights))
            assert chrom == vals[0]
            assert start == vals[1], (vals[1], start)
        else:
            weight_vals = None
        yield feature_class(*vals, rho_min=rho_min, weights=weight_vals)
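A usage sketch, assuming rows shaped as the docstring's example row_handler expects ("chrom:pos" in the first column, one value per sample after it; the path is hypothetical):

for feature in feature_gen("meth.matrix.txt"):
    print feature.chrom, feature.start, feature.end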
Example #52
def read_mips(mips_file):
    sys.stderr.write("reading %s\n" % mips_file)
    m = {'ext_probe_start':{}, 'lig_probe_start':{},
         'ext_probe_stop':{}, 'lig_probe_stop':{}}
    ss = m.keys()
    for d in ts.reader(mips_file):
        for key in ss:
            d[key] = int(d[key])
        # the positions were already converted to int in the loop above
        m['ext_probe_start'][(d['chr'], d['ext_probe_start'])] = d
        m['lig_probe_start'][(d['chr'], d['lig_probe_start'])] = d
        m['lig_probe_stop'][(d['chr'], d['lig_probe_stop'])] = d
        m['ext_probe_stop'][(d['chr'], d['ext_probe_stop'])] = d
    return m
Example #53
def _split_chroms(fname):
    import tempfile
    t = tempfile.mktemp(dir="/tmp", suffix=".cruzdb")
    chroms = {}
    for d in reader(fname, header="ordered"):
        if not d['chrom'] in chroms:
            chroms[d['chrom']] = open(t + "." + d['chrom'], "w")
            print >> chroms[d['chrom']], "\t".join(d.keys())
        print >> chroms[d['chrom']], "\t".join(d.values())
    for k in chroms:
        chroms[k].close()
        # pair the per-chromosome file's path (not the closed handle) with
        # the .anno path its annotation step is expected to produce
        chroms[k] = (chroms[k].name, chroms[k].name + ".anno")
    return chroms.items()
Example #54
def as_bam(pfile, fa, prefix, calmd=False, set_as_failed=None):
    """
    pfile: either a file or a |process to generate sam output
    fa: the reference fasta
    prefix: the output prefix or directory
    set_as_failed: None, 'f', or 'r'. If 'f'. Reads mapping to that strand
                      are given the sam flag of a failed QC alignment (0x200).
    """
    view = "samtools-0.1.18 view -bS - | samtools-0.1.18 sort -m 5005919104 - "
    if calmd:
        cmds = [
            view + "{bam}.tmp",
            "samtools_old calmd -AbEr {bam}.tmp.bam {fa} > {bam}.bam 2>/dev/null",
            "rm {bam}.tmp.bam"
        ]
    else:
        cmds = [view + "{bam}"]

    cmds.append("samtools-0.1.18 index {bam}.bam")
    cmds = [c.format(bam=prefix, fa=fa) for c in cmds]

    sys.stderr.write("writing to:\n%s\n" % cmds[0])

    p = nopen("|" + cmds[0], 'w')
    out = p.stdin
    # out = sys.stdout # useful for debugging
    bam_iter = reader("%s" % (pfile, ), header=False, quotechar=None)
    out.write('@HD\tVN:1.5\tSO:coordinate\n')
    for toks in bam_iter:
        if not toks[0].startswith("@"): break
        handle_header(toks, out)
    else:
        sys.stderr.flush()
        raise Exception("bad or empty fastqs")
    bam_iter2 = chain([toks], bam_iter)
    for read_name, pair_list in groupby(bam_iter2, itemgetter(0)):
        pair_list = [Bam(toks) for toks in pair_list]

        for aln in handle_reads(pair_list, set_as_failed):
            out.write(str(aln) + '\n')
    stdout, stderr = p.communicate()
    stdout = stdout.replace('\r', '\n')
    stderr = stderr.replace('\r', '\n')
    # p.stdin.flush()
    # p.stdout.flush()
    # p.stdin.close()
    # assert p.wait() == 0
    for cmd in cmds[1:]:
        sys.stderr.write("running: %s\n" % cmd.strip())
        assert check_call(cmd.strip(), shell=True) == 0
Example #55
def roc_out(p_bed, p_col, truth_region_bed, exclude=('-1', 'NA', 'nan')):
    """Create ROC for a bed file of p-values given known truth regions.

    Parameters
    ----------

    p_bed : file

    p_col : int
            column containing the p-value from `p_bed`

    truth_region_bed : file
                       contains the true regions
    """
    p_col -= 1  # 0-based

    regions = defaultdict(list)
    for toks in ts.reader(truth_region_bed, header=False):
        if not (toks[1] + toks[2]).isdigit(): continue
        regions[toks[0]].append((int(toks[1]), int(toks[2])))

    truths = []
    vals = []
    for toks in ts.reader(p_bed, header=False):
        if not (toks[1] + toks[2]).isdigit(): continue
        reg = regions[toks[0]]

        s, e = int(toks[1]), int(toks[2])

        p = toks[p_col]
        if p in exclude: continue
        vals.append(1.0 - float(p))

        truth = any(rs <= s <= re or rs <= e <= re for rs, re in reg)
        truths.append(truth)

    return np.array(truths).astype(int), np.array(vals)
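The returned arrays drop straight into any ROC routine; a sketch with hypothetical file names, using scikit-learn:

truths, vals = roc_out("pvals.bed", p_col=4, truth_region_bed="true.bed")
from sklearn.metrics import roc_auc_score
print roc_auc_score(truths, vals)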
Example #56
def get_bam_lookup(p="data/bam-lookups-from-1kg-site.tsv"):
    l = {}
    for d in ts.reader(p):
        if 'low_coverage' in d['url']: continue
        if 'chr20' in d['url']: continue
        if 'chrom20' in d['url']: continue
        if 'chrom11' in d['url']: continue
        if 'unmapped' in d['url']: continue
        # NOTE: we could also get some samples with cram.
        if not d['url'].endswith('.bam'): continue
        if d['Sample'] in l:
            print "XXX:", d['url']
            print "YYY:", l[d['Sample']]
        l[d['Sample']] = d['url']
    return l
Example #57
def simplify_bed(fbed, has_header):
    """
    create a bed with no header and 6 columns.
    retain strand info.
    """
    line_gen = reader(fbed, header=False)
    header = line_gen.next() if has_header else None
    fh = open(BedTool._tmp(), "w")
    for toks in line_gen:
        new_toks = toks[:3] + [
            "Z_Z".join(toks), ".", toks[5] if len(toks) > 5 else "."
        ]
        fh.write("\t".join(new_toks) + "\n")
    fh.close()
    return BedTool(fh.name), header
Example #58
def shared(fs):
    sets = []
    for f in fs:
        s = set()
        for d in ts.reader(f, sep=","):
            x = float(d['rel'])
            y = float(d['pedigree_relatedness'])
            #if abs(x - y) > 0.2: continue
            s.add((d['sample_a'], d['sample_b']))
        sets.append(s)
    sall = sets[0]
    for i, s in enumerate(sets):
        if i == 0: continue
        sall &= s
    return sall
Example #59
def calc_genome_size(chrom_size_filename, only_chroms, ignore_chroms, verbose):

    genome_size = 0.0

    for row in reader(chrom_size_filename, header=['chrom', 'size']):
        if (only_chroms and row['chrom'] not in only_chroms) or \
           (ignore_chroms and row['chrom'] in ignore_chroms):
            continue

        genome_size += float(row['size'])

    if verbose:
        print >> sys.stderr, ">> genome size: %s" % str(genome_size)

    return genome_size
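A usage sketch (hypothetical file name; a chrom-sizes file is two tab-separated columns, chrom and size, matching the reader header above):

size = calc_genome_size("hg19.chrom.sizes", only_chroms=None,
                        ignore_chroms=set(["chrM"]), verbose=True)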