def main(args):
    # create a dataframe with all miRNA counts from all samples
    allcounts = {}
    for f in args.counts:
        fname = op.basename(f).split(args.fullext)[0]
        casecounts = {}
        for line in reader(f, header="chrom start stop name score strand count".split()):
            casecounts[line['name']] = int(line['count'])
        allcounts[fname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    # create a set of unique miRNAs from all the miRNA lists
    uniquemirnas = []
    for f in args.mirnalist:
        for line in reader(f, header=['name']):
            uniquemirnas.append(line['name'])
    uniquemirnas = set(uniquemirnas)
    # log the counts
    # countsdf = np.log(countsdf + 1)
    # earlier sample sets kept for reference
    # manojset = "MP1 MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    # manojset = "MP2 MP9 MP20 MP21 MP34 MP35 MP36 MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA".split()
    manojset = "MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    peterset1 = "PK11 PK21 PK24 PK31 PK41 PK42 PK51 PK52 PK54".split()
    peterset2 = "PK11 PK12 PK21 PK22 PK31 PK32 PK41 PK51 PK52 PK53".split()
    # print matrix
    countsdf.ix[uniquemirnas, manojset].to_csv(args.out, sep=",", header=True)
    countsdf.ix[uniquemirnas, peterset1].to_csv("peter1_top50.csv", sep=",", header=True)
    countsdf.ix[uniquemirnas, peterset2].to_csv("peter2_top50.csv", sep=",", header=True)
def local_shuffle(bed, loc='500000'):
    """
    Randomize the location of each interval in `bed` by moving its
    start location to within `loc` bp of its current location or to
    its containing interval in `loc`.

    Arguments:
        bed - input bed file
        loc - shuffle intervals to within this distance (+ or -).
              If not an integer, then this should be a BED file containing
              regions such that each interval in `bed` is shuffled within
              its containing interval in `loc`
    """
    from random import randint
    if str(loc).isdigit():
        dist = abs(int(loc))
        with nopen(bed) as fh:
            for toks in (l.rstrip('\r\n').split('\t') for l in fh):
                d = randint(-dist, dist)
                toks[1:3] = [str(max(0, int(bloc) + d)) for bloc in toks[1:3]]
                print "\t".join(toks)
    else:
        # we are using dist as the windows within which to shuffle
        assert os.path.exists(loc)
        bed4 = mktemp()
        with open(bed4, 'w') as fh:
            # this step is so we don't have to track the number of columns in A
            for toks in reader(bed, header=False):
                fh.write("%s\t%s\n" % ("\t".join(toks[:3]), SEP.join(toks)))
        missing = 0
        # we first find the b-interval that contains each a-interval by
        # using bedtools intersect
        for toks in reader("|bedtools intersect -wao -a {bed4} -b {loc}"
                           .format(**locals()), header=False):
            ajoin = toks[:4]
            a = ajoin[3].split(SEP)  # extract the full interval
            b = toks[4:]
            if int(b[-1]) == 0:
                missing += 1
                continue
            assert a[0] == b[0], ('chroms dont match', a, b)
            alen = int(a[2]) - int(a[1])
            # doesn't care if the new interval is completely contained in b
            astart = randint(int(b[1]), int(b[2]))
            # subtract half the time.
            aend = (astart - alen) if randint(0, 1) == 0 and astart > alen \
                    else (astart + alen)
            a[1], a[2] = map(str, (astart, aend) if astart < aend else (aend, astart))
            print "\t".join(a)
        if missing > 0:
            print >> sys.stderr, ("found {missing} intervals in {bed} that "
                                  " were not contained in {loc}"
                                  .format(**locals()))
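# Usage sketch (not part of the original source): "peaks.bed" and "genes.bed"
# are hypothetical inputs. local_shuffle() writes the shuffled intervals to
# stdout, so redirect the output as needed.
def _local_shuffle_example():
    # shift each interval by at most 1 kb in either direction
    local_shuffle("peaks.bed", loc=1000)
    # or confine each shuffled interval to its containing interval in genes.bed
    local_shuffle("peaks.bed", loc="genes.bed")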
def multi_intersect(files, cutoff):
    """files = {sample_name:file_path}"""
    sitestmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    snames = [op.basename(f).split(".")[0].split("_")[0] for f in files]
    cmd = ("|bedtools multiinter -cluster -header "
           "-names {names} -i {files}").format(names=" ".join(snames),
                                               files=" ".join(files))
    # apply cutoff, name peaks
    for i, l in enumerate(reader(cmd, header=True)):
        if int(l['num']) < cutoff: continue
        print >>sitestmp, "\t".join([l['chrom'], l['start'], l['end'],
                                     "peak_{i}".format(i=i)])
    sitestmp.close()
    # annotate the merged sites by intersecting with all of the files
    classtmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    annotated_peaks = sitestmp.name
    # pull out peak classes from input files
    for f in files:
        annotated_peaks = map_peak_class(f, annotated_peaks)
    for peak in reader(annotated_peaks, header=AnnotatedPeak):
        if peak.name is None: continue
        print >>classtmp, "{chrom}\t{start}\t{stop}\t{name}\n".format(
            chrom=peak.chrom, start=peak.start, stop=peak.stop, name=peak.name)
    classtmp.close()
    return classtmp.name
def main(count_files, metadata):
    pools = defaultdict(list)
    for toks in reader(metadata):
        for k, v in toks.iteritems():
            if k.startswith("Pool") and v == "TRUE":
                # get the samples
                pool_name = k.split("_")[-1]
                pools[pool_name].append(toks['alias'])
    for pool, samples in pools.iteritems():
        print >>sys.stderr, ">> processing", pool
        for strand in ["pos", "neg"]:
            files = [f for f in count_files
                     if os.path.basename(f).split(".")[0] in samples
                     and strand in os.path.basename(f)]
            # simplest way to join files into a dataframe
            raw_count_data = {}
            for file_path in files:
                sample = get_sample_name(file_path)
                raw_count_data[sample] = {}
                for toks in reader(file_path, header=['gene', 'site', 'count']):
                    raw_count_data[sample]["{gene}:{site}".format(
                        gene=toks['gene'], site=toks['site'])] = int(toks['count'])
            # dataframe from dict of dicts
            count_data = pd.DataFrame(raw_count_data)
            # will need to split into multiindex here to match new count fmt
            count_data.index = pd.MultiIndex.from_tuples(
                [x.split(":") for x in count_data.index], names=['gene', 'site'])
            # normalize the counts
            count_data = norm_deseq(count_data)
            # round the normalized counts up to int
            # don't want to throw out single counts at any site
            count_data = count_data.apply(np.ceil)
            # sum the rows
            count_data[pool] = count_data.sum(axis=1)
            # print results
            out_file = gzip.open("{pool}.{strand}.txt.gz".format(pool=pool, strand=strand), "wb")
            count_data[pool].astype('int').to_csv(out_file, sep="\t")
            out_file.close()
def search(args):
    """Given fasta, gff, and bam, parses for sequence, annotates feature,
    and reports coverage.

    Args: bedgraph, fasta, gff, seq, feature, verbose.
    """
    match_seq = args.seq.upper()
    # write a temp bed of sequence match sites
    site_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    with nopen(args.fasta) as fasta:
        for chrom, seq in read_fasta(fasta):
            if args.verbose:
                sys.stderr.write(">> processing %s...\n" % chrom)
            # for each sequence match
            for i, m in enumerate([s.start() for s in re.finditer(match_seq, seq)]):
                start = m
                stop = start + 2
                name = "%s_%s_%d" % (chrom, match_seq, i)
                fields = [chrom, start, stop, name]
                site_temp.write("\t".join(map(str, fields)) + "\n")
    site_temp.close()
    # convert gff to bed with gene name as bed name field
    gff_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    result_header = "chrom source feature start stop score strand frame attributes comments".split()
    # for filtering unique and storing start and stop for each gene
    genes = {}
    if args.verbose:
        sys.stderr.write(">> selecting %s from gff records...\n" % args.feature)
    for g in reader(args.gff, header=result_header):
        try:
            if not g['feature'] == args.feature: continue
            # regex gene name out
            gene_name = re.findall(r'Name=([\w\.]+)', g['attributes'])[0]
            # skip already seen
            if genes.has_key(gene_name): continue
            genes[gene_name] = {'start': int(g['start']),
                                'stop': int(g['stop']),
                                'strand': g['strand']}
            fields = [g['chrom'], g['start'], g['stop'], gene_name]
            gff_temp.write("\t".join(map(str, fields)) + "\n")
        except KeyError:
            if not g['chrom'].startswith("#"):
                sys.stderr.write("ERROR parsing gff!\n")
                sys.exit(1)
    gff_temp.close()
    # sort the gene bed, map and collapse genes onto site_temp, then add counts
    if args.verbose:
        sys.stderr.write(">> finding relative gene location per sequence match...\n")
    result_header = "chrom start stop name gene_name counts".split()
    cmd = "|sortBed -i %s | mapBed -a %s -b - -c 4 -o collapse | mapBed -a - -b %s -c 4 -o sum" \
            % (gff_temp.name, site_temp.name, args.bedgraph)
    for b in reader(cmd, header=result_header):
        # sequence position(s) relative to gene(s) it overlaps
        locs = get_locs(int(b['start']), b['gene_name'], genes)
        fields = [b['chrom'], b['start'], b['stop'], b['name'],
                  b['gene_name'], b['counts'], locs]
        print "\t".join(map(str, fields))
def uniprot(args):
    """Add Uniprot annotation to gene list.

    Args: genes, uniprotdb, column"""
    uniprot_db = {}
    uniprot_header = header(args.uniprotdb)
    for entry in reader(args.uniprotdb):
        for gene in entry['Gene names'].split():
            uniprot_db[gene] = entry
    for entry in reader(args.genes, header=False):
        uniprot_fields = []
        for gene in entry[int(args.column) - 1].split(","):
            uniprot = uniprot_db.get(gene)
            if uniprot:
                for h in uniprot_header:
                    uniprot_fields.append(uniprot[h])
        # join the original line and the appended annotation with a tab
        print "\t".join(entry) + "\t" + "\t".join(map(str, uniprot_fields))
def merge_beds(excl_list, genome, prefix="ex"):
    if not os.path.exists(genome):
        fgen = mktemp()
        genome = Shuffler.genome(genome, fgen)
    if len(excl_list) == 1:
        excl = excl_list[0]
    else:
        excl = mktemp()
        _run("|cut -f 1-3 %s | sort -k1,1 -k2,2n | bedtools merge -i - > %s" \
                % (" ".join(excl_list), excl))
    bases = []
    for i, f in enumerate((genome, excl)):
        n_bases = 0
        for toks in reader(f, header=False):
            try:
                if i == 0:
                    n_bases += int(toks[1])
                else:
                    n_bases += (int(toks[2]) - int(toks[1]))
            except ValueError:
                pass
        bases.append(n_bases)
    #print >>sys.stderr, "# %scluding %5g out of %5g total bases (%.3g%%) in the genome" % \
    #        (prefix, bases[1], bases[0], 100. * bases[1] / float(bases[0]))
    return excl
def runcontingent(path):
    from entropy import entropy
    import toolshed as ts
    it = ts.reader(path)
    iterable = (Interval(**iv) for iv in it)
    values = defaultdict(list)
    genes = set()
    by_transcript = defaultdict(list)
    by_domain = defaultdict(list)
    for iv in iterable:
        by_domain[iv.domain].append(iv)
        by_transcript[iv.transcript].append(iv)
    for domain, ivs in by_domain.items():
        if len(ivs) < 2: continue
        if sum(iv.mafs.count(',') for iv in ivs) < 3: continue
        if domain == ".": continue
        intervals = ivs[:]
        for iv in ivs:
            intervals.extend(by_transcript[iv.transcript])
        intervals = set(intervals)
        if len(intervals) < 3: continue
        pval, ratio, tbl, gene = contingent(intervals, domain, nodoms_only=False)
        ent = entropy(intervals)
        values['domain'].append(domain)
        values['pval'].append(pval)
        values['ent'].append(ent)
        values['tbl'].append(tbl)
        values['ratio'].append(ratio)
        values['num_intervals'].append(len(intervals))
        values['num_domains'].append(len(ivs))
        [genes.add(x) for x in gene]
        values['genes'].append(",".join(genes))
        genes = set()
    return (values['domain'], values['pval'], values['ent'], values['tbl'],
            values['ratio'], values['num_intervals'], values['num_domains'],
            values['genes'])
def read_pvalues(bedfilename, log_pvalues, verbose):
    ''' read in p-values from a bed file score field.

    returns: list sorted by significance (most significant first)'''
    pvals = []
    if verbose:
        print >>sys.stderr, ">> reading p-values from %s .." % bedfilename
    for d in reader(bedfilename, header=['chrom', 'start', 'end', 'name', 'score', 'strand']):
        if log_pvalues:
            pval = float(d['score'])
        else:
            # score holds a raw p-value; transform it here
            pval = -1 * log10(float(d['score']))
        pvals.append(pval)
    if verbose:
        print >>sys.stderr, ">> read %d p-values" % len(pvals)
    # sort the pvalues from most to least signif (smallest to largest) and
    # reverse so largest are first
    pvals.sort()
    # if pvals are log transformed, biggest (i.e. most significant) are
    # first
    if log_pvalues:
        pvals.reverse()
    return pvals
def readX(fX, transpose, n=1, nan_value=0):
    """
    n == 1 means to skip first column because it's the ID
    returns ids, probe_names, X
    """
    fhX = reader(fX, header=False)
    X_probes = fhX.next()[1:]
    ids, X = [], []
    #nan = float('nan')
    for toks in fhX:
        ids.append(toks[0])
        try:
            vals = map(float, toks[n:])
        except ValueError:
            vals = [float(t) if not t in ("NA", "na", "") else nan_value
                    for t in toks[n:]]
        X.append(np.array(vals))
    X = np.array(X)
    if transpose:
        return X_probes, np.array(ids), X
        #return np.array(ids), X_probes, X.T
    else:
        return np.array(ids), X_probes, X.T
def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Write a region bed file suitable for use in :func:`~evaluate`.
    given true regions (likely from an external program, otherwise use
    :func:`~write_modeled_regions`).

    Parameters
    ----------
    feature_iter : iterable of Features

    true_regions : file
        BED file containing true regions

    out_fh : filehandle
        where to write the data
    """
    fmt = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(fmt))
    regions = defaultdict(InterLap)
    for i, toks in enumerate(ts.reader(true_regions, header=False)):
        # see if it's a header.
        if i == 0 and not (toks[1] + toks[2]).isdigit(): continue
        chrom, start, end = toks[0], int(toks[1]), int(toks[2])
        regions[chrom].add((start, end))
    for f in feature_iter:
        truth = 'true' if (f.position, f.position) in regions[f.chrom] else 'false'
        out_fh.write(fmt.format(chrom=f.chrom, start=f.position - 1,
                                end=f.position, truth=truth, size=1))
    out_fh.flush()
def main():
    p = argparse.ArgumentParser(__doc__)
    p.add_argument("-g", dest="group", help="group by the first column (usually"
                   " chromosome or probe) if this [optional]",
                   default=False, action="store_true")
    p.add_argument("--skip", dest="skip", help="Maximum number of intervening "
                   "basepairs to skip before seeing a value. If this number is "
                   "exceeded, the region is ended chromosome or probe "
                   "[default: %default]", type=int, default=50000)
    p.add_argument("--min-region-size", dest="min-region", help="minimum "
                   "length of the region. regions shorter than this are not printed"
                   "[default: %default] (no minimum)", type=int, default=0)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                   " large in order to seed a region. [default: %default]",
                   type=float, default=5.0)
    p.add_argument("--keep-cols", dest="keep", help="comma separated list of"
                   "columns to add to the output data", default="")
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                   "of at least this number can extend a region [default: "
                   "%default]", type=float, default=3.0)
    p.add_argument("regions")
    args = p.parse_args()
    f = reader(args.regions, header=False, sep="\t")
    keep = [int(k) for k in args.keep.strip().split(",") if k]
    report_cutoff = args.seed
    for key, region in gen_regions(f, args.skip, args.seed, args.threshold,
                                   args.group, keep, report_cutoff):
        print key + "\t" + "\t".join(map(str, region))
def main(bam, output):
    sample = path.basename(bam).rsplit(".bam", 1)[0]
    plot_file = output if output else bam.rsplit(".bam", 1)[0] + "_lorenz_curve.png"
    coverages = []
    print("Calculating coverages", file=sys.stderr)
    for toks in reader("|bedtools genomecov -5 -d -ibam %s" % bam,
                       header=['name', 'start', 'coverage']):
        coverages.append(int(toks['coverage']))
    coverages_r = IntVector(coverages)
    print("Generating Lorenz curve", file=sys.stderr)
    # Gini coefficient
    G = ineq.Gini(coverages_r)
    l = "G = %.3f" % G[0]
    grdevices.png(plot_file, width=1200, height=800)
    # draw the plot
    plot(ineq.Lc(coverages_r), xlab="Genome Fraction", ylab="Coverage Fraction",
         bty="n", lwd=1, main="Lorenz Curve of %s" % sample, col="black",
         xaxs="r", yaxs="r")
    # add the Gini coefficient to the plot
    legend('topleft', legend=l, bty='n', cex=1.3)
    grdevices.dev_off()
    print("Gini Coefficient = %f" % G[0])
def read_king(king_file):
    pairs = {}
    import toolshed as ts
    for d in ts.reader(king_file):
        pairs[(d['ID1'], d['ID2'])] = float(d['Kinship'])
        pairs[(d['ID2'], d['ID1'])] = float(d['Kinship'])
    return pairs
def filter(p_bed, region_bed, max_p=None, p_col_name="P.Value"):
    ph = ['p' + h for h in get_header(p_bed)]
    rh = get_header(region_bed)
    if isinstance(p_col_name, (int, long)):
        p_col_name = ph[p_col_name][1:]
    a = dict(p_bed=p_bed, region_bed=region_bed)
    a['p_bed'] = fix_header(a['p_bed'])
    yield rh + ["t-pos", "t-neg", "t-sum", "n_gt_p05", "n_gt_p1"]
    for group, plist in groupby(
            reader('|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % a,
                   header=rh + ph),
            itemgetter('chrom', 'start', 'end')):
        plist = list(plist)
        plist = [x for x in plist
                 if (int(x['start']) <= int(x['pstart']) <= int(x['pend']))
                 and ((int(x['start']) <= int(x['pend']) <= int(x['end'])))]
        tscores = [float(row['pt']) for row in plist if 'pt' in row]
        if max_p:
            if any(float(row['p' + p_col_name]) > max_p for row in plist):
                continue
        ngt05 = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.05)
        ngt1 = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.1)
        tpos = sum(1 for ts in tscores if ts > 0)
        tneg = sum(1 for ts in tscores if ts < 0)
        tsum = sum(ts for ts in tscores)
        frow = [plist[0][h] for h in rh] + \
                [str(tpos), str(tneg), str(tsum), str(ngt05), str(ngt1)]
        yield frow
def get_vdj_regions(counter, imgt):
    p = defaultdict(list)
    s = {}
    for l in reader(imgt, header=True):
        if not l['Functionality'] == "productive": continue
        try:
            v = l["V-GENE and allele"].split()[1]
        except IndexError:
            v = "na"
        try:
            j = l["J-GENE and allele"].split()[1]
        except IndexError:
            j = "na"
        try:
            d = l["D-GENE and allele"].split()[1]
        except IndexError:
            d = "na"
        composition = "%s,%s,%s" % (v, d, j)
        protein_seq = l["AA JUNCTION"]
        p[protein_seq].append(composition)
        try:
            if len(l['Sequence']) > len(s[protein_seq]):
                s[protein_seq] = l['Sequence']
        except KeyError:
            s[protein_seq] = l['Sequence']
    return p, s
def main(table):
    d = {}
    for toks in reader(table, header=True, sep=" "):
        row_gene = toks['Genes']
        d[row_gene] = {}
        for col_gene in toks.keys():
            # row 1, col 1 is a generic header entry
            if col_gene == "Genes": continue
            d[row_gene][col_gene] = int(toks[col_gene])
    # print node size attributes
    node_out = open("node_attrs.txt", "wb")
    print >>node_out, "source\ttotal_mutations"
    for k in d.keys():
        print >>node_out, "{gene}\t{count}".format(gene=k, count=d[k][k])
    node_out.close()
    # print network and edge attributes
    interaction_type = "pp"
    network_out = open("network.txt", "wb")
    print >>network_out, "source\tinteraction_type\ttarget\tcomutation_count"
    seen = set()
    for row_gene in d.keys():
        for col_gene, count in d[row_gene].iteritems():
            if count == 0: continue
            # double checking these were filtered out
            if row_gene == col_gene: continue
            # check to see if the interaction was already added in the opposite direction
            if "{gene2}_{gene1}".format(gene2=col_gene, gene1=row_gene) in seen: continue
            print >>network_out, "{gene1}\t{interaction}\t{gene2}\t{count}".format(
                gene1=row_gene, interaction=interaction_type,
                gene2=col_gene, count=count)
            seen.add("{gene1}_{gene2}".format(gene1=row_gene, gene2=col_gene))
    network_out.close()
def readccrs(path, gerp, phast, cadd):
    for i, d in enumerate(ts.reader(path, header="ordered")):
        d['gerp'] = ",".join(map(str, gerp.values("chr" + d['chrom'],
                                                  int(d['start']), int(d['end']))))
        d['phast'] = ",".join(map(str, phast.values("chr" + d['chrom'],
                                                    int(d['start']), int(d['end']))))
        region = d['chrom'] + ":" + d['start'] + "-" + d['end']
        var = None
        vals = []
        caddvals = []
        for toks in (x.rstrip('\r\n').split("\t")
                     for x in ts.nopen("| tabix " + cadd + " {region}".format(region=region))
                     if x[1] != "#"):  # TODO replace w cyvcf2
            if var == None or var == toks[1]:
                vals.append(float(toks[5]))
            elif var != toks[1] and var != None:
                caddvals.append(np.mean(vals))
                vals = []
            var = toks[1]
        d['cadd'] = ",".join(map(str, caddvals))
        if i == 0:
            print "\t".join(d.keys())
        print "\t".join(map(str, d.values()))
def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name score strand abundance".split()):
            try:
                if int(l['abundance']) == 0: continue
                # gm[<parsed file name>][<miRNA name>] = abundance value
                gm[op.basename(f).split(".mirna_abundance", 1)[0]][l['name']] = l['abundance']
            except KeyError:
                # header failed to set l['abundance']
                pass
    # the sample names
    caselist = sorted(gm.keys())
    # only save lines where at least one sample has a positive value
    completeset = []
    for i, case in enumerate(caselist):
        keys = sorted(gm[caselist[i]].keys())
        for k in keys:
            completeset.append(k)
    mirnas = set(completeset)
    # print the matrix
    print "\t".join(k for k in caselist)
    for mirna in mirnas:
        fields = [mirna]
        for c in caselist:
            try:
                fields.append(gm[c][mirna])
            except KeyError:
                # miRNA not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))
def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name counts nonzero blength nonzerofracofb".split()):
            try:
                # gm[<parsed file name>][<peak name>] = count value
                fullname = "%s:%s:%s:%s" % (l["name"], l["chrom"], l["start"], l["stop"])
                gm[f.split(".", 1)[0]][fullname] = l["counts"]
            except KeyError:
                # header failed to set l['val']
                pass
    # print the matrix
    caselist = sorted(gm.keys())
    # this step is unnecessary as they all have counts for the same peaks
    completeset = []
    for i, case in enumerate(caselist):
        keys = sorted(gm[caselist[i]].keys())
        for k in keys:
            completeset.append(k)
    peaks = set(completeset)
    print "#peak_name\t" + "\t".join(k for k in caselist)
    for peak in peaks:
        fields = [peak]
        for c in caselist:
            try:
                fields.append(gm[c][peak])
            except KeyError:
                # peak not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))
def get_unique_protein_seqs(imgt):
    c = Counter()
    for l in reader(imgt, header=True):
        if not l['Functionality'] == "productive": continue
        if len(l['AA JUNCTION']) < 2: continue
        c.update([l['AA JUNCTION']])
    return c
def main(shifts, sites):
    refsites = sites_to_dict(sites)
    try:
        cols = reader(shifts, header=False).next()
    except StopIteration:
        print >>sys.stderr, ">> empty file:", shifts
        sys.exit(1)
    comparisons = cols[2:]
    shifts_d = shifts_to_dict(comparisons, shifts)
    for comparison, all_sites in shifts_d.iteritems():
        lines = []
        for (site, shift) in all_sites.iteritems():
            a, b = site.split(",")
            a = refsites[a]
            b = refsites[b]
            lines.append(bed12line(a.chrom, a.start, b.stop, a.strand, shift))
        lines = sorted(lines, key=operator.itemgetter(0, 1))
        if len(lines) == 0:
            print >>sys.stderr, ">> nothing found in", comparison
            continue
        result = "{comparison}.dexseq.bed".format(**locals())
        print >>sys.stderr, ">> writing", result
        f = open(result, 'wb')
        for line in lines:
            print >>f, "\t".join(map(str, line))
        f.close()
def plot(f, axs, shared):
    diffs = []
    xs, ys = [], []
    for d in ts.reader(f, sep=","):
        if not (d['sample_a'], d['sample_b']) in shared: continue
        x = float(d['rel'])
        y = float(d['pedigree_relatedness'])
        #if abs(x - y) > 0.25: continue
        diffs.append(x - y)
        xs.append(x)
        ys.append(y)
    """
    ax.scatter(xs, ys)
    ax.set_xlabel('relatedness by genotype')
    ax.set_ylabel('relatedness by ped file')
    ax.set_title(f)
    """
    p5, p95 = np.percentile(diffs, [2.5, 97.5])
    m, std = np.mean(diffs), np.std(diffs)
    ax2 = axs
    ax2.set_title(convert(f))
    ax2.hist(diffs, 40)
    ax2.text(0.6, 0.8, "95%% range: %.3f - %.3f\nmean: %.3f std: %.3f" % (p5, p95, m, std),
             transform=ax2.transAxes)
    ax2.set_xlabel("genotype - expected")
    ax2.set_ylabel("count")
def make_tree(path):
    tree = defaultdict(IntervalTree)
    prev_chrom, prev_pos = None, 0
    prev_hap = None
    added = defaultdict(int)
    for i, line in enumerate(ts.reader(path, sep=',')):
        chrom = line['chromosome']
        pos = line['position(B38)']
        hap_probs = list(line.items())[3:]
        hap_probs_np = np.array([v for k, v in hap_probs])
        max_hap = [v for i, v in enumerate(hap_probs) if i == np.argmax(hap_probs_np)][0]
        hap, score = max_hap
        if float(score) < 0.8: continue
        if i == 0:
            prev_hap = hap
            prev_chrom = chrom
            prev_pos = pos
        if chrom == prev_chrom and hap != prev_hap:
            tree[prev_chrom].add(int(prev_pos), int(pos), other=prev_hap)
            # print('added {}:{}-{}'.format(prev_chrom, prev_pos, pos))
            prev_pos = pos
            prev_chrom = chrom
            prev_hap = hap
            added[chrom] = int(pos)
        elif chrom != prev_chrom:
            tree[prev_chrom].add(int(prev_pos), added[prev_chrom], other=prev_hap)
            # print('added {}:{}-{}'.format(prev_chrom, prev_pos, added[prev_chrom]))
            prev_pos = 0
            prev_chrom = chrom
            prev_hap = hap
        else:
            added[chrom] = int(pos)
            continue
    return tree
def read_values(path='/uufs/chpc.utah.edu/common/home/u1021864/analysis/scoredregions.bed'):
    var = defaultdict(defaultdict)
    ccrs = defaultdict(list)
    genes = defaultdict(list)
    for i, region in enumerate(ts.reader(path, header="ordered")):
        ccrs['gerp'].append(np.mean(map(float, region['GERP'].split(","))))
        ccrs['phast'].append(np.mean(map(float, region['phastCons'].split(","))))
        ccrs['cadd'].append(np.mean(map(float, region['CADD'].split(","))))
        length = sum([int(i.split("-")[1]) - int(i.split("-")[0])
                      for i in region['ranges'].split(',')])
        ccrs['pct'].append(float(region['weighted_pct']))
        ccrs['gene'].append(region['gene'])
        ccrs['chrom'].append(region['chrom'])
        ccrs['ranges'].append(region['ranges'])
        ccrs['length'].append(length)
        if genes[region['gene']]:
            genes[region['gene']][0] += 1
            genes[region['gene']][1] += length
        else:
            genes[region['gene']] = [1, length]
    var['ccrs'] = ccrs
    var['genes'] = genes
    return var
def main():
    args = get_args()
    if args.verbose:
        sys.stderr.write(">> building gene orthology cross-reference...\n")
    xref = get_xref(args.xref)
    if args.verbose:
        sys.stderr.write(">> building uniprot library...\n")
    uniprot = parse_uniprot_flat(args.uniprot)
    if args.verbose:
        sys.stderr.write(">> annotating matrisome...\n")
    header = nopen(args.matrisome).readline().rstrip("\r\n").split("\t")
    headerext = ['r_ENSRNOP', 'r_score', 'r_geneid', 'r_gene_description', \
                 'r_uniprot', 'r_interpro', 'r_refseqn', 'r_refseqp', \
                 'r_ensg', 'r_enst', 'r_ensp']
    header.extend(headerext)
    print "\t".join(h for h in header)
    for entry in reader(args.matrisome):
        # reset vars
        for h in headerext:
            entry[h] = ""
        # handle multiple entries delimited by ":"
        for entryname in entry[args.xref_col].split(":"):
            # looping over entire defaultdict each time
            for uid, ddict in xref.iteritems():
                # find a matching ortholog
                for orthoname in ddict['orthonames']:
                    if orthoname == entryname:
                        # use the uid to get the rat names and scores
                        for ratname, ratscore in izip(xref[uid]['ratnames'], xref[uid]['ratscores']):
                            # print ratname
                            entry['r_ENSRNOP'] += "%s:" % ratname
                            entry['r_score'] += "%s:" % ratscore
                            # for each rat ENSP, add the corresponding annotation(s)
                            for uniqueid, uniprot_entry in uniprot.iteritems():
                                for ensemblname in uniprot_entry['ensemblp']:
                                    if ensemblname == ratname:
                                        # print all of the info for this uid
                                        entry['r_geneid'] += ':'.join(t for t in uniprot[uniqueid]['geneid']) + ":"
                                        entry['r_gene_description'] += ':'.join(t for t in uniprot[uniqueid]['description']) + ":"
                                        entry['r_uniprot'] += ':'.join(t for t in uniprot[uniqueid]['uniprotid']) + ":"
                                        entry['r_interpro'] += ':'.join(t for t in uniprot[uniqueid]['interpro']) + ":"
                                        entry['r_refseqn'] += ':'.join(t for t in uniprot[uniqueid]['refseqn']) + ":"
                                        entry['r_refseqp'] += ':'.join(t for t in uniprot[uniqueid]['refseqp']) + ":"
                                        entry['r_ensg'] += ':'.join(t for t in uniprot[uniqueid]['ensemblg']) + ":"
                                        entry['r_enst'] += ':'.join(t for t in uniprot[uniqueid]['ensemblt']) + ":"
                                        entry['r_ensp'] += ':'.join(t for t in uniprot[uniqueid]['ensemblp']) + ":"
        print "\t".join(entry[h].rstrip(":") for h in header)
def parse_txt(txt):
    """returns the set of miRNAs present in the network."""
    observed_mirs = set()
    for t in txt:
        for toks in reader(t, header=['name', 'chrom', 'start', 'stop', 'gene']):
            mir_name = toks['name'].split("|")[0]
            observed_mirs.add(mir_name)
    return observed_mirs
def lamina():
    if not op.exists('lamina.bed'):
        fh = open('lamina.bed', 'w')
        fh.write("#chrom\tstart\tend\tvalue\n")
        for gff in reader('http://www.nature.com/nature/journal/v453/n7197/extref/nature06947-s2.txt',
                          header=False):
            fh.write("\t".join([gff[0], gff[3], gff[4], gff[5]]) + "\n")
        fh.close()
    return 'lamina.bed'
def shifts_to_dict(cols, fname):
    d = OrderedDict()
    for c in cols:
        d[c] = {}
        for l in reader(fname):
            if not l[c] == "proximal" and not l[c] == "distal": continue
            d[c][l['Sites']] = l[c]
    return d
def bam2bedgraph(args):
    """Convert bam to bedgraph.

    Args: bedgraph, bam, strand"""
    cmd = "|bedtools genomecov -bg -5 -ibam %s" % (args.bam)
    if args.strand:
        cmd = "|bedtools genomecov -bg -5 -strand %s -ibam %s" % (args.strand, args.bam)
    result_header = "chrom start stop counts".split()
    for b in reader(cmd, header=result_header):
        print "\t".join(b[r] for r in result_header)
def rad_format(fmethylated, fcounts, fout):
    if isinstance(fout, basestring):
        fout = ts.nopen(fout, "w")
    for i, (m, c) in enumerate(it.izip(ts.reader(fmethylated, header=False),
                                       ts.reader(fcounts, header=False))):
        if i == 0:
            fout.write("\t" + "\t".join(m[1:]) + "\n")
        else:
            assert m[0] == c[0]
            methyls = m[1:]
            counts = c[1:]
            pairs = "\t".join("%s %s" % (ci, mi) for mi, ci in zip(methyls, counts))
            chrom, pos = c[0].split(":")
            pos = int(pos)
            site = "%s:%i:%i" % (chrom, pos, pos + 1)
            fout.write("%s\t%s\n" % (site, pairs))
    return fout.name
def example():
    import toolshed as ts
    from collections import namedtuple
    it = ts.reader('/uufs/chpc.utah.edu/common/home/u6000294/lustre/u6000294/pmodel/y.sort.bed.gz')
    iterable = (Interval(**iv) for iv in it)
    for gene, val in slider(iterable, size_grouper(1), FRV_inline, maf_cutoff=0.005):
        print "%s\t%.3f\t%.3f" % (gene[0].autoregs, val, IAFI_inline(gene, 65000))
def partsort(afile, group_cols, sort_cols, sort_convertors, header=False):
    """
    the converted columns are appended to the end of the row. then after the
    sort, these are removed. this removes problems with floating point reprs.
    """
    the_first_line = get_header(afile)
    row_len = len(the_first_line)
    n_extra = len(sort_convertors)
    # maintain order of the sort cols, but use the appended columns for the
    # numeric ones.
    actual_sort_cols = []
    n_extra = 0
    # since we append floats to the end *and* want to maintain the
    # requested sort order, we create the `actual_sort_cols`
    for c in sort_cols:
        if not c in sort_convertors:
            actual_sort_cols.append(c)
        else:
            idx = row_len + n_extra
            actual_sort_cols.append(idx)
            n_extra += 1
    # if it was stdin, then we read one line to get the header length.
    lines = reader(afile, header=header) if afile != "-" \
            else chain([the_first_line], reader(afile, header))
    # groupby the correct columns
    for keyed, group in groupby(lines, lambda toks: [toks[i] for i in group_cols]):
        # then generate the rows with the converted columns appended.
        def gen_converted_group():
            for toks in group:
                # add the converted columns onto the end.
                yield toks + [fn(toks[col_idx]) for col_idx, fn in sort_convertors.items()]
        # then iterate over the sorted cols.
        for toks in sorted(gen_converted_group(), key=itemgetter(*actual_sort_cols)):
            # strip the extra columns.
            yield toks[:row_len]
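# Usage sketch (not from the original source): group a tab-delimited file by
# its first column and sort each group numerically on column 2, while keeping
# the original string representation of that column in the output.
# "scores.txt" is a hypothetical input file.
def _partsort_example():
    for row in partsort("scores.txt", group_cols=[0], sort_cols=[1],
                        sort_convertors={1: float}, header=False):
        print "\t".join(row)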
def write_result(fanno, written=[False]):
    for i, d in enumerate(reader(fanno, header="ordered")):
        if i == 0 and written[0] == False:
            print >> out, "\t".join(d.keys())
            written[0] = True
        print >> out, "\t".join(d.values())
    os.unlink(fanno)
    os.unlink(fanno.replace(".anno", ""))
def main(dexseq, pval, pval_cutoff):
    dex_runs = OrderedDict()
    for fname in dexseq:
        cols = reader(fname, header=False).next()[1:]
        try:
            a, b = sample_names(cols, "log2fold")
            strand = gstrand(fname)
        except StrandNotFound:
            print >>sys.stderr, ">> strand (pos, neg) must be in file names."
            sys.exit(1)
        except UnboundLocalError:
            print >>sys.stderr, ">> failed to get sample names for", fname
            print >>sys.stderr, ">> skipping..."
            continue
        log2fold = cols[-1]
        assert a != b
        run_id = "{a}_to_{b}.{strand}".format(**locals())
        dex_runs[run_id] = {}
        for group in grouper(reader(fname, header=True), "geneID"):
            results = OrderedDict()
            for site in group:
                try:
                    # p-value threshold filtering
                    if float(site[pval]) > pval_cutoff: continue
                except ValueError:
                    continue
                # fold change should be recorded from dexseq
                assert site[log2fold] != "NA"
                site_id = int(site['exonID'].rsplit(".")[-1])
                results[site_id] = {'fc': float(site[log2fold]),
                                    'name': site['exonID'].lstrip('E')}
            if len(results) < 2: continue
            # iterating over the pairs involved in switching event
            for (aid, ad), (bid, bd) in pairs(results):
                # the direction of change
                direction = shift(aid, ad['fc'], bid, bd['fc'])
                comp = "{aname},{bname}".format(aname=ad['name'], bname=bd['name'])
                # complex name to ease creating multiindex dataframe
                dex_runs[run_id]["{gene}:{comp}".format(gene=site['geneID'], comp=comp)] = direction
    try:
        df = pd.DataFrame(dex_runs)
        # create multiindex via split
        df.index = pd.MultiIndex.from_tuples([x.split(":") for x in df.index],
                                             names=['Gene', 'Sites'])
        df.to_csv(sys.stdout, sep="\t", na_rep="na")
    except Exception:
        # empty dataframe
        print >>sys.stderr, "No significant sites were found."
def read_regions(fregions):
    tree = defaultdict(InterLap)
    for i, toks in enumerate(ts.reader(fregions, header=False)):
        if i == 0 and not (toks[1] + toks[2]).isdigit(): continue
        tree[toks[0]].add((int(toks[1]), int(toks[2]), toks))
    sys.stderr.write("# read %i regions from %s\n" \
            % (sum(len(v) for v in tree.values()), fregions))
    return tree
def cross_ref(kgxref, table_id, table_symbol):
    """Returns dictionary of knownGene cross-reference table by the table
    identifier, ie. refseq.
    """
    xref = {}
    for x in reader(kgxref):
        xref[x[table_id]] = x[table_symbol]
    return xref
def write_result(fanno, written=[False]):
    for i, d in enumerate(reader(fanno, header="ordered")):
        if i == 0 and written[0] == False:
            print >>out, "\t".join(d.keys())
            written[0] = True
        print >>out, "\t".join(x if x else "NA" for x in d.values())
    os.unlink(fanno)
    os.unlink(fanno.replace(".anno", ""))
def write_result(fanno, written=[False]):
    for i, d in enumerate(reader(fanno, header="ordered")):
        if i == 0 and written[0] == False:
            print("\t".join(list(d.keys())), file=out)
            written[0] = True
        print("\t".join(x if x else "NA" for x in list(d.values())), file=out)
    os.unlink(fanno)
    os.unlink(fanno.replace(".anno", ""))
def main():
    p = argparse.ArgumentParser(__doc__)
    p.add_argument("-g", dest="group", help="group by the first column (usually"
                   " chromosome or probe) if this [optional]",
                   default=False, action="store_true")
    p.add_argument("--skip", dest="skip",
                   help="Maximum number of intervening "
                   "basepairs to skip before seeing a value. If this number is "
                   "exceeded, the region is ended chromosome or probe "
                   "[default: %default]", type=int, default=50000)
    p.add_argument("--min-region-size", dest="min-region",
                   help="minimum "
                   "length of the region. regions shorter than this are not printed"
                   "[default: %default] (no minimum)", type=int, default=0)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                   " large in order to seed a region. [default: %default]",
                   type=float, default=5.0)
    p.add_argument("--keep-cols", dest="keep", help="comma separated list of"
                   "columns to add to the output data", default="")
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                   "of at least this number can extend a region [default: "
                   "%default]", type=float, default=3.0)
    p.add_argument("regions")
    args = p.parse_args()
    f = reader(args.regions, header=False, sep="\t")
    keep = [int(k) for k in args.keep.strip().split(",") if k]
    report_cutoff = args.seed
    for key, region in gen_regions(f, args.skip, args.seed, args.threshold,
                                   args.group, keep, report_cutoff):
        print key + "\t" + "\t".join(map(str, region))
def rad_format(fmethylated, fcounts, fout):
    if isinstance(fout, basestring):
        fout = ts.nopen(fout, "w")
    for i, (m, c) in enumerate(
            it.izip(ts.reader(fmethylated, header=False),
                    ts.reader(fcounts, header=False))):
        if i == 0:
            fout.write("\t" + "\t".join(m[1:]) + "\n")
        else:
            assert m[0] == c[0]
            methyls = m[1:]
            counts = c[1:]
            pairs = "\t".join("%s %s" % (ci, mi) for mi, ci in zip(methyls, counts))
            chrom, pos = c[0].split(":")
            pos = int(pos)
            site = "%s:%i:%i" % (chrom, pos, pos + 1)
            fout.write("%s\t%s\n" % (site, pairs))
    return fout.name
def bediter(fname, col_num):
    for i, l in enumerate(reader(fname, header=False)):
        if l[0][0] == "#": continue
        try:
            yield {"chrom": l[0], "start": int(l[1]), "end": int(l[2]),
                   "p": float(l[col_num])}  # "stuff": l[3:][:]}
        except:
            print >>sys.stderr, l
            if i != 0:
                raise
def main(regions, bams, reads=None,
         flags="-F%i" % (0x100 | 0x4 | 0x200 | 0x400), pad=100):
    r2 = open(tempfile.mktemp(), 'w')
    for toks in reader(regions, header=False):
        if toks[0][0] == "@" or not (toks[1] + toks[2]).isdigit(): continue
        toks[1] = str(max(0, int(toks[1]) - pad))
        toks[2] = str(int(toks[2]) + pad)
        print >> r2, "\t".join(toks)
    r2.flush()
    regions = r2.name
    print reads
    if reads.isdigit():
        reads = int(reads)
    elif reads != "bam":
        reads = int(nopen("|bioawk -c fastx 'END { print NR }' %s" % reads).next()) * 2.0
    counts = {}
    colors = cycle('rgbkmy')
    bam_reads = {}
    counts = dict(pmap(count_both, ((bam, regions, flags) for bam in bams)))
    for bam in bams:
        nreads = count_bam(bam, flags) if reads == "bam" else reads
        bam_reads[bam] = nreads
        symbol = 'o' if len(set(counts[bam][0])) < 3 else '.'
        pl.plot(counts[bam][0] / float(nreads), counts[bam][1] / float(nreads),
                '%s%s' % (colors.next(), symbol), label=name(bam))
    pl.xlabel('off target')
    pl.ylabel('on target')
    pl.legend(loc='lower right')
    pl.xlim(xmin=0)
    pl.ylim(ymin=0)
    pl.show()
    os.unlink(r2.name)
    out = sys.stdout
    print >> out, "qual\tmethod\toff\ton"
    for qual in range(0, 256):
        for b in bams:
            # use the read count for this bam (b), not the last loop variable
            print >> out, "{qual}\t{bam}\t{off}\t{on}".format(
                qual=qual, bam=name(b),
                off=counts[b][0][qual] / bam_reads[b],
                on=counts[b][1][qual] / bam_reads[b])
    print >> sys.stderr, "wrote", out.name
def load_background_file(freq_background_filename):
    ''' load genome nuc frequencies from pre-computed background file '''
    result = defaultdict(dict)
    for row in reader(freq_background_filename):
        region_size = int(row['region.size'])
        nuc = row['nuc']
        freq = float(row['freq'])
        result[region_size][nuc] = freq
    return result
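# Usage sketch (not from the original source): "background.tsv" is a
# hypothetical pre-computed table with region.size, nuc, and freq columns.
# The returned dict is keyed first by region size, then by the nucleotide
# (or k-mer) string.
def _background_example():
    bg = load_background_file("background.tsv")
    # frequency of "CG" within 100 bp background regions, 0.0 if absent
    return bg[100].get("CG", 0.0)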
def _set_structure(self, structure):
    """
    here, we want to intersect the query and subject bed files with the
    structure.bed file and give each set of intervals in query and bed that
    fall within (or have any overlap with) a unique, fake chromosome so that
    all shuffling is within that chromosome.
    in order to do this, we also have to create a fake genome file that
    contains the lengths of those chromosomes.
    """
    if structure in (None, ""): return
    self.chrom = True  # has to be by chromosome.
    n_query_before = sum(1 for _ in nopen(self.query))
    n_subject_before = sum(1 for _ in nopen(self.subject))
    new_genome = open(mktemp(suffix='.fake_genome'), 'w')
    structure = "<(cut -f 1-3 %s)" % structure
    seen_segs = {}
    for bed in ('query', 'subject', 'exclude', 'include'):
        bed_path = getattr(self, "_" + bed, getattr(self, bed))
        if not bed_path: continue
        new_fh = open(mktemp(suffix='%s.fake' % bed), 'w')
        for toks in reader("|bedtools intersect -wo -a %s -b '%s' \
                | sort -k4,4 -k5,5g" % (structure, bed_path), header=False):
            gtoks, btoks = toks[:3], toks[3:-1]  # drop the bp overlap
            new_chrom = "_".join(gtoks)
            gtoks[1:] = map(int, gtoks[1:])
            btoks[1:3] = map(int, btoks[1:3])
            glen = gtoks[2] - gtoks[1]  # fake chrom length.
            if new_chrom.startswith('chr'): new_chrom = new_chrom[3:]
            if not new_chrom in seen_segs:
                # save it in the genome file.
                print >> new_genome, "\t".join((new_chrom, str(glen)))
            seen_segs[new_chrom] = True
            # with partial overlap, we'll have a negative start or an
            # end outside the genome... for now, just truncate.
            # adjust the interval to its location the new chrom.
            btoks[0] = new_chrom
            btoks[1] = max(0, btoks[1] - gtoks[1])  # don't let it go below 0
            # chop to end of fake chrom.
            btoks[2] = min(btoks[2] - gtoks[1], glen - 1)
            assert 0 <= btoks[1] <= btoks[2] < glen
            btoks[1:3] = map(str, btoks[1:3])
            print >> new_fh, "\t".join(btoks)
        new_fh.close()
        setattr(self, bed, new_fh.name)
    new_genome.close()
    self.genome_file = new_genome.name
def _get_genotypes(vcf, min_qual, min_genotype_qual, min_samples, as_vcf):
    fh = ts.nopen(vcf)
    if as_vcf:
        for header in fh:
            print(header.rstrip("\r\n"))
            if header.startswith("#CHROM"):
                header = header.split("\t")
                header[0] = "CHROM"
                break
        else:
            1 / 0
        vcf_iter = ts.reader(chain(["\t".join(header)], fh), header="ordered")
    else:
        vcf_iter = ts.reader(vcf, skip_while=lambda l: l[0] != "#CHROM",
                             header="ordered")
    for i, variant in enumerate(vcf_iter):
        yield _get_genotype(i, variant, min_qual, min_genotype_qual, min_samples)
def bediter(fnames, col_num, delta=None):
    """
    iterate over a bed file. turn col_num into a float
    and the start, stop column into an int and yield a dict
    for each row.
    """
    last_chrom = chr(0)
    last_start = -1
    if isinstance(fnames, basestring):
        fnames = [fnames]
    for fname in fnames:
        for i, l in enumerate(ts.reader(fname, header=False)):
            if l[0][0] == "#": continue
            if i == 0:  # allow skipping header
                try:
                    float(l[col_num])
                except ValueError:
                    continue
            chrom = l[0]
            start = int(float(l[1]))
            if chrom == last_chrom:
                assert start >= last_start, ("error at line: %i, %s" %
                        (i, "\t".join(l)), "file is not sorted")
            else:
                assert last_chrom < chrom, ("error at line: %i, %s "
                        " with file: %s" % (i, "\t".join(l), fname),
                        "chromosomes must be sorted as characters",
                        last_chrom, "is not < ", chrom)
                last_chrom = chrom
            last_start = start
            p = float(l[col_num])
            if not delta is None:
                if p > 1 - delta:
                    p -= delta  # the stouffer correction doesnt like values == 1
                if p < delta:
                    p = delta  # the stouffer correction doesnt like values == 0
            v = {"chrom": l[0], "start": start, "end": int(float(l[2])),
                 "p": p}  # "stuff": l[3:][:]}
            if v['end'] - v['start'] > 100000:
                print("warning! large interval at %s will increase memory use." % v)
            yield v
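# Usage sketch (not from the original source): iterate over a sorted BED file
# ("pvals.bed", hypothetical) whose 4th column holds p-values, clamping them
# away from 0 and 1 with delta, and collect the nominally significant rows.
def _bediter_example():
    hits = [row for row in bediter("pvals.bed", col_num=3, delta=1e-16)
            if row["p"] < 0.05]
    return hits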
def target_size_from_mips(mips, pad=0):
    tmp = open(mktemp(suffix=".bed"), "w")
    for k in mips:
        # ext/lig_probe_start/stop
        for (chrom, pos), d in mips[k].items():
            posns = [int(d[p]) for p in
                     "ext_probe_start ext_probe_stop lig_probe_start lig_probe_stop".split()]
            tmp.write("%s\t%i\t%i\n" % \
                    (chrom, max(0, min(posns) - pad), max(posns) + pad))
    tmp.close()
    size = 0
    for toks in ts.reader("|tail -n+2 %s | sort -k1,1 -k2,2n | bedtools merge -i stdin" % tmp.name,
                          header=False):
        size += int(toks[2]) - int(toks[1])
    return size
def feature_gen(fname, row_handler=row_handler, feature_class=ClusterFeature,
                sep="\t", rho_min=0.3, skip_first_row=True, weights=None):
    """
    Parameters
    ----------
    fname : str
        file name containing methylation data

    row_handler: function
        function that takes a list of values for each line in `fname` and
        returns a tuple of chrom, start, end, values. e.g.

            def row_handler(tokens):
                chrom, pos = tokens[0].split(":")
                return (chrom, int(pos) - 1, int(pos), map(float, tokens[1:]))

    feature_class: class
        a class derived from `ClusterFeature` that accepts chrom, start, end,
        values and has those attributes and fulfills the requirements of
        aclust.aclust.

    rho_min: float
        the minimum spearman's r between 2 sets of values for them to be
        considered as correlated
    """
    if weights is not None:
        weights = reader(weights, header=False, sep=sep)
    for i, toks in enumerate(reader(fname, header=False, sep=sep)):
        if i == 0 and skip_first_row:
            if weights is not None:
                next(weights)
            continue
        vals = row_handler(toks)
        if weights is not None:
            chrom, start, end, weight_vals = row_handler(next(weights))
            assert chrom == vals[0]
            assert start == vals[1], (vals[1], start)
        else:
            weight_vals = None
        yield feature_class(*vals, **{'rho_min': rho_min, 'weights': weight_vals})
def read_mips(mips_file):
    sys.stderr.write("reading %s\n" % mips_file)
    m = {'ext_probe_start': {}, 'lig_probe_start': {},
         'ext_probe_stop': {}, 'lig_probe_stop': {}}
    ss = m.keys()
    for d in ts.reader(mips_file):
        for key in ss:
            d[key] = int(d[key])
        m['ext_probe_start'][(d['chr'], int(d['ext_probe_start']))] = d
        m['lig_probe_start'][(d['chr'], int(d['lig_probe_start']))] = d
        m['lig_probe_stop'][(d['chr'], int(d['lig_probe_stop']))] = d
        m['ext_probe_stop'][(d['chr'], int(d['ext_probe_stop']))] = d
    return m
def _split_chroms(fname):
    import tempfile
    t = tempfile.mktemp(dir="/tmp", suffix=".cruzdb")
    chroms = {}
    for d in reader(fname, header="ordered"):
        if not d['chrom'] in chroms:
            chroms[d['chrom']] = open(t + "." + d['chrom'], "w")
            print >> chroms[d['chrom']], "\t".join(d.keys())
        print >> chroms[d['chrom']], "\t".join(d.values())
    for k in chroms:
        chroms[k].close()
        chroms[k] = (chroms[k], chroms[k].name + ".anno")
    return chroms.items()
def as_bam(pfile, fa, prefix, calmd=False, set_as_failed=None):
    """
    pfile: either a file or a |process to generate sam output
    fa: the reference fasta
    prefix: the output prefix or directory
    set_as_failed: None, 'f', or 'r'. If 'f'. Reads mapping to that strand
                   are given the sam flag of a failed QC alignment (0x200).
    """
    view = "samtools-0.1.18 view -bS - | samtools-0.1.18 sort -m 5005919104 - "
    if calmd:
        cmds = [
            view + "{bam}.tmp",
            "samtools_old calmd -AbEr {bam}.tmp.bam {fa} > {bam}.bam 2>/dev/null",
            "rm {bam}.tmp.bam"]
    else:
        cmds = [view + "{bam}"]
    cmds.append("samtools-0.1.18 index {bam}.bam")
    cmds = [c.format(bam=prefix, fa=fa) for c in cmds]
    sys.stderr.write("writing to:\n%s\n" % cmds[0])
    p = nopen("|" + cmds[0], 'w')
    out = p.stdin
    # out = sys.stdout # useful for debugging
    bam_iter = reader("%s" % (pfile, ), header=False, quotechar=None)
    out.write('@HD\tVN:1.5\tSO:coordinate\n')
    for toks in bam_iter:
        if not toks[0].startswith("@"):
            break
        handle_header(toks, out)
    else:
        sys.stderr.flush()
        raise Exception("bad or empty fastqs")
    bam_iter2 = chain([toks], bam_iter)
    for read_name, pair_list in groupby(bam_iter2, itemgetter(0)):
        pair_list = [Bam(toks) for toks in pair_list]
        for aln in handle_reads(pair_list, set_as_failed):
            out.write(str(aln) + '\n')
    stdout, stderr = p.communicate()
    stdout = stdout.replace('\r', '\n')
    stderr = stderr.replace('\r', '\n')
    # p.stdin.flush()
    # p.stdout.flush()
    # p.stdin.close()
    # assert p.wait() == 0
    for cmd in cmds[1:]:
        sys.stderr.write("running: %s\n" % cmd.strip())
        assert check_call(cmd.strip(), shell=True) == 0
def roc_out(p_bed, p_col, truth_region_bed, exclude=('-1', 'NA', 'nan')):
    """Create ROC for a bed file of p-values given known truth regions.

    Parameters
    ----------
    p_bed : file

    p_col : int
        column containing the p-value from `p_bed`

    truth_region_bed : file
        contains the true regions
    """
    p_col -= 1  # 0-based
    regions = defaultdict(list)
    for toks in ts.reader(truth_region_bed, header=False):
        if not (toks[1] + toks[2]).isdigit(): continue
        regions[toks[0]].append((int(toks[1]), int(toks[2])))
    truths = []
    vals = []
    for toks in ts.reader(p_bed, header=False):
        if not (toks[1] + toks[2]).isdigit(): continue
        reg = regions[toks[0]]
        s, e = int(toks[1]), int(toks[2])
        p = toks[p_col]
        if p in exclude: continue
        vals.append(1.0 - float(p))
        truth = any(rs <= s <= re or rs <= e <= re for rs, re in reg)
        truths.append(truth)
    return np.array(truths).astype(int), np.array(vals)
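# Usage sketch (not from the original source): the truth/score arrays returned
# by roc_out() can be passed to scikit-learn, assuming it is installed, to
# compute the curve and its AUC. File names here are hypothetical.
def _roc_example():
    from sklearn.metrics import roc_curve, auc
    truths, scores = roc_out("pvalues.bed", p_col=4, truth_region_bed="truth.bed")
    fpr, tpr, _ = roc_curve(truths, scores)
    return auc(fpr, tpr)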
def get_bam_lookup(p="data/bam-lookups-from-1kg-site.tsv"):
    l = {}
    for d in ts.reader(p):
        if 'low_coverage' in d['url']: continue
        if 'chr20' in d['url']: continue
        if 'chrom20' in d['url']: continue
        if 'chrom11' in d['url']: continue
        if 'unmapped' in d['url']: continue
        # NOTE: we could also get some samples with cram.
        if not d['url'].endswith('.bam'): continue
        if d['Sample'] in l:
            print "XXX:", d['url']
            print "YYY:", l[d['Sample']]
        l[d['Sample']] = d['url']
    return l
def simplify_bed(fbed, has_header):
    """
    create a bed with no header and 6 columns.
    retain strand info.
    """
    line_gen = reader(fbed, header=False)
    header = line_gen.next() if has_header else None
    fh = open(BedTool._tmp(), "w")
    for toks in line_gen:
        new_toks = toks[:3] + ["Z_Z".join(toks), ".",
                               toks[5] if len(toks) > 5 else "."]
        fh.write("\t".join(new_toks) + "\n")
    fh.close()
    return BedTool(fh.name), header
def shared(fs):
    sets = []
    for f in fs:
        s = set()
        for d in ts.reader(f, sep=","):
            x = float(d['rel'])
            y = float(d['pedigree_relatedness'])
            #if abs(x - y) > 0.2: continue
            s.add((d['sample_a'], d['sample_b']))
        sets.append(s)
    sall = sets[0]
    for i, s in enumerate(sets):
        if i == 0: continue
        sall &= s
    return sall
def calc_genome_size(chrom_size_filename, only_chroms, ignore_chroms, verbose):
    genome_size = 0.0
    for row in reader(chrom_size_filename, header=['chrom', 'size']):
        if (only_chroms and row['chrom'] not in only_chroms) or \
           (ignore_chroms and row['chrom'] in ignore_chroms):
            continue
        genome_size += float(row['size'])
    if verbose:
        print >> sys.stderr, ">> genome size: %s" % str(genome_size)
    return genome_size