def parse_args():
    """Parse the command line and open the calls and truth inputs.

    Returns:
        The argparse namespace. ``calls_f`` and ``truth_f`` are replaced by
        whatever ``drdcommon.xopen`` returns for the given paths.
        ``output_f`` is the value passed with ``-o``, or ``sys.stdout`` when
        the option is omitted.
    """
    parser = argparse.ArgumentParser(description='cnv pipeline')
    parser.add_argument('-c', '--calls', metavar='calls', required=True,
                        dest='calls_f', action='store',
                        help='input bed file with the calls of your classifier')
    parser.add_argument('-t', '--truth', metavar='truth', required=True,
                        dest='truth_f', action='store',
                        help='list of validated cnv calls')
    # The help text here used to be a copy of the --truth description.
    parser.add_argument('-o', '--output_f', metavar='output_fn',
                        required=False, dest='output_f', action='store',
                        default=sys.stdout,
                        help='where to write the results (default: stdout)')
    parser.add_argument('-w', '--buffer', metavar='win_buffer',
                        required=False, dest='buffer_size', action='store',
                        type=int, default=0,
                        help='buffer to use when checking hit against truth')
    parser.add_argument('-m', '--min_size', metavar='min_size',
                        required=False, dest='min_size', action='store',
                        type=int, default=0,
                        help='min size of the events you want to consider '
                             'when loading the truth. ')
    parser.add_argument('-r', '--chrm', metavar='calls_chrm', required=False,
                        dest='calls_chrm', action='store',
                        help='Chrm we used for the calls.')

    args = parser.parse_args()
    # Swap the two input paths for open handles.
    args.calls_f = drdcommon.xopen(args.calls_f)
    args.truth_f = drdcommon.xopen(args.truth_f)
    return args
def main():
    """Hand the stdin stream and the file named by ``sys.argv[1]`` to ``do_work``.

    NOTE(review): exactly four argv entries are required, yet only
    ``sys.argv[1]`` is read here -- confirm against the usage text.
    """
    if len(sys.argv) != 4:
        drdcommon.error("Incorrect # of params.", usage)
        return

    fd_vcf = drdcommon.xopen("-")
    fd_csv = drdcommon.xopen(sys.argv[1])
    do_work(fd_vcf, fd_csv)
    for handle in (fd_vcf, fd_csv):
        handle.close()
def main():
    """Pass the stdin stream plus the two files named on the command line to ``do_work``."""
    if len(sys.argv) != 3:
        drdcommon.error("Incorrect # of params.", usage)
        return

    fd_vcf = drdcommon.xopen("-")
    fd_pheno_tsv = drdcommon.xopen(sys.argv[1])
    fd_haplo_tsv = drdcommon.xopen(sys.argv[2])
    do_work(fd_vcf, fd_pheno_tsv, fd_haplo_tsv)
    # Close in the same order the handles were opened.
    for handle in (fd_vcf, fd_pheno_tsv, fd_haplo_tsv):
        handle.close()
def parse_args():
    """Parse ``-e/--events`` and ``-r/--reference`` and open both files.

    Returns:
        The argparse namespace, extended with ``events_stream`` and
        ``reference_stream`` (the results of ``drdcommon.xopen`` on the two
        paths).
    """
    parser = argparse.ArgumentParser(description='cnv pipeline')
    # Both options are required strings whose metavar and dest equal the
    # long option name, so they are registered from a small table.
    option_table = (
        ('-e', 'events', 'List of events to introduce in the genome'),
        ('-r', 'reference', 'original fasta file of the reference'),
    )
    for short_flag, name, help_text in option_table:
        parser.add_argument(short_flag, '--' + name, metavar=name,
                            required=True, dest=name, action='store',
                            help=help_text)

    args = parser.parse_args()
    args.events_stream = drdcommon.xopen(args.events)
    args.reference_stream = drdcommon.xopen(args.reference)
    return args
def load_mapq(self):
    """Fill ``self.d_mq`` from the file named by ``self.sam_fn``.

    Lines whose first character is ``@`` are skipped. For every other line
    the whitespace-separated fields 0, 2, 3 and 4 are stored as
    ``d_mq[field0] = [field2, field3, field4]`` (all kept as strings).
    """
    drdcommon.log("Loading mapq")
    for record in drdcommon.xopen(self.sam_fn):
        if record[0] == '@':
            continue
        fields = record.split()
        probe_id = fields[0]
        chrm = fields[2]
        coor = fields[3]
        mq = fields[4]
        self.d_mq[probe_id] = [chrm, coor, mq]
def iterate_over_eg_cov(self):
    """Yield ``(n_hits, p_id)`` for each line of the file ``self.fn_hits``.

    Every line is split on whitespace; the first field is converted to
    ``int`` and the second is yielded as a string.

    Yields:
        tuple: ``(int, str)`` pairs in file order.

    Raises:
        ValueError: if the first field of a line is not an integer.
        IndexError: if a line has fewer than two fields.
    """
    fd_hits = drdcommon.xopen(self.fn_hits)
    try:
        for line in fd_hits:
            s = line.split()
            yield int(s[0]), s[1].rstrip()
    finally:
        # Runs on exhaustion, on a parse error, and when the consumer drops
        # the generator early, so the handle is not left open.
        fd_hits.close()
def main():
    """Print ``Saturation(stdin, args.at_least_seen).csv("\\t")``.

    The stdin stream is opened before the stdin check, as in the original
    statement order.
    """
    args = parse_args()
    stream = drdcommon.xopen("-")
    if not drdcommon.data_in_stdin():
        drdcommon.error(usage)
    report = Saturation(stream, args.at_least_seen).csv("\t")
    print(report)
    stream.close()
def main():
    """Run ``do_work`` on the stdin stream; no command-line arguments are accepted."""
    if len(sys.argv) != 1:
        drdcommon.error("Incorrect # of params.", usage)
        return

    fd_reads = drdcommon.xopen("-")
    do_work(fd_reads)
    fd_reads.close()
def method1(input1, input2):
    """Re-window the values of ``input1`` onto the windows of ``input2``.

            4      3            5             1
    i1  |------|-------|------------------|--------|
              7                10             1
    i2  |----------|-------------------------|-----|
            3.5               4               1
    out |----------|-------------------------|-----|

    Args:
        input1: iterable of ``chrm start stop val`` lines. Each value is
            written to every position ``start..stop`` (inclusive) of a
            per-base array. The chromosome of the last line becomes the
            working chromosome.
        input2: path handed to ``drd.xopen``; lines have the same four
            columns.

    For each ``input2`` window on the working chromosome, prints
    ``chrm<TAB>start<TAB>stop<TAB>median`` where the median is taken over the
    per-base array and truncated with ``int``.

    Raises:
        IndexError: if a window of ``input1`` ends beyond the array.
    """
    a = np.zeros(shape=(three_hundre_mil))
    n_positions = a.shape[0]

    # Save the metric values for all the bp from the first input.
    working_chrm = ""
    for i in input1:
        working_chrm, start, stop, val = i.split()
        s, e = int(start), int(stop)
        # Slice assignment silently clips, so keep the loud failure that
        # element-wise indexing gave for windows past the end of the array.
        if s <= e and e >= n_positions:
            raise IndexError("window %s-%s is outside the %s-position array"
                             % (s, e, n_positions))
        # One vectorised store instead of a Python loop over every base.
        a[s:e + 1] = float(val)

    # Iterate over the second input and generate new windows for the first.
    for i in drd.xopen(input2):
        chrm, start, stop, val = i.split()
        if working_chrm == chrm:
            s, e = int(start), int(stop)
            print("%s\t%s\t%s\t%s" % (chrm, start, stop,
                                      int(np.median(a[s:e + 1]))))
def load_mapq(self):
    """Read ``self.sam_fn`` and record three columns per probe in ``self.d_mq``.

    Header lines (first character ``@``) are ignored. For the remaining
    lines the entry ``d_mq[col0]`` is set to ``[col2, col3, col4]`` of the
    whitespace-split line; values stay strings.
    """
    drdcommon.log("Loading mapq")
    sam_lines = drdcommon.xopen(self.sam_fn)
    for sam_line in sam_lines:
        is_header = sam_line[0] == '@'
        if is_header:
            continue
        cols = sam_line.split()
        key, triple = cols[0], [cols[2], cols[3], cols[4]]
        self.d_mq[key] = triple
def main():
    """Run ``CnvStateMachine`` over stdin with the threshold given as ``sys.argv[1]``."""
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)

    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("No data in stdin.", usage)

    ratios_stream = drdcommon.xopen("-")
    threshold = float(sys.argv[1])
    machine = CnvStateMachine(ratios_stream, threshold)
    machine.run()
def main():
    """Run ``do_work`` on the stdin stream; the tool takes no arguments."""
    has_extra_args = len(sys.argv) != 1
    if has_extra_args:
        drdcommon.error("Wrong # of args", usage)

    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("No data in stdin.", usage)

    fd_vcf = drdcommon.xopen("-")
    do_work(fd_vcf)
    fd_vcf.close()
def main():
    """Plot the data read from stdin.

    The three command-line arguments are, in order, the plot title, the
    x-axis label and the y-axis label.
    """
    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("I need a data stream in stdin.", usage=_usage)
    if len(sys.argv) != 4:
        drdcommon.error("Wrong number of parameters", usage=_usage)

    title, x_label, y_label = sys.argv[1:]
    x, y = process_data(drdcommon.xopen("-"))
    plot(x, y, title, xlabel=x_label, ylabel=y_label)
def main():
    """Run ``do_work`` on stdin with the integer in ``sys.argv[1]`` as second argument."""
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)

    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("No data in stdin.", usage)

    fd_vcf = drdcommon.xopen("-")
    w_size = int(sys.argv[1])
    do_work(fd_vcf, w_size)
    fd_vcf.close()
def process_alignments(self):
    """Populate ``self.probe_info`` from the file named by ``self.fn_sam``.

    Lines starting with ``@`` are skipped. For the others the entry
    ``probe_info[col0]`` becomes ``[int(col4) > self.min_mq, False]``.
    """
    drdcommon.log("Processing alignments")
    init_doesnt_pass_eg_hits = False
    for record in drdcommon.xopen(self.fn_sam):
        if record[0] == '@':
            continue
        fields = record.split()
        probe_id = fields[0]
        mq = int(fields[4])
        self.probe_info[probe_id] = [mq > self.min_mq,
                                     init_doesnt_pass_eg_hits]
def main():
    """Plot the stdin data with fixed axis labels, titled with ``sys.argv[1]``."""
    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("I need a data stream in stdin.", usage="-")
    if len(sys.argv) != 2:
        drdcommon.error("Wrong number of parameters", usage="-")

    title = sys.argv[1]
    stream = drdcommon.xopen("-")
    x, y = process_data(stream)
    plot(x, y, title,
         xlabel="genomic window",
         ylabel="Average Read Depth")
def run(self):
    """Run ``SnpFreq`` on the file named by ``self.options.vcf_fn``.

    The result of ``SnpFreq.run()`` is printed with ``%f`` formatting unless
    ``self.options.list_s_snps`` is truthy, in which case it is discarded.
    """
    logging.basicConfig(level=logging.INFO)
    fd_vcf = drdcommon.xopen(self.options.vcf_fn)
    sf = SnpFreq(fd_vcf, self.exp_type, self.options)

    list_only = self.options.list_s_snps
    outcome = sf.run()
    if not list_only:
        print("%f" % outcome)

    fd_vcf.close()
def main():
    """Run ``do_work`` on the stream read from stdin.

    The tool takes no command-line parameters; input must be piped in.
    """
    # Nothing from sys.argv is used, so any extra argument is an error.
    # The previous `== 2` test only rejected exactly one extra argument and
    # let two or more through unnoticed.
    if len(sys.argv) != 1:
        drdcommon.error("Wrong # of args", usage)
    if not drdcommon.data_in_stdin():
        drdcommon.error("Need data in stdin.", usage)

    fd_vcf = drdcommon.xopen("-")
    do_work(fd_vcf)
    fd_vcf.close()
def load_hits(self):
    """Load per-sample hit counts into ``self.d_hits``.

    Every file returned by ``drdcommon.files_in_dir('.', self.pattern)`` is
    read; its sample id comes from ``self.extract_id``. Each line must split
    into exactly two fields ``n_hits p_id``; the count is stored (as a
    string) at ``d_hits[p_id][sample_id]``.
    """
    drdcommon.log("Loading hits")
    for fn in drdcommon.files_in_dir('.', self.pattern):
        sample_id = self.extract_id(fn)
        drdcommon.log("fn: %s | id: %s" % (fn, sample_id))
        for record in drdcommon.xopen(fn):
            fields = record.split()
            assert len(fields) == 2
            n_hits, p_id = fields
            self.d_hits[p_id.rstrip()][sample_id] = n_hits
def main():
    """Call ``compute_ratios`` with the stdin stream and the bam path in ``sys.argv[1]``."""
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)

    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("No data in stdin.", usage)

    windows = drdcommon.xopen("-")
    bam_name = sys.argv[1]
    bam_exists = os.path.isfile(bam_name)
    if not bam_exists:
        drdcommon.error("Invalid bam file.", usage)
    compute_ratios(windows, bam_name)
def main():
    """Run ``do_work`` on stdin with ``int(sys.argv[1])`` as the minimum sample count."""
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)

    # Kept as an explicit comparison with False, exactly as before.
    stdin_state = drdcommon.data_in_stdin()
    if stdin_state == False:
        drdcommon.error("Need data in stdin.", usage)

    min_num_samples = int(sys.argv[1])
    fd_vcf = drdcommon.xopen("-")
    do_work(fd_vcf, min_num_samples)
    fd_vcf.close()
def main():
    """Echo the stdin lines whose position appears in a VCF file.

    ``sys.argv[1]`` is a sample id and ``sys.argv[2]`` a VCF path. The first
    two tab-separated columns of every non-``#`` VCF line form the set of
    known positions. Each stdin line whose first two columns match a known
    position is printed, prefixed with the sample id and a tab. With any
    other argument count, ``usage`` is printed instead.
    """
    if len(sys.argv) != 3:
        print(usage)
        return

    sample_id, vcf_file = sys.argv[1:]

    known = set()
    for record in drdcommon.xopen(vcf_file):
        if record[0] == "#":
            continue
        v_chrm, v_coor = record.split("\t")[0:2]
        known.add(v_chrm + "_" + v_coor)

    for record in drdcommon.xopen("-"):
        row = record.rstrip()
        chrm, coor = row.split("\t")[0:2]
        if (chrm + "_" + coor) in known:
            print(sample_id + "\t" + row)
def main():
    """Plot the log ratios read from stdin against their 1-based bin number.

    ``sys.argv[1]`` is the plot title and ``sys.argv[2]`` the output file
    name.
    """
    if len(sys.argv) != 3:
        drdcommon.error("Wrong number of args. <title> <output.filename>")
        return

    logratios = process_data(drdcommon.xopen("-"))
    bin_nums = range(1, len(logratios) + 1)
    title, output_fn = sys.argv[1], sys.argv[2]
    plot(output_fn, bin_nums, logratios, title,
         xlabel="bin #",
         ylabel="log2ratios (sample/control)")
def main():
    """Print one ``Job`` per row of the table named by ``sys.argv[1]``.

    The input is opened with ``common.xopen`` and copied to a temporary
    file so that ``pandas.read_table`` can be given a real path. The
    temporary file is removed again before returning.
    """
    import os  # local import: only needed for the temp-file cleanup below

    dep_fn = "deps." + str(randint(1, 1000000))
    if len(sys.argv) != 2:
        main_help('Need input file (use - for stdin).', main_help=True)
        return

    # Dirty hack since I don't know how to make pandas.read_table work
    # off of stdin.
    data = common.xopen(sys.argv[1]).read()
    f = tempfile.NamedTemporaryFile(delete=False)
    try:
        f.write(data)
        f.close()
        # index, pandas series (line); only the first row gets the True flag
        for i, s in pd.read_table(f.name).iterrows():
            print(Job(s, dep_fn, i == 0))
    finally:
        # delete=False means nobody else removes the file; do it here so
        # repeated runs do not litter the temp directory.
        f.close()
        os.unlink(f.name)
def main():
    """Prefix each stdin line with its mapped coordinate from a link file.

    ``sys.argv[1]`` is a tab-separated file whose first three columns are a
    source ``chrm start end`` and whose next three are the target
    ``chrm start end``. Only the end coordinates are linked
    (``chrm_end -> chrm_end``). For every stdin line the key built from its
    first two columns is looked up; the match (or ``-``/``-`` when there is
    none) is printed as two tab-separated columns in front of the line.
    With any other argument count, ``usage`` is printed instead.
    """
    if len(sys.argv) != 2:
        print(usage)
        return

    lo_file = sys.argv[1]
    link = {}
    for record in drdcommon.xopen(lo_file):
        cols = record.rstrip().split("\t")
        chrm_from, start_from, end_from = cols[0:3]  # hsap
        chrm_to, start_to, end_to = cols[3:6]  # rhmac
        # human -> rhmac, keyed on the end coordinate
        link[chrm_from + "_" + end_from] = chrm_to + "_" + end_to

    for record in drdcommon.xopen("-"):
        stripped = record.rstrip()
        key_hsap = "_".join(stripped.split("\t")[0:2])
        rh_coor = link.get(key_hsap, "-_-")
        print(rh_coor.replace("_", "\t") + "\t" + stripped)
def main():
    """Scatter-plot the data from stdin into ``std.dist.png``.

    ``process_data`` supplies the x values and the counts; the counts go
    through ``log_it(counts, 10)`` before plotting.
    """
    if len(sys.argv) != 1:
        drdcommon.error("Wrong number of args. Just need std values in stdin.")
        return

    fd = drdcommon.xopen("-")
    std, counts = process_data(fd)
    title = "std dev freq of var allele ratios"
    log_counts = log_it(counts, 10)
    drdplots.scatter_plot("std.dist.png", std, log_counts,
                          title=title,
                          xlabel="std deviation",
                          ylabel="log10(counts)",
                          dot_size=10)
    fd.close()
def main():
    """Submit the blank-line-separated command groups of the input file.

    ``sys.argv[1]`` is opened with ``common.xopen``. Consecutive non-blank
    lines form a group; at each blank line a ``#--------`` separator is
    printed and the group is passed to ``cmd2submit`` together with the
    value returned for the previous group.
    """
    if len(sys.argv) != 2:
        main_help('Need input file (use - for stdin).', main_help=True)
        return

    lines = []
    prev_dep_file = None
    for line in common.xopen(sys.argv[1]):
        if line in ['\n', '\r\n']:
            print("#--------")
            prev_dep_file = cmd2submit(lines, prev_dep_file)
            lines = []
        else:
            lines.append(line.rstrip())

    # A file that does not end with a blank line still has one pending
    # group; without this it would be dropped silently.
    if lines:
        print("#--------")
        cmd2submit(lines, prev_dep_file)
def main():
    """Print a ``Job`` for every row of the table given as ``sys.argv[1]``.

    ``pandas.read_table`` is fed from a temporary copy of the input (which
    may be stdin via ``common.xopen``). The copy is deleted once the rows
    have been processed, or if processing fails.
    """
    import os  # local import: used only to remove the temporary copy

    dep_fn = "deps." + str(randint(1, 1000000))
    if len(sys.argv) != 2:
        main_help('Need input file (use - for stdin).', main_help=True)
        return

    # Dirty hack since I don't know how to make pandas.read_table work
    # off of stdin.
    data = common.xopen(sys.argv[1]).read()
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp_name = tmp.name
    try:
        tmp.write(data)
        tmp.close()
        table = pd.read_table(tmp_name)
        for i, s in table.iterrows():  # index, pandas series (line)
            print(Job(s, dep_fn, i == 0))
    finally:
        # The file was created with delete=False, so remove it explicitly.
        tmp.close()
        os.unlink(tmp_name)
def main():
    """Plot stdin log ratios per bin; argv carries the title and the output file name."""
    wrong_arg_count = len(sys.argv) != 3
    if wrong_arg_count:
        drdcommon.error("Wrong number of args. <title> <output.filename>")
        return

    stream = drdcommon.xopen("-")
    logratios = process_data(stream)
    n_bins = len(logratios)
    bin_nums = range(1, n_bins + 1)  # bins are numbered from 1
    title = sys.argv[1]
    output_fn = sys.argv[2]
    plot(output_fn, bin_nums, logratios, title,
         xlabel="bin #", ylabel="log2ratios (sample/control)")
def load_predictions(i_file, chrm_col, coor_col, columns):
    """Index selected columns of a tab-separated file by position.

    Args:
        i_file: path handed to ``drdcommon.xopen``; its first line is
            skipped as a header.
        chrm_col: index of the chromosome column.
        coor_col: index of the coordinate column.
        columns: indices of the columns to keep.

    Returns:
        dict: ``canonic_chrm(chrm) + "_" + coor`` mapped to the kept columns
        joined by tabs. Lines are split without stripping, so the last
        column of a line still carries its newline.
    """
    pre = {}
    for line_no, record in enumerate(drdcommon.xopen(i_file)):
        if line_no == 0:
            continue  # header
        fields = record.split("\t")
        key = drdcommon.canonic_chrm(fields[chrm_col]) + "_" + fields[coor_col]
        pre[key] = "\t".join([fields[c] for c in columns])
    return pre
def __load_species_snp_coordinates(self):
    """Load ``self.coor_fn`` into ``self.d_species_coor``.

    Each line holds ``chrm coor``; the result is a nested dict
    ``{chrm: {int(coor): 1}}``. The number of lines read and the current
    memory usage are logged at INFO level afterwards.
    """
    handle = drdcommon.xopen(self.coor_fn)
    coords = {}
    self.d_species_coor = coords
    loaded = 0
    for record in handle:
        loaded += 1
        chrm, coor = record.split()
        coords.setdefault(chrm, {})[int(coor)] = 1
    handle.close()
    logging.info("# of coordinates loaded: %d" % loaded)
    logging.info("current memory usage in %dkb" % drdcommon.memory_usage())
def process_file(wild_file, d, _passes):
    """Feed the data lines of every matching file into ``d``.

    Args:
        wild_file: glob pattern.
        d: object with an ``add(line, passes)`` method.
        _passes: value forwarded unchanged as the second argument of
            ``d.add``.

    Returns:
        ``d``, after all files have been processed.

    Only file names that start with digits followed by a dot are read. The
    first line of each such file is treated as a header: the numeric id is
    written to stderr and the line is not added.

    NOTE(review): the nesting of the read loop under the name check is
    reconstructed from flattened source -- confirm.
    """
    for path in glob.glob(wild_file):
        match = re.search(r"^(\d+)\.", path)
        if not match:
            continue
        _id = int(match.group(1))
        for line_no, record in enumerate(xopen(path)):
            if line_no == 0:
                sys.stderr.write("%s\n" % _id)
            else:
                d.add(record, _passes)
    return d
def main():
    """Submit each blank-line-separated command group of the input file.

    A blank input line closes the current group: an empty line is printed
    and the group goes to ``cmd2submit`` along with the value returned for
    the previous group. Whatever is pending after the last line is
    submitted the same way, unconditionally.
    """
    if len(sys.argv) != 2:
        main_help('Need input file (use - for stdin).', main_help=True)
        return

    pending = []
    prev_dep_file = None
    for record in common.xopen(sys.argv[1]):
        if record not in ['\n', '\r\n']:
            pending.append(record.rstrip())
            continue
        print("")
        prev_dep_file = cmd2submit(pending, prev_dep_file)
        pending = []

    print("")
    cmd2submit(pending, prev_dep_file)
def loadCalls(ds, fn, idx, chrm):
    """Write the rounded call values of one chromosome into ``ds[idx]``.

    Args:
        ds: indexable container; ``ds[idx][position]`` is assigned.
        fn: path of a ``chrm start end cnv`` file, opened with
            ``drdcommon.xopen``.
        idx: row of ``ds`` that receives the values.
        chrm: chromosome name; lines for other chromosomes are ignored.

    Every position ``start..end`` (inclusive) of a matching line is set to
    ``round(float(cnv))``. ``error`` is called when no line matches
    ``chrm``.

    NOTE(review): the fill loop is assumed to sit under the chromosome
    test -- reconstructed from flattened source, confirm.
    """
    log("Loading calls from %s; idx=%s" % (fn, idx))
    chrm_found = False
    nbp = 0
    for l in drdcommon.xopen(fn):
        c, start, end, cnv = l.strip().split()
        if c != chrm:
            continue
        chrm_found = True
        # The value is the same for the whole interval: convert it once
        # instead of once per base.
        value = round(float(cnv))
        for i in range(int(start), int(end) + 1):
            if nbp % 1000000 == 0:
                # progress line, overwritten in place thanks to "\r"
                sys.stderr.write("MEM: %s nbp: %s\r"
                                 % (drdcommon.memory_usage(), nbp))
            ds[idx][i] = value
            nbp += 1
    if not chrm_found:
        error("\nCould not find chrm in file. Bailing out.")
    log("\n%s bp loaded" % nbp)
def loadCalls(ds, fn, idx, chrm):
    """Load the calls for ``chrm`` from ``fn`` into row ``idx`` of ``ds``.

    The file has four whitespace-separated columns, ``chrm start end cnv``.
    For lines on the requested chromosome each position from ``start`` to
    ``end`` inclusive gets ``round(float(cnv))``. Progress is reported on
    stderr every million positions. ``error`` is called if the chromosome
    never appears.

    NOTE(review): the per-base loop is assumed to run only for lines on
    ``chrm`` -- reconstructed from flattened source, confirm.
    """
    log("Loading calls from %s; idx=%s" % (fn, idx))
    chrm_found = False
    nbp = 0
    for l in drdcommon.xopen(fn):
        c, start, end, cnv = l.strip().split()
        if c == chrm:
            chrm_found = True
            first, last = int(start), int(end)
            # Hoisted out of the per-base loop: it does not depend on i.
            rounded_cnv = round(float(cnv))
            for i in range(first, last + 1):
                if nbp % 1000000 == 0:
                    sys.stderr.write("MEM: %s nbp: %s\r"
                                     % (drdcommon.memory_usage(), nbp))
                ds[idx][i] = rounded_cnv
                nbp += 1
    if not chrm_found:
        error("\nCould not find chrm in file. Bailing out.")
    log("\n%s bp loaded" % nbp)
def loadChrm(ds, ref, chrm):
    """Mark the non-N positions of one reference sequence in ``ds[0]``.

    Args:
        ds: indexable container; ``ds[0][i]`` is set to 1 for every
            character that is not ``N``/``n``.
        ref: path of the reference file, opened with ``drdcommon.xopen``.
        chrm: sequence name expected on the first ``>`` header line.

    NOTE(review): the nesting below is reconstructed from flattened source;
    in particular the progress report and the counter increment are assumed
    to run once per character -- confirm.
    """
    if not os.path.exists(ref):
        # NOTE(review): ref is passed as a second argument rather than
        # formatted into the message -- confirm error() interpolates it.
        error("Cannot find reference file: %s", ref)
    log("Reading reference genome chrm: %s" % chrm)
    # i is the 1-based position of the next character to be read.
    i = 1
    for l in drdcommon.xopen(ref):
        l = l.strip()
        # Skip the header only while nothing has been read yet and only if
        # it names the requested sequence.
        if i == 1 and l[0] == '>' and l[1:] == chrm:
            continue
        # Any header seen after the first character ends the sequence.
        if i > 1 and l[0] == '>':
            break
        for bp in l:
            if bp.upper() != 'N':
                ds[0][i] = 1
            if i % 10000000 == 0:
                sys.stderr.write("MEM: %s nbp: %s\r" % (drdcommon.memory_usage(), i))
            i += 1
    # i started at 1, so the value logged is one more than the number of
    # characters consumed.
    log("\n%s bp read." % i)
def parse_args():
    """Parse the command line and open the input file.

    Returns:
        The argparse namespace. ``input_fn`` is replaced by the handle
        returned by ``xopen``; ``output_fn`` is the given path, or
        ``sys.stdout`` when ``-o`` is omitted; ``resolution`` is an int or
        ``None``; ``threshold`` is a float.
    """
    parser = argparse.ArgumentParser(description='cnv pipeline')
    parser.add_argument('-i', '--input_fn', metavar='input_fn', required=True,
                        dest='input_fn', action='store',
                        help='input data file')
    parser.add_argument('-o', '--output_fn', metavar='output_fn',
                        required=False, dest='output_fn', action='store',
                        help='output data file')
    # dest previously held the help sentence, which made the parsed value
    # reachable only through getattr() with that whole sentence as name.
    parser.add_argument('-r', '--resolution', metavar='resolution',
                        required=False, dest='resolution', action='store',
                        type=int,
                        help='change resolution to this value')
    parser.add_argument('-t', '--threshold', metavar='threshold',
                        required=True, dest='threshold', action='store',
                        type=float,
                        help='read depth threshold for calling an event')

    args = parser.parse_args()
    args.input_fn = xopen(args.input_fn)
    if not args.output_fn:
        args.output_fn = sys.stdout
    return args
def method1(input1, input2):
    """Summarise the per-base values of ``input1`` over the windows of ``input2``.

            4      3            5             1
    i1  |------|-------|------------------|--------|
              7                10             1
    i2  |----------|-------------------------|-----|
            3.5               4               1
    out |----------|-------------------------|-----|

    Args:
        input1: iterable of ``chrm start stop val`` lines; ``val`` is
            stored at every position ``start..stop`` inclusive. The last
            line decides the working chromosome.
        input2: path opened with ``drd.xopen``; same four columns.

    Prints ``chrm<TAB>start<TAB>stop<TAB>int(median)`` for every window of
    ``input2`` that lies on the working chromosome.

    Raises:
        IndexError: if an ``input1`` window reaches past the end of the
            per-base array.
    """
    a = np.zeros(shape=(three_hundre_mil))
    size = a.shape[0]

    # Save the metric values for all the bp from the first input.
    working_chrm = ""
    for i in input1:
        working_chrm, start, stop, val = i.split()
        lo, hi = int(start), int(stop)
        if lo <= hi and hi >= size:
            # A slice would be clipped silently; fail as element-wise
            # indexing did.
            raise IndexError("window %s-%s is outside the %s-position array"
                             % (lo, hi, size))
        # Vectorised fill replaces the per-base Python loop.
        a[lo:hi + 1] = float(val)

    # Iterate over the second input and generate new windows for the first.
    for i in drd.xopen(input2):
        chrm, start, stop, val = i.split()
        if working_chrm != chrm:
            continue
        s, e = int(start), int(stop)
        median = int(np.median(a[s:e + 1]))
        print("%s\t%s\t%s\t%s" % (chrm, start, stop, median))
if num_ns > _max: return True return False if len(sys.argv) < 4 or not drdcommon.data_in_stdin(): sys.stderr.write("cat ref.fa | tool <n_events> <chrm> <chrm_size>" + "\n") sys.exit(1) _ = sys.argv n_events, chrm, chrm_size = int(_[1]), _[2], int(_[3]) # Store N locations sys.stderr.write("Loading N locations ..." + "\n") ref = BitMask() i = 0 for l in drdcommon.xopen("-"): if l[0] != '-': for c in l.rstrip(): if c.upper() == 'N': ref.set(i) i += 1 # Generate events sys.stderr.write("Generating events ..." + "\n") coor = next_coor(100) i = 0 while (i < n_events): # chrm start end 0..n s = next_size() if coor + s < chrm_size: if i % 2 == 0: # deletion
#!/usr/bin/env python
import sys
from drdcommon import xopen

# Minimum number of samples with a positive value for a row to go to stdout.
min_num_samples = int(sys.argv[1])

first_line = True
for l in xopen("-"):
    if first_line:
        # header line
        first_line = False
        continue
    # chrm start end gene exon_number transcript_number 32510 ..
    fields = l.strip().split("\t")
    num = sum(1 for value in fields[6:] if int(value) > 0)
    out = "\t".join(fields[0:6]) + "\t" + str(num) + "\n"
    # Rows meeting the minimum go to stdout, the rest to stderr.
    sink = sys.stdout if num >= min_num_samples else sys.stderr
    sink.write(out)
s2 |------|-----------------------------|-----------| out |------|----|--------------|---------|-----------| 4,5 2,5 10,2 4,2 4,6 """ log("Creating arrays for first stream") a_s1_vals, a_s1_wins = compute_vals_wins(stream1, working_chrm) log("Creating arrays for second stream") a_s2_vals, a_s2_wins = compute_vals_wins(stream2, working_chrm) log("Finding coordinate locations") o_wins = a_s1_wins | a_s2_wins log("Iterating over %s windows" % len(o_wins)) _first, _prev = True, None for coor in np.where(o_wins == 1)[0]: if _first: _prev = coor _first = False else: s, e = _prev, coor print "%s\t%s\t%s\t%s\t%s" % (working_chrm, s, e, cv(a_s1_vals, s, e), cv(a_s2_vals, s, e)) _prev = coor if __name__ == "__main__": # method1(drd.xopen(sys.argv[1]), drd.xopen(sys.argv[2])) if len(sys.argv) != 4: sys.stderr.write("Usage: tool <bed_file1> <bed_file2> <chromosome>\n") exit(1) method2(drd.xopen(sys.argv[1]), drd.xopen(sys.argv[2]), sys.argv[3])
from pandas import Series

# Expect exactly two arguments: the targets bed file and the per-base
# coverage bed file.
if len(sys.argv) != 3:
    sys.stderr.write("tool <target/exons bed file> <bed base coverage>" + "\n")
    sys.exit(1)

# Short aliases for the two output streams.
out = sys.stdout.write
err = sys.stderr.write
_ = sys.argv
fn_targets, fn_base_cov = _[1], _[2]
err("Loading exons/targets\n")
# depth[chrm][position] starts at 0 for every position inside a target.
depth = {}
n = 0
for t in drdcommon.xopen(fn_targets):
    l = t.strip()
    sl = [c for c in l.split()] # whitespace-separated fields of the line
    chrm, start, end = sl[0:3]
    if chrm not in depth:
        depth[chrm] = {}
    # NOTE(review): start is still a string here while the keys stored
    # below are ints, so this test looks like it can never be true --
    # confirm before relying on it.
    if start in depth[chrm]:
        raise(Exception('Two exons starting in same location! bailing out: ' + l))
    for i in range(int(start), int(end)+1):
        depth[chrm][i] = 0
        # NOTE(review): assumed to count positions rather than targets; the
        # nesting is reconstructed from flattened source -- confirm.
        n += 1
err("%s\n" % n)
err("Reading read depth bed\n")
total = n
hits = 0
o = "" o += "chrm start end gene exon_number transcript_number " for i in ids: o += ("%s " % i) print re.sub("\s", self.sep, o) for k, means in self.d.items(): o = k + " " for _id in ids: if _id not in means: means[_id] = 0 o += "%s " % means[_id] print re.sub("\s", self.sep, o) d = Data() ids = [] for f in glob.glob("*.pass.gz"): match = re.search("^(\d+)\.", f) if match: _id = int(match.group(1)) first_line = True for l in xopen(f): if first_line: sys.stderr.write("%s\n" % _id) ids.append(_id) first_line = False continue else: d.add(l, _id) d.dump(ids)
#!/usr/bin/env python
#
# Given the enumeration from enumerate.sh as input, report, for each
# location, how many samples fall in each filtering category.
#
import sys
import drdcommon

# d[location][category] -> number of samples; categories are min/max/pass.
d = {}
# _a[sample_id][location] -> True once that sample was counted there.
_a = {}
for l in drdcommon.xopen("-"):
    _id, chrm, start, end, _type = l.rstrip().split("\t")
    if _id not in _a:
        # First time this sample is seen: report it on stderr.
        sys.stderr.write(_id + "\n")
        _a[_id] = {}
    k = "%s_%s_%s" % (chrm, start, end)
    # NOTE(review): the nesting is reconstructed from flattened source; each
    # (sample, location) pair is assumed to be counted once -- confirm.
    if k not in _a[_id]:
        _a[_id][k] = True
        if k not in d:
            d[k] = {}
            for _t in ["min", "max", "pass"]:
                d[k][_t] = 0
        d[k][_type] += 1

# str.replace works on literal text, so the old "\s" pattern never matched
# and the header came out space-separated; replace the spaces themselves.
print("chrm start stop min max pass".replace(" ", "\t"))
def load_data(sid, fn, h):
    """Store two columns of every line of ``fn`` under sample ``sid`` in ``h``.

    Args:
        sid: sample id used as the innermost key.
        fn: path of a six-column whitespace-separated file, opened with
            ``drdcommon.xopen``.
        h: nested mapping; ``h[chrm][int(start)][sid]`` is assigned, so the
            intermediate levels must already exist or be created on access.

    The stored value is ``(int(column 5), float(column 6))`` (1-based
    columns). A leading ``chr``/``Chr``/``chrm``/``Chrm`` is removed from
    the chromosome name.

    NOTE(review): the original column comment read
    ``chrm start end n_reads n_reads_ref log2ratio`` while the integer kept
    is the fifth column -- confirm which read count is intended.
    """
    for record in drdcommon.xopen(fn):
        raw_chrm, start, _end, _fourth, fifth, ratio = record.split()
        chrm = re.sub(r'(^[cC]hrm?)', '', raw_chrm)
        h[chrm][int(start)][sid] = (int(fifth), float(ratio))
# Load data for all genes, all samples.
# data[key]["coor"]    -> [chrm, start, end, gene name]
# data[key]["samples"] -> {sample id: True} for every sample listing the gene
data = {}
ids = []
for f in drdcommon.files_in_dir(".", file_pattern):
    # extract sample id
    match = re_id.search(f)
    if not match:
        raise (Exception("Problems extracting id for: " + f))
    _id = match.group(1)
    err("Working on id: %s\n" % (_id))
    ids.append(_id)

    first_line = True
    for l in drdcommon.xopen(f):
        if first_line:
            # header line
            first_line = False
            continue
        chrm, start, end, g_name = l.strip().split()
        start, end = int(start), int(end)
        k = "%s %s %s" % (chrm, start, end)
        if k not in data:
            data[k] = {"coor": [chrm, start, end, g_name], "samples": {}}
        # This sample (_id) passes the filters for that gene
        data[k]["samples"][_id] = True
import re def help(msg): sys.stderr.write("ERROR: " + msg + "\n") sys.stderr.write("Usage: cat gtf.txt | tool list_genes_names.txt > genes.coor.bed\n") sys.exit(1) # Main if not drdcommon.data_in_stdin(): help("Need data in stdin") if len(sys.argv) != 2: help("Invalid list of arguments") gene_names = {} for l in drdcommon.xopen(sys.argv[1]): name = l.split()[0] gene_names[name] = True drdcommon.log("%s genes loaded." % len(gene_names)) for l in drdcommon.xopen("-"): s = l.split("\t") if s[2] == "CDS": chrm, start, end, _list = s[0], s[3], s[4], s[8] g_name, e_name, t_name = None, None, None for e in _list.split(";"): _ = e.split() if len(_) == 2 and _[0] == "transcript_name": t_name = re.sub('\"', '', _[1]) if len(_) == 2 and _[0] == "gene_name":