def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    lhome = opts.lobstr_home
    store = opts.output_path

    # A single VCF is taken as-is; anything else is a file listing VCF paths.
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [row.strip() for row in must_open(samples)]

    # Skip files that have already been through the filtering step.
    vcffiles = [vcf for vcf in vcffiles if ".filtered." not in vcf]

    # Only pass the store along for s3-hosted inputs (False otherwise).
    run_args = [
        (vcf, lhome, vcf.startswith("s3://") and store) for vcf in vcffiles
    ]
    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    for _ in pool.map_async(run_filter, run_args).get():
        continue
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.
    """
    p = OptionParser(filterdata.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)

    # Loci to drop vs. loci to keep; together they must partition all loci.
    with open(remove) as fp:
        remove = [x.strip() for x in fp]
    removes = set(remove)
    with open(final) as fp:
        final = [x.strip() for x in fp]
    assert len(loci) == len(remove) + len(final)

    # Build per-locus percentile lookup tables from the allele frequencies.
    percentiles = {}
    with open(af) as fp:
        for row in fp:
            sname, counts = row.split()
            countsd = af_to_counts(counts)
            percentile = counts_to_percentile(countsd)
            percentiles[sname] = percentile

    # One task per retained locus: (column index, column data, percentiles).
    run_args = []
    for i, sname in enumerate(loci):
        if sname in removes:
            continue
        a = m[:, i]
        percentile = percentiles[sname]
        run_args.append((i, a, percentile))

    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    res = []
    for r in p.map_async(convert_to_percentile, run_args).get():
        res.append(r)
    res.sort()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv("final.mask.tsv", m, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # BUGFIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # documented, behavior-identical replacement.
    m = df.values
    m[m < 0] = -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")

    # Work inside workdir; keep the starting directory so the samples path
    # still resolves after the chdir.
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # A single VCF is taken as-is; anything else is a file listing VCF paths.
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [row.strip() for row in must_open(samples)]

    # Collect the union of STR ids across all requested databases (once).
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print("\n".join(uids), file=fw)
        fw.close()

    run_args = [(vcf, filtered, cleanup, store) for vcf in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    for _ in pool.map_async(run_compile, run_args).get():
        continue
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    # Collect the union of STR ids across all requested databases (once).
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # BUGFIX: Python 2 `print >> fw, ...` is a SyntaxError under Python 3;
        # use the print() function with the file= keyword instead.
        print("\n".join(uids), file=fw)
        fw.close()

        # Generate two alleles
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        fw = open("header.ids", "w")
        print(",".join(dipuids), file=fw)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    # run(run_args[0])
    for res in p.map_async(run, run_args).get():
        continue
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")

    # Work inside workdir; keep the starting directory so the samples path
    # still resolves after the chdir.
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # A single VCF is taken as-is; anything else is a file listing VCF paths.
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]

    # Collect the union of STR ids across all requested databases (once).
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # BUGFIX: Python 2 `print >> fw, ...` is a SyntaxError under Python 3;
        # use the print() function with the file= keyword instead.
        print("\n".join(uids), file=fw)
        fw.close()

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for res in p.map_async(run_compile, run_args).get():
        continue
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")

    # Work inside workdir; keep the starting directory so the samples path
    # still resolves after the chdir.
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    # Collect the union of STR ids across all requested databases (once).
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # BUGFIX: Python 2 `print >> fw, ...` is a SyntaxError under Python 3;
        # use the print() function with the file= keyword instead.
        print("\n".join(uids), file=fw)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    for res in p.map_async(run, run_args).get():
        continue
def compare(args):
    """
    %prog compare NA12878_array_hg38.bed *.seg

    Compare cnv output to known ground truths.
    """
    p = OptionParser(compare.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # First argument is the truth set; every remaining one is a cnv output.
    truths = args[0]
    cnvoutputs = args[1:]
    ncpus = min(len(cnvoutputs), opts.cpus)
    pool = Pool(processes=ncpus)

    # map_async's callback receives the complete result list exactly once.
    collected = []
    jobs = [(output, truths) for output in cnvoutputs]
    handle = pool.map_async(compare_worker, jobs, callback=collected.append)
    handle.wait()
    for batch in collected:
        print("\n".join(batch))
def compare(args):
    """
    %prog compare NA12878_array_hg38.bed *.seg

    Compare cnv output to known ground truths.
    """
    p = OptionParser(compare.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # First argument is the truth set; every remaining one is a cnv output.
    truths = args[0]
    cnvoutputs = args[1:]
    cpus = min(len(cnvoutputs), opts.cpus)
    p = Pool(processes=cpus)
    results = []
    files = [(x, truths) for x in cnvoutputs]
    # map_async's callback receives the complete result list exactly once.
    r = p.map_async(compare_worker, files, callback=results.append)
    r.wait()
    for res in results:
        # BUGFIX: Python 2 `print expr` statement is a SyntaxError under
        # Python 3; use the print() function.
        print("\n".join(res))