def spa(args):
    """
    %prog spa spafiles

    Convert chromosome ordering from SPA to simple lists. First column is the
    reference order.
    """
    from jcvi.algorithms.graph import merge_paths
    from jcvi.utils.cbook import uniqify

    p = OptionParser(spa.__doc__)
    p.add_option(
        "--unmapped",
        default=False,
        action="store_true",
        help="Include unmapped scaffolds in the list [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    spafiles = args
    paths = []      # per-file ordered list of reference chromosomes
    mappings = []   # per-file ordered list of mapped scaffolds
    missings = []   # per-file list of scaffolds flagged "unmapped"
    for spafile in spafiles:
        path = []
        mapping = []
        missing = []
        # BUGFIX: use a context manager so the file handle is closed
        # (original leaked one open handle per SPA file)
        with open(spafile) as fp:
            for row in fp:
                # skip comments and blank lines
                if row[0] == '#' or not row.strip():
                    continue
                atoms = row.rstrip().split('\t')
                if len(atoms) == 2:
                    # two-column rows record scaffolds with no placement
                    a, c2 = atoms
                    assert a == "unmapped"
                    missing.append(c2)
                    continue
                c1, c2, orientation = atoms
                path.append(c1)
                mapping.append(c2)
        paths.append(uniqify(path))
        mappings.append(mapping)
        missings.append(missing)

    # Merge per-file chromosome orders into a single consensus order
    ref = merge_paths(paths)
    # BUGFIX: print statements converted to the print() function
    # (Python 2-only syntax was a SyntaxError under Python 3)
    print("ref", len(ref), ",".join(ref))
    for spafile, mapping, missing in zip(spafiles, mappings, missings):
        # drop "random" scaffolds, then deduplicate preserving order
        mapping = [x for x in mapping if "random" not in x]
        mapping = uniqify(mapping)
        if len(mapping) < 50 and opts.unmapped:
            # short lists optionally padded with the unmapped scaffolds
            mapping = uniqify(mapping + missing)
        print(spafile, len(mapping), ",".join(mapping))
def spa(args):
    """
    %prog spa spafiles

    Convert chromosome ordering from SPA to simple lists. First column is the
    reference order.
    """
    from jcvi.algorithms.graph import merge_paths
    from jcvi.utils.cbook import uniqify

    p = OptionParser(spa.__doc__)
    p.add_option("--unmapped", default=False, action="store_true",
                 help="Include unmapped scaffolds in the list [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    spafiles = args
    paths = []      # per-file ordered list of reference chromosomes
    mappings = []   # per-file ordered list of mapped scaffolds
    missings = []   # per-file list of scaffolds flagged "unmapped"
    for spafile in spafiles:
        path = []
        mapping = []
        missing = []
        # BUGFIX: use a context manager so the file handle is closed
        # (original leaked one open handle per SPA file)
        with open(spafile) as fp:
            for row in fp:
                # skip comments and blank lines
                if row[0] == '#' or not row.strip():
                    continue
                atoms = row.rstrip().split('\t')
                if len(atoms) == 2:
                    # two-column rows record scaffolds with no placement
                    a, c2 = atoms
                    assert a == "unmapped"
                    missing.append(c2)
                    continue
                c1, c2, orientation = atoms
                path.append(c1)
                mapping.append(c2)
        paths.append(uniqify(path))
        mappings.append(mapping)
        missings.append(missing)

    # Merge per-file chromosome orders into a single consensus order
    ref = merge_paths(paths)
    # BUGFIX: print statements converted to the print() function
    # (Python 2-only syntax was a SyntaxError under Python 3)
    print("ref", len(ref), ",".join(ref))
    for spafile, mapping, missing in zip(spafiles, mappings, missings):
        # drop "random" scaffolds, then deduplicate preserving order
        mapping = [x for x in mapping if "random" not in x]
        mapping = uniqify(mapping)
        if len(mapping) < 50 and opts.unmapped:
            # short lists optionally padded with the unmapped scaffolds
            mapping = uniqify(mapping + missing)
        print(spafile, len(mapping), ",".join(mapping))
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples = args[0]
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")

    # Remember the launch directory so the samples path survives the chdir.
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # A .vcf/.vcf.gz argument is a single input; anything else is a
    # manifest listing one vcf path per line.
    vcffiles = (
        [samples]
        if samples.endswith((".vcf", ".vcf.gz"))
        else [x.strip() for x in must_open(samples)]
    )

    if not op.exists(stridsfile):
        # Collect STR locus ids from every requested lobSTR db, dedupe,
        # and cache them to STR.ids for subsequent runs.
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

    # Fan the per-vcf compile jobs out over a worker pool, never spawning
    # more processes than there are jobs.
    run_args = [(vcf, filtered, cleanup, store) for vcf in vcffiles]
    nworkers = min(opts.cpus, len(run_args))
    pool = Pool(processes=nworkers)
    for _ in pool.map_async(run_compile, run_args).get():
        continue
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    # samples is a manifest file with one vcf path per line
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect STR locus ids from every requested lobSTR db and dedupe
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        # BUGFIX: `print >> fw` (Python 2-only syntax) replaced with the
        # print() function; files wrapped in `with` so handles always close
        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

        # Generate two alleles per locus (diploid): <id>.1 and <id>.2
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        with open("header.ids", "w") as fw:
            print(",".join(dipuids), file=fw)

    # Fan per-vcf jobs out over a worker pool
    p = Pool(processes=opts.cpus)
    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    for res in p.map_async(run, run_args).get():
        continue
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option("--nofilter", default=False, action="store_true",
                 help="Do not filter the variants")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")

    # Remember the launch directory so the samples path survives the chdir
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # samples is a manifest file with one vcf path per line
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect STR locus ids from every requested lobSTR db and dedupe
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        # BUGFIX: `print >> fw` (Python 2-only syntax) replaced with the
        # print() function; file wrapped in `with` so the handle always closes
        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

    # Fan per-vcf jobs out over a worker pool, capped at the number of jobs
    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for res in p.map_async(run_compile, run_args).get():
        continue
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")

    # Remember the launch directory so the samples path survives the chdir
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # samples is a manifest file with one vcf path per line
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect STR locus ids from every requested lobSTR db and dedupe
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        # BUGFIX: `print >> fw` (Python 2-only syntax) replaced with the
        # print() function; file wrapped in `with` so the handle always closes
        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

    # Fan per-vcf jobs out over a worker pool
    p = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    for res in p.map_async(run, run_args).get():
        continue