def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    from jcvi.apps.base import datafile

    p = OptionParser(treds.__doc__)
    p.add_option(
        "--csv", default=False, action="store_true", help="Also write `meta.csv`"
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (tredresults,) = args
    df = pd.read_csv(tredresults, sep="\t")

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    ids = list(tf["id"])
    tags = ["SampleKey"]
    final_columns = ["SampleKey"]
    afs = []
    for td, id in zip(tds, ids):
        tag1 = "{}.1".format(td)
        tag2 = "{}.2".format(td)
        if tag2 not in df:
            afs.append("{}")  # Placeholder for loci absent from the results
            continue
        tags.append(tag2)
        final_columns.append(id)
        a = np.array(list(df[tag1]) + list(df[tag2]))
        counts = alleles_to_counts(a)
        af = counts_to_af(counts)
        afs.append(af)

    tf["allele_frequency"] = afs

    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    tf.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))
    if opts.csv:
        metacsvfile = metafile.rsplit(".", 1)[0] + ".csv"
        tf.to_csv(metacsvfile, index=False)
        logging.debug("File `{}` written.".format(metacsvfile))

    pp = df[tags]
    pp.columns = final_columns
    datatsv = "TREDs_{}_SEARCH.data.tsv".format(timestamp())  # avoid shadowing datafile()
    pp.to_csv(datatsv, sep="\t", index=False)
    logging.debug("File `{}` written.".format(datatsv))

    mask([datatsv, metafile])

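# alleles_to_counts() and counts_to_af() are imported from elsewhere in the
# module. A minimal sketch of the contract treds() appears to rely on, under
# the assumption that the counts object is a Counter and that the frequency
# string uses the same brace-wrapped form as the "{}" placeholder appended
# above; the *_sketch names are hypothetical, not jcvi API:
from collections import Counter

import numpy as np


def alleles_to_counts_sketch(a):
    # Tally observed alleles, dropping missing calls encoded as -1
    a = np.asarray(a)
    return Counter(a[a >= 0].tolist())


def counts_to_af_sketch(counts):
    # Serialize counts as "{allele:count,...}", matching the "{}" placeholder
    return "{" + ",".join(
        "{}:{}".format(k, v) for k, v in sorted(counts.items())
    ) + "}"
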
def stop(args):
    """
    %prog stop

    Stop EC2 instance: image the current instance, deregister the previous
    image and its snapshot, then terminate the instance.
    """
    p = OptionParser(stop.__doc__)
    p.add_option("--profile", default="mvrad-datasci-role", help="Profile name")
    opts, args = p.parse_args(args)

    if len(args) != 0:
        sys.exit(not p.print_help())

    role(["htang"])
    session = boto3.Session(profile_name=opts.profile)
    client = session.client("ec2")
    s = InstanceSkeleton()

    # Make sure the instance id is NOT empty
    instance_id = s.instance_id
    if instance_id == "":
        logging.error("Cannot find instance_id {}".format(instance_id))
        sys.exit(1)

    # Exclude the data volumes from the new image
    block_device_mappings = []
    for volume in s.volumes:
        block_device_mappings.append({"DeviceName": volume["Device"], "NoDevice": ""})

    new_image_name = "htang-dev-{}-{}".format(timestamp(), int(time.time()))
    response = client.create_image(
        InstanceId=instance_id,
        Name=new_image_name,
        BlockDeviceMappings=block_device_mappings,
    )
    print(response, file=sys.stderr)
    new_image_id = response["ImageId"]

    # Poll until the new image becomes available
    image_status = ""
    while image_status != "available":
        logging.debug("Waiting for image to be ready")
        time.sleep(10)
        response = client.describe_images(ImageIds=[new_image_id])
        image_status = response["Images"][0]["State"]

    # Delete old image, snapshot and shut down instance
    old_image_id = s.image_id
    response = client.describe_images(ImageIds=[old_image_id])
    old_snapshot_id = response["Images"][0]["BlockDeviceMappings"][0]["Ebs"][
        "SnapshotId"
    ]
    response = client.deregister_image(ImageId=old_image_id)
    print(response, file=sys.stderr)
    response = client.delete_snapshot(SnapshotId=old_snapshot_id)
    print(response, file=sys.stderr)
    response = client.terminate_instances(InstanceIds=[instance_id])
    print(response, file=sys.stderr)

    # Save new image id
    s.save_image_id(new_image_id)
    s.save_instance_id("", "")

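# stop() polls describe_images by hand; boto3 also ships a built-in EC2 waiter
# ("image_available") that performs the same poll-until-ready loop. A drop-in
# sketch (wait_for_image is a hypothetical helper name):
def wait_for_image(client, image_id):
    # Blocks until the AMI state becomes "available", polling every 10 seconds
    waiter = client.get_waiter("image_available")
    waiter.wait(ImageIds=[image_id], WaiterConfig={"Delay": 10, "MaxAttempts": 60})
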
def data(args):
    """
    %prog data data.bin samples.ids STR.ids meta.tsv

    Make data.tsv based on meta.tsv.
    """
    p = OptionParser(data.__doc__)
    p.add_option(
        "--notsv", default=False, action="store_true", help="Do not write data.tsv"
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    databin, sampleids, strids, metafile = args
    final_columns, percentiles = read_meta(metafile)
    df, m, samples, loci = read_binfile(databin, sampleids, strids)

    # Clean the data
    m %= 1000  # Get the larger of the two alleles
    m[m == 999] = -1  # Missing data

    # Columns absent from the meta file are dropped
    final = set(final_columns)
    remove = [locus for locus in loci if locus not in final]

    pf = "STRs_{}_SEARCH".format(timestamp())
    filteredstrids = "{}.STR.ids".format(pf)
    fw = open(filteredstrids, "w")
    print("\n".join(final_columns), file=fw)
    fw.close()
    logging.debug(
        "Dropped {} columns; Retained {} columns (`{}`)".format(
            len(remove), len(final_columns), filteredstrids
        )
    )

    # Remove low-quality columns!
    df.drop(remove, inplace=True, axis=1)
    df.columns = final_columns

    filtered_bin = "{}.data.bin".format(pf)
    if need_update(databin, filtered_bin):
        m = df.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
        m.tofile(filtered_bin)
        logging.debug("Filtered binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    filtered_tsv = "{}.data.tsv".format(pf)
    if not opts.notsv and need_update(databin, filtered_tsv):
        df.to_csv(filtered_tsv, sep="\t", index_label="SampleKey")

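# read_binfile() is defined elsewhere in this module. A minimal sketch of the
# layout data() appears to assume, namely a flat len(samples) x len(loci)
# integer matrix in which each cell packs an allele pair (hence `m %= 1000` to
# keep the larger allele, with 999 meaning missing). The dtype and packing are
# assumptions, and read_binfile_sketch is a hypothetical name:
def read_binfile_sketch(databin, sampleids, strids, dtype=np.int32):
    samples = [x.strip() for x in open(sampleids)]
    loci = [x.strip() for x in open(strids)]
    # One row per sample, one column per locus
    m = np.fromfile(databin, dtype=dtype).reshape(len(samples), len(loci))
    df = pd.DataFrame(m, index=samples, columns=loci)
    return df, m, samples, loci
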
def get_vcfstanza(fastafile, fasta, sampleid="SAMP_001"):
    from jcvi.formats.base import timestamp

    # VCF spec
    m = "##fileformat=VCFv4.1\n"
    m += "##fileDate={0}\n".format(timestamp())
    m += "##source={0}\n".format(__file__)
    m += "##reference=file://{0}\n".format(op.abspath(fastafile).strip("/"))
    m += '##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional genotype">\n'
    m += '##INFO=<ID=IM,Number=0,Type=Flag,Description="Imputed genotype">\n'
    m += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
    m += '##FORMAT=<ID=GP,Number=3,Type=Float,Description="Estimated Genotype Probability">\n'
    header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT".split() + [sampleid]
    m += "#" + "\t".join(header)
    return m

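# Usage sketch for get_vcfstanza(), with hypothetical file and sample names;
# only the header is exercised here, so the fasta handle is left as None:
def write_minimal_vcf_sketch():
    stanza = get_vcfstanza("hg38.fa", None, sampleid="SAMP_001")
    record = "\t".join(
        ["chr1", "10000", "STR_1", "CAG", ".", ".", ".", "PR", "GT", "1/1"]
    )
    with open("SAMP_001.vcf", "w") as fw:
        print(stanza, file=fw)
        print(record, file=fw)
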
def mask(args):
    """
    %prog mask data.bin samples.ids STR.ids meta.tsv
    OR
    %prog mask data.tsv meta.tsv

    Compute P-values based on meta and data. The `data.bin` should be the
    matrix containing filtered loci, and the output mask.tsv will have the
    same dimension.
    """
    p = OptionParser(mask.__doc__)
    opts, args = p.parse_args(args)

    if len(args) not in (2, 4):
        sys.exit(not p.print_help())

    if len(args) == 4:
        databin, sampleids, strids, metafile = args
        df, m, samples, loci = read_binfile(databin, sampleids, strids)
        mode = "STRs"
    elif len(args) == 2:
        databin, metafile = args
        df = pd.read_csv(databin, sep="\t", index_col=0)
        m = df.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
        samples = df.index
        loci = list(df.columns)
        mode = "TREDs"

    pf = "{}_{}_SEARCH".format(mode, timestamp())

    final_columns, percentiles = read_meta(metafile)
    maskfile = pf + ".mask.tsv"
    run_args = []
    for i, locus in enumerate(loci):
        a = m[:, i]
        percentile = percentiles[locus]
        run_args.append((i, a, percentile))

    if mode == "TREDs" or need_update(databin, maskfile):
        cpus = min(8, len(run_args))
        write_mask(cpus, samples, final_columns, run_args, filename=maskfile)
        logging.debug("File `{}` written.".format(maskfile))

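# write_mask() and its per-locus worker live elsewhere in the module. A rough
# sketch of one plausible worker, assuming each `percentile` entry is an
# allele-size cutoff and calls at or beyond it get flagged; the actual P-value
# computation in jcvi may differ, and mask_locus_sketch is a hypothetical name:
def mask_locus_sketch(i, a, percentile):
    a = np.asarray(a)
    # Flag observed alleles (>= 0) that reach the percentile cutoff
    flags = ((a >= percentile) & (a >= 0)).astype(int)
    return i, flags
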
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option(
        "--cutoff",
        default=0.5,
        type="float",
        help="Percent observed required (chrY half cutoff)",
    )
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        # Prefer gene symbols over ENST transcript ids
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    TREDS, df = read_treds()

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))

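# counts_filter() is imported from elsewhere; a sketch that mirrors the three
# docstring criteria above, assuming counts maps allele -> occurrences with -1
# for missing calls. The return strings and the counts_filter_sketch name are
# hypothetical:
def counts_filter_sketch(counts, nalleles, seqid, cutoff=0.5):
    observed = sum(v for k, v in counts.items() if k >= 0)
    # chrY is hemizygous, so only half the observation rate is required
    required = cutoff / 2 if seqid == "chrY" else cutoff
    if observed < required * nalleles:
        return "MISSING"
    if len([k for k in counts if k >= 0]) < 2:
        return "INVARIANT"
    return "PASS"
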