def main(args):
    # Construct pandas table from all VCF records
    rows = []
    regions = []
    for record in vcf.Reader(filename=args.vcf):
        call = record.genotype("HG002")
        rows.append((
            record.ID or variant_descriptor(record),
            record.INFO["SVTYPE"],
            abs(record.INFO["SVLEN"][0]),
            record.INFO["MatchGT"] == "TRUE",
        ))

    table = pd.DataFrame.from_records(
        rows,
        columns=["ID", "TYPE", "SIZE", "MatchGT"],
    )

    # Potential offset (based on PBSV calls)
    if args.offset_file:
        offset_table = pd.read_table(args.offset_file)
        merge_table = table.merge(offset_table, on="ID")

        offset = merge_table[["StartDistance", "EndDistance"]].abs().max(axis=1)
        offset_intervals = pd.IntervalIndex.from_breaks(
            [0, 1, 2, 5, 10, 20, 50, np.iinfo(np.int32).max], closed="left"
        )
        offset_bins = pd.cut(offset, offset_intervals)

        offset_counts = merge_table.groupby(offset_bins).agg(
            {"MatchGT": "value_counts"}
        ).groupby(level=0)
        totals = offset_counts.sum().rename(columns={"MatchGT": "Total"})

        idx = pd.IndexSlice
        concordance = offset_counts.transform(lambda x: x / x.sum())
        concordance = (
            concordance.loc[idx[:, True], :]
            .reset_index(level=1, drop=True)
            .rename(columns={"MatchGT": "Concordance"})
        )

        totals.join(concordance).to_csv(sys.stdout, index_label="Offset")
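# Illustrative sketch (not part of the original script): the value_counts/transform idiom above
# converts per-bin True/False counts into per-bin concordance fractions. Shown here on toy data
# with hypothetical bin labels.
def _concordance_by_bin_example():
    toy = pd.DataFrame({
        "Offset": ["[0, 1)", "[0, 1)", "[1, 2)", "[1, 2)", "[1, 2)"],
        "MatchGT": [True, False, True, True, False],
    })
    counts = toy.groupby("Offset").agg({"MatchGT": "value_counts"})
    fractions = counts.groupby(level=0).transform(lambda x: x / x.sum())
    # Keep only the MatchGT == True rows, i.e. the per-bin concordance
    idx = pd.IndexSlice
    return fractions.loc[idx[:, True], :].reset_index(level=1, drop=True)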
def plot_features(
    args, sim_path: str, real_path: str, vcf_path: str, out_dir_path: str
):
    """Generate pairwise plot of simulated and 'real' features

    Args:
        args (argparse.Namespace): Additional command line arguments
        sim_path (str): Path to NPSV features from 'simulated' data
        real_path (str): Path to NPSV features from 'real' data
        vcf_path (str): Path to input VCF file
        out_dir_path (str): Directory for plot files
    """
    # Create output directory if it doesn't exist
    os.makedirs(out_dir_path, exist_ok=True)
    logging.info("Generating plots in %s", out_dir_path)

    # Group the data to prepare for querying variants
    sim_data = pd.read_table(sim_path, na_values=".", dtype={"#CHROM": str, "AC": int})
    add_derived_features(sim_data)
    sim_data = sim_data.groupby(VARIANT_COL)

    real_data = pd.read_table(real_path, na_values=".", dtype={"#CHROM": str})
    add_derived_features(real_data)
    real_data = real_data.groupby(VARIANT_COL)

    # Depending on feature extractor, not all features may be available
    available_features = set(sim_data.obj) & set(real_data.obj)
    features = [feature for feature in FEATURE_COL if feature in available_features]

    vcf_reader = vcf.Reader(filename=vcf_path)
    for record in vcf_reader:
        variant = (
            record.CHROM,
            int(record.POS),
            int(record.sv_end),
            record.var_subtype,
        )
        try:
            current_sim = sim_data.get_group(variant)
            current_real = real_data.get_group(variant)
        except KeyError:
            # No data available for this variant, so skip it
            logging.debug(
                "No simulated or real data found for %s. Skipping.",
                variant_descriptor(record),
            )
            continue

        current_real["AC"] = [-1]

        # Remove outliers with Z score above threshold
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            current_sim = (
                current_sim.groupby("AC")
                .apply(filter_by_zscore, features, 5)
                .reset_index(drop=True)
            )

        plot_data = current_sim.append(current_real)

        # Don't yet know how to encode AC directly (need strings for plotting)
        plot_data["AC"] = pd.Categorical(
            plot_data["AC"], categories=[0, 1, 2, -1]
        ).rename_categories(["REF", "HET", "HOM", "Act"])

        colors = sns.mpl_palette("Set1", 3) + [(0, 0, 0)]  # Actual data is black
        markers = {"REF": "o", "HET": "o", "HOM": "o", "Act": "s"}

        fig, ((ax11, ax12, ax13, ax14), (ax21, ax22, ax23, ax24)) = plt.subplots(
            2, 4, figsize=(14, 8)
        )

        sns.scatterplot(
            ax=ax11, x="REF_READ", y="ALT_READ", data=plot_data,
            hue="AC", style="AC", markers=markers, palette=colors,
        )
        _set_axis_limits(ax11)

        sns.scatterplot(
            ax=ax12, x="REF_WEIGHTED_SPAN", y="ALT_WEIGHTED_SPAN", data=plot_data,
            hue="AC", style="AC", markers=markers, palette=colors,
        )
        _set_axis_limits(ax12)

        sns.scatterplot(
            ax=ax13, x="INSERT_LOWER", y="INSERT_UPPER", data=plot_data,
            hue="AC", style="AC", markers=markers, palette=colors,
        )

        plot_hist(ax=ax14, col="CLIP_PRIMARY", data=plot_data, colors=colors)
        plot_hist(ax=ax21, col="COVERAGE", data=plot_data, colors=colors)
        plot_hist(ax=ax22, col="DHFC", data=plot_data, colors=colors)
        plot_hist(ax=ax23, col="DHBFC", data=plot_data, colors=colors)
        plot_hist(ax=ax24, col="DHFFC", data=plot_data, colors=colors)

        # Make plots square
        for ax in fig.get_axes():
            ax.set_aspect(1.0 / ax.get_data_ratio(), adjustable="box")

        fig.suptitle("{}:{}-{}".format(*variant), size=16)
        fig.subplots_adjust(top=0.95, wspace=0.3, hspace=0.3)

        # Save plot to file name based on variant descriptor
        description = variant_descriptor(record)
        logging.info("Plotting variant into %s.pdf", description)
        plt.savefig(os.path.join(out_dir_path, f"{description}.pdf"))
        plt.close(fig)  # Release the figure before plotting the next variant
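# Hedged sketch (assumption): filter_by_zscore is defined elsewhere in the package. Something
# along these lines, dropping rows whose per-feature Z score exceeds the threshold, would match
# how it is called above; the actual implementation may differ.
def _filter_by_zscore_sketch(group: pd.DataFrame, features, threshold: float) -> pd.DataFrame:
    feature_data = group[features]
    z_scores = ((feature_data - feature_data.mean()) / feature_data.std()).abs()
    # Keep rows where every feature is within the Z-score threshold
    return group[(z_scores < threshold).all(axis=1)]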
def main():
    parser = make_argument_parser()
    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel)

    # Create any directories that are needed
    logging.info(
        f"Creating {args.output} output and {args.tempdir} temporary directories if they don't exist"
    )
    os.makedirs(args.output, exist_ok=True)
    os.makedirs(args.tempdir, exist_ok=True)

    # Initialize parallel computing setup
    ray.init(num_cpus=args.threads, _temp_dir=args.tempdir, include_dashboard=False)

    # TODO: If library is not specified, compute statistics, i.e. mean insert size, etc.
    if args.stats_path is not None:
        logging.info("Extracting BAM stats from NPSV stats file")
        sample = Sample.from_npsv(args.stats_path, bam_path=args.bam, ped_path=args.ped_path)
    elif None not in (
        args.fragment_mean,
        args.fragment_sd,
        args.read_length,
        args.depth,
    ):
        logging.info("Using Normal distribution for BAM stats")
        sample = Sample.from_distribution(
            args.bam,
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            mean_coverage=args.depth,
        )
    else:
        parser.error(
            "Library information needed. Either provide distribution parameters or run `npsvg preprocess` to generate stats file."
        )

    # Select directory for variant files
    if args.keep_synth_bams:
        variant_dir = args.output
    else:
        variant_dir = args.tempdir

    # For each variant generate synthetic bam file(s) and extract relevant evidence
    observed_variants = {}
    record_results = []
    vcf_reader = vcf.Reader(filename=args.input)
    for i, record in enumerate(tqdm(vcf_reader, desc="Preparing variants")):
        variant = Variant.from_pyvcf(record, args.reference)
        # npsv currently only supports deletions
        if variant is None:
            continue

        # NPSV currently does not support variants with duplicate start and end coordinates
        description = variant_descriptor(record)
        if observed_variants.setdefault(description, i) != i:
            logging.warning("Skipping variant with duplicate description %s", description)
            continue

        # Construct single-variant VCF outside of worker so we don't need to pass the reader into the thread
        variant_vcf_path = os.path.join(variant_dir, description + ".vcf")
        if not args.reuse or not os.path.exists(variant_vcf_path + ".gz"):
            variant_vcf_path = write_record_to_indexed_vcf(record, vcf_reader, variant_vcf_path)
        else:
            # Variant file already exists, no need to recreate
            variant_vcf_path += ".gz"

        record_results.append(
            simulate_and_extract.remote(args, sample, variant, variant_vcf_path, description)
        )

    # Concatenate output files to create feature files
    sim_tsv_path = os.path.join(args.output, args.prefix + ".sim.tsv")
    real_tsv_path = os.path.join(args.output, args.prefix + ".real.tsv")
    logging.info("Extracting features (to %s and %s)", sim_tsv_path, real_tsv_path)

    with open(sim_tsv_path, "w") as file:
        Features.header(out_file=file, ac=True)
    with open(real_tsv_path, "w") as file:
        Features.header(out_file=file, ac=False)

    with open(sim_tsv_path, "ab") as sim_sink, open(real_tsv_path, "ab") as real_sink:
        for sim_result, real_result in tqdm(
            ray_iterator(record_results),
            total=len(record_results),
            desc="Extracting features",
        ):
            with open(sim_result, "rb") as source:
                shutil.copyfileobj(source, sim_sink)
            sim_sink.flush()
            with open(real_result, "rb") as source:
                shutil.copyfileobj(source, real_sink)
            real_sink.flush()

    # Perform genotyping
    with open(os.path.join(args.output, args.prefix + ".npsv.vcf"), "w") as gt_vcf_file:
        logging.info("Determining genotypes (output in %s)", gt_vcf_file.name)
        genotyping_args = argparse.Namespace(**vars(args))
        genotype_vcf(
            genotyping_args,
            args.input,
            sim_tsv_path,
            real_tsv_path,
            gt_vcf_file,
            samples=[sample.name],
        )
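# Hedged sketch (assumption): ray_iterator is a helper defined elsewhere that yields task results
# as they finish rather than in submission order. A minimal version using ray.wait might look like
# this; the packaged helper may differ.
def _ray_iterator_sketch(object_refs):
    pending = list(object_refs)
    while pending:
        # Wait for the next completed task and yield its result
        done, pending = ray.wait(pending, num_returns=1)
        yield ray.get(done[0])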
def main(args):
    mother_sample, father_sample, proband_sample = args.trio.split(",")

    rows = []
    for record in vcf.Reader(filename=args.vcf):
        proband = record.genotype(proband_sample)
        mother = record.genotype(mother_sample)
        father = record.genotype(father_sample)

        genotypes = (proband.gt_type, mother.gt_type, father.gt_type)
        error_type = MER_GENOTYPES.get(genotypes, "Other")

        try:
            gqs = [
                call.data.GQ
                for call in (proband, mother, father)
                if "GQ" in call.data._fields
            ]
            if len(gqs) == 0:
                # Fall back to approximating GQ from the genotype likelihoods
                gqs = [
                    np.partition(call.data.PL, 1)[1]
                    for call in (proband, mother, father)
                    if "PL" in call.data._fields
                ]
            if len(gqs) == 0:
                gqs = [
                    call.data.SQ
                    for call in (proband, mother, father)
                    if "SQ" in call.data._fields
                ]
            min_gq = np.min(gqs)
        except Exception:
            min_gq = None

        rows.append((
            record.ID or variant_descriptor(record),
            error_type,
            min_gq,
        ))

    table = pd.DataFrame.from_records(
        rows,
        columns=["ID", "TYPE", "MIN_GQ"],
    )
    table["TYPE"] = table["TYPE"].astype("category")

    if args.command == "counts":
        type_counts = table.groupby("TYPE").size().to_frame("Count")
        type_counts["Fraction"] = type_counts["Count"] / type_counts["Count"].sum()
        type_counts.to_csv(sys.stdout)
    elif args.command == "gq":
        gq_bins = pd.cut(table.MIN_GQ, [0, 10, 20, 30, 40, 100], right=False)
        gq_counts = table.groupby(gq_bins).size().to_frame("Count")
        gq_counts["Fraction"] = gq_counts["Count"] / gq_counts["Count"].sum()
        gq_counts.to_csv(sys.stdout, index_label="GQ")
    elif args.command == "list":
        list_table = table.sort_values(by=["MIN_GQ"], ascending=False).reset_index(drop=True)
        list_table["GIAB"] = False
        list_table.loc[
            (list_table["ID"] == "HG2_Ill_SpiralSDKrefine_6835")
            | (list_table["ID"] == "HG2_PB_SVrefine2PB10Xhap12_10613"),
            "GIAB",
        ] = True
        list_table.to_csv(sys.stdout, index=False)
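# Hedged sketch (assumption): MER_GENOTYPES is defined elsewhere in this script. Based on how it
# is used above, it presumably maps (proband, mother, father) gt_type tuples (PyVCF encoding:
# 0=hom-ref, 1=het, 2=hom-alt) to a Mendelian-error category label, along the lines of the
# hypothetical entries below; the actual categories and contents may differ.
# MER_GENOTYPES = {
#     (2, 0, 0): "MendelianError",  # hom-alt proband from two hom-ref parents
#     (0, 2, 2): "MendelianError",  # hom-ref proband from two hom-alt parents
#     (0, 0, 0): "Concordant",
# }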
def main(args):
    # Construct pandas table from all VCF records
    rows = []
    regions = []
    for record in vcf.Reader(filename=args.vcf):
        call = record.genotype(args.sample)
        if "GQ" in call.data._fields:
            gq = call.data.GQ
        elif "PL" in call.data._fields and None not in call.data.PL:
            gq = np.partition(call.data.PL, 1)[1]
        elif "SQ" in call.data._fields:
            gq = call.data.SQ
        else:
            gq = None

        svlen = record.INFO["SVLEN"]
        if isinstance(svlen, collections.Sequence):
            svlen = svlen[0]

        rows.append((
            record.ID or variant_descriptor(record),
            record.INFO["SVTYPE"],
            abs(svlen),
            record.INFO["MatchGT"] == "TRUE",
            gq,
            record.INFO.get("TRall") == "TRUE",
            record.INFO.get("TRgt100") == "TRUE",
            record.INFO.get("TRgt10k") == "TRUE",
            record.INFO.get("segdup") == "TRUE",
            sum(abs(x) for x in record.INFO.get("CIPOS", [0, 0])),
            sum(abs(x) for x in record.INFO.get("CIEND", [0, 0])),
        ))

    table = pd.DataFrame.from_records(
        rows,
        columns=[
            "ID", "TYPE", "SVLEN", "MatchGT", "GQ",
            "TRall", "TRgt100", "TRgt10k", "SegDup", "CIPOS", "CIEND",
        ],
    )

    if args.command == "gq":
        gq_bins = pd.cut(table.GQ, [0, 10, 20, 30, 40, 100], right=False)
        gq_table = (
            table.groupby(gq_bins, observed=True)
            .agg({"MatchGT": "value_counts"})
            .groupby(level=0)
            .transform(lambda x: x / x.sum())
        )
        idx = pd.IndexSlice
        gq_table.loc[idx[:, True], :].reset_index(level=1, drop=True).rename(
            columns={"MatchGT": "Concordance"}
        ).to_csv(sys.stdout, index_label="GQ")
    elif args.command == "vntr":
        enrichment = (
            table.groupby("MatchGT")
            .agg({
                "TRall": "value_counts",
                "TRgt100": "value_counts",
                "TRgt10k": "value_counts",
                "SegDup": "value_counts",
            }, normalize=True)
            .dropna(axis=1, how="any")
            .groupby(level=0)
            .transform(lambda x: x / x.sum())
        )
        idx = pd.IndexSlice
        enrichment.loc[idx[:, True], :].reset_index(level=1, drop=True).to_csv(
            sys.stdout, index_label=["Concordant"]
        )
    elif args.command == "len":
        len_bins = pd.cut(
            table.SVLEN, [50, 100, 300, 1000, np.iinfo(np.int32).max], right=False
        )
        len_table = (
            table.groupby(len_bins)
            .agg({"MatchGT": "value_counts"})
            .groupby(level=0)
            .transform(lambda x: x / x.sum())
        )
        idx = pd.IndexSlice
        len_table.loc[idx[:, True], :].reset_index(level=1, drop=True).rename(
            columns={"MatchGT": "Concordance"}
        ).to_csv(sys.stdout)
    elif args.command == "list":
        table.to_csv(sys.stdout, index=False)
    elif args.command == "vntr-conc":
        table["SVLEN_RANGE"] = pd.cut(
            table.SVLEN, [50, 100, 300, 1000, np.iinfo(np.int32).max], right=False
        )
        vntr_table = table.groupby(["SVLEN_RANGE", "TRgt100"]).agg({"MatchGT": "value_counts"})
        vntr_table["Count"] = vntr_table.groupby(level=[0, 1]).transform(lambda x: x.sum())
        vntr_table["Concordance"] = vntr_table["MatchGT"] / vntr_table["Count"]
        idx = pd.IndexSlice
        vntr_table.loc[idx[:, :, True], :].reset_index(level=2, drop=True).drop(
            columns="MatchGT"
        ).to_csv(sys.stdout)
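# Illustrative note (assumption about the intent of the PL fallback above): when no GQ field is
# present, the second-smallest Phred-scaled genotype likelihood is used as a stand-in, since GQ is
# conventionally the difference between the second-best and best PL, and the best PL is 0 after
# normalization. A minimal sketch of that fallback:
def _gq_from_pl_example(pl):
    # e.g. pl = [0, 30, 200] -> 30
    return np.partition(pl, 1)[1]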