Example #1
def main(args):
    # Construct pandas table from all VCF records
    rows = []
    for record in vcf.Reader(filename=args.vcf):
        call = record.genotype("HG002")
        rows.append((
            record.ID or variant_descriptor(record),
            record.INFO["SVTYPE"],
            abs(record.INFO["SVLEN"][0]),
            record.INFO["MatchGT"] == "TRUE",
        ))

    table = pd.DataFrame.from_records(
        rows, 
        columns=["ID","TYPE","SIZE","MatchGT"],
    )
    
    # Potential offset (based on PBSV calls)
    if args.offset_file:
        offset_table = pd.read_table(args.offset_file)
        merge_table = table.merge(offset_table, on="ID")

        offset = merge_table[["StartDistance", "EndDistance"]].abs().max(axis=1)
        
        # Bin breakpoint offsets, then count concordant/discordant genotypes within each bin
        offset_intervals = pd.IntervalIndex.from_breaks(
            [0, 1, 2, 5, 10, 20, 50, np.iinfo(np.int32).max], closed="left"
        )
        offset_bins = pd.cut(offset, offset_intervals)
        offset_counts = merge_table.groupby(offset_bins).agg({"MatchGT": "value_counts"}).groupby(level=0)

        totals = offset_counts.sum().rename(columns={"MatchGT": "Total"})

        # Convert counts to within-bin fractions and keep the MatchGT == True rows as the concordance
        idx = pd.IndexSlice
        concordance = offset_counts.transform(lambda x: x / x.sum())
        concordance = (
            concordance.loc[idx[:, True], :]
            .reset_index(level=1, drop=True)
            .rename(columns={"MatchGT": "Concordance"})
        )

        totals.join(concordance).to_csv(sys.stdout, index_label="Offset")
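
A self-contained sketch of the value_counts / groupby(level=0) / transform idiom used above, run on made-up toy data (not from the source), showing how it yields per-bin genotype concordance:

import pandas as pd

# Toy stand-in for merge_table: two offset bins with a mix of concordant calls
toy = pd.DataFrame({
    "Offset": ["[0, 1)", "[0, 1)", "[0, 1)", "[1, 2)", "[1, 2)"],
    "MatchGT": [True, True, False, True, False],
})

# Count concordant/discordant calls within each bin...
counts = toy.groupby("Offset")["MatchGT"].value_counts().to_frame("Count")
# ...normalize within each bin, then keep only the MatchGT == True rows
fractions = counts.groupby(level=0).transform(lambda x: x / x.sum())
idx = pd.IndexSlice
print(fractions.loc[idx[:, True], :])  # ~0.67 for [0, 1), 0.5 for [1, 2)
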
Example #2
def plot_features(
    args, sim_path: str, real_path: str, vcf_path: str, out_dir_path: str
):
    """Generate pairwise plot of simulated and 'real' features
    
    Args:
        args (argparse.Namespace): Additional command line arguments
        sim_path (str): Path to NPSV features from 'simulated' data
        real_path (str): Path to NPSV features from 'real' data
        vcf_path (str): Path to input VCF file
        out_dir_path (str): Directory for plot files
    """
    # Create output directory if it doesn't exist
    os.makedirs(out_dir_path, exist_ok=True)
    logging.info("Generating plots in %s", out_dir_path)

    # Group the data to prepare for querying variants
    sim_data = pd.read_table(sim_path, na_values=".", dtype={"#CHROM": str, "AC": int})
    add_derived_features(sim_data)
    sim_data = sim_data.groupby(VARIANT_COL)

    real_data = pd.read_table(real_path, na_values=".", dtype={"#CHROM": str})
    add_derived_features(real_data)
    real_data = real_data.groupby(VARIANT_COL)

    # Depending on feature extractor, not all features may be available
    available_features = set(sim_data.obj) & set(real_data.obj)
    features = [feature for feature in FEATURE_COL if feature in available_features]

    vcf_reader = vcf.Reader(filename=vcf_path)
    for record in vcf_reader:
        variant = (
            record.CHROM,
            int(record.POS),
            int(record.sv_end),
            record.var_subtype,
        )

        try:
            current_sim = sim_data.get_group(variant)
            current_real = real_data.get_group(variant)
        except KeyError:
            # No data available for this variant, skipping
            logging.debug(
                "No simulated or real data found for %s. Skipping.",
                variant_descriptor(record),
            )
            continue
        current_real["AC"] = [-1]

        # Remove outliers with Z score above threshold
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            current_sim = (
                current_sim.groupby("AC")
                .apply(filter_by_zscore, features, 5)
                .reset_index(drop=True)
            )

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent operation
        plot_data = pd.concat([current_sim, current_real])
        # Don't yet know how to encode AC directly (need strings for plotting)
        plot_data["AC"] = pd.Categorical(
            plot_data["AC"], categories=[0, 1, 2, -1]
        ).rename_categories(["REF", "HET", "HOM", "Act"])

        colors = sns.mpl_palette("Set1", 3) + [(0, 0, 0)]  # Actual data is black
        markers = { "REF": "o", "HET": "o", "HOM": "o", "Act": "s"}
        
        fig, ((ax11, ax12, ax13, ax14), (ax21, ax22, ax23, ax24)) = plt.subplots(
            2, 4, figsize=(14, 8)
        )

        # Shared keyword arguments for the genotype-colored scatter plots
        scatter_kwargs = dict(data=plot_data, hue="AC", style="AC", markers=markers, palette=colors)
        sns.scatterplot(ax=ax11, x="REF_READ", y="ALT_READ", **scatter_kwargs)
        _set_axis_limits(ax11)
        sns.scatterplot(ax=ax12, x="REF_WEIGHTED_SPAN", y="ALT_WEIGHTED_SPAN", **scatter_kwargs)
        _set_axis_limits(ax12)
        sns.scatterplot(ax=ax13, x="INSERT_LOWER", y="INSERT_UPPER", **scatter_kwargs)
        plot_hist(ax=ax14, col="CLIP_PRIMARY", data=plot_data, colors=colors)
        
        plot_hist(ax=ax21, col="COVERAGE", data=plot_data, colors=colors)
        plot_hist(ax=ax22, col="DHFC", data=plot_data, colors=colors)
        plot_hist(ax=ax23, col="DHBFC", data=plot_data, colors=colors)
        plot_hist(ax=ax24, col="DHFFC", data=plot_data, colors=colors)

        # Make plots square
        for ax in fig.get_axes():
            ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')

        fig.suptitle("{}:{}-{}".format(*variant), size=16)
        fig.subplots_adjust(top=0.95, wspace=0.3, hspace=0.3)

        # Save plot to file name based on variant descriptor
        description = variant_descriptor(record)
        logging.info("Plotting variant into %s.pdf", description)
        plt.savefig(os.path.join(out_dir_path, f"{description}.pdf"))
        plt.close(fig)  # Release the figure so memory doesn't accumulate across variants
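
filter_by_zscore is used above but not shown; a minimal sketch of what such a helper typically does (an assumption for illustration, not the project's actual implementation) is to drop rows whose z-score exceeds the threshold in any feature:

import pandas as pd

def filter_by_zscore(group: pd.DataFrame, features, threshold: float) -> pd.DataFrame:
    """Illustrative only: drop rows more than `threshold` standard deviations from the group mean."""
    values = group[features].astype(float)
    z = (values - values.mean()) / values.std(ddof=0)
    # Treat NaN z-scores (e.g. zero-variance features) as within range
    keep = (z.abs() <= threshold) | z.isna()
    return group[keep.all(axis=1)]
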
Example #3
def main():
    parser = make_argument_parser()
    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel)

    # Create any directories that are needed
    logging.info(
        f"Creating {args.output} output and {args.tempdir} temporary directories if they don't exist"
    )
    os.makedirs(args.output, exist_ok=True)
    os.makedirs(args.tempdir, exist_ok=True)

    # Initialize parallel computing setup
    ray.init(num_cpus=args.threads,
             _temp_dir=args.tempdir,
             include_dashboard=False)

    # TODO: If library stats are not provided, compute them from the BAM, e.g. mean insert size, etc.
    if args.stats_path is not None:
        logging.info("Extracting BAM stats from NPSV stats file")
        sample = Sample.from_npsv(args.stats_path,
                                  bam_path=args.bam,
                                  ped_path=args.ped_path)
    elif None not in (
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            args.depth,
    ):
        logging.info("Using Normal distribution for BAM stats")
        sample = Sample.from_distribution(
            args.bam,
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            mean_coverage=args.depth,
        )
    else:
        parser.error(
            "Library information needed. Either provide distribution parameters or run `npsvg preprocess` to generate a stats file."
        )

    # Select directory for variant files
    if args.keep_synth_bams:
        variant_dir = args.output
    else:
        variant_dir = args.tempdir

    # For each variant generate synthetic bam file(s) and extract relevant evidence
    observed_variants = {}
    record_results = []
    vcf_reader = vcf.Reader(filename=args.input)
    for i, record in enumerate(tqdm(vcf_reader, desc="Preparing variants")):
        variant = Variant.from_pyvcf(record, args.reference)
        # npsv currently only supports deletions
        if variant is None:
            continue

        # NPSV currently does not support variants with duplicate start and end coordinates
        description = variant_descriptor(record)
        if observed_variants.setdefault(description, i) != i:
            logging.warning("Skipping variant with duplicate description %s",
                            description)
            continue

        # Construct the single-variant VCF outside the worker so the reader doesn't have to be passed to the remote task
        variant_vcf_path = os.path.join(variant_dir, description + ".vcf")
        if not args.reuse or not os.path.exists(variant_vcf_path + ".gz"):
            variant_vcf_path = write_record_to_indexed_vcf(
                record, vcf_reader, variant_vcf_path)
        else:
            # Variant file already exists, no need to recreate
            variant_vcf_path += ".gz"

        record_results.append(
            simulate_and_extract.remote(args, sample, variant,
                                        variant_vcf_path, description))

    # Concatenate output files to create feature files
    sim_tsv_path = os.path.join(args.output, args.prefix + ".sim.tsv")
    real_tsv_path = os.path.join(args.output, args.prefix + ".real.tsv")
    logging.info("Extracting features (to %s and %s)", sim_tsv_path,
                 real_tsv_path)

    with open(sim_tsv_path, "w") as file:
        Features.header(out_file=file, ac=True)
    with open(real_tsv_path, "w") as file:
        Features.header(out_file=file, ac=False)

    with open(sim_tsv_path, "ab") as sim_sink, open(real_tsv_path,
                                                    "ab") as real_sink:
        for sim_result, real_result in tqdm(
                ray_iterator(record_results),
                total=len(record_results),
                desc="Extracting features",
        ):
            with open(sim_result, "rb") as source:
                shutil.copyfileobj(source, sim_sink)
            sim_sink.flush()
            with open(real_result, "rb") as source:
                shutil.copyfileobj(source, real_sink)
            real_sink.flush()

    # Perform genotyping
    with open(os.path.join(args.output, args.prefix + ".npsv.vcf"),
              "w") as gt_vcf_file:
        logging.info("Determining genotypes (output in %s)", gt_vcf_file.name)
        genotyping_args = argparse.Namespace(**vars(args))
        genotype_vcf(
            genotyping_args,
            args.input,
            sim_tsv_path,
            real_tsv_path,
            gt_vcf_file,
            samples=[sample.name],
        )
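
ray_iterator is referenced but not defined in this excerpt; a common implementation of that pattern (a sketch under that assumption, not necessarily the project's code) yields results in completion order using ray.wait:

import ray

def ray_iterator(object_refs):
    """Illustrative sketch: yield remote task results as they complete."""
    remaining = list(object_refs)
    while remaining:
        done, remaining = ray.wait(remaining, num_returns=1)
        yield ray.get(done[0])
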
Example #4
def main(args):
    mother_sample, father_sample, proband_sample = args.trio.split(",")

    rows = []
    for record in vcf.Reader(filename=args.vcf):
        proband = record.genotype(proband_sample)
        mother = record.genotype(mother_sample)
        father = record.genotype(father_sample)

        genotypes = (proband.gt_type, mother.gt_type, father.gt_type)
        error_type = MER_GENOTYPES.get(genotypes, "Other")

        try:
            gqs = [
                call.data.GQ for call in (proband, mother, father)
                if "GQ" in call.data._fields
            ]
            if len(gqs) == 0:
                gqs = [
                    np.partition(call.data.PL, 1)[1]
                    for call in (proband, mother, father)
                    if "PL" in call.data._fields
                ]
            if len(gqs) == 0:
                gqs = [
                    call.data.SQ for call in (proband, mother, father)
                    if "SQ" in call.data._fields
                ]
            min_gq = np.min(gqs)
        except Exception:
            # No usable quality values (GQ/PL/SQ) for this trio
            min_gq = None

        rows.append((
            record.ID or variant_descriptor(record),
            error_type,
            min_gq,
        ))

    table = pd.DataFrame.from_records(
        rows,
        columns=["ID", "TYPE", "MIN_GQ"],
    )
    table["TYPE"] = table["TYPE"].astype("category")

    if args.command == "counts":
        type_counts = table.groupby("TYPE").size().to_frame("Count")
        type_counts["Fraction"] = type_counts["Count"] / type_counts["Count"].sum()
        type_counts.to_csv(sys.stdout)
    elif args.command == "gq":
        gq_bins = pd.cut(table.MIN_GQ, [0, 10, 20, 30, 40, 100], right=False)
        gq_counts = table.groupby(gq_bins).size().to_frame("Count")
        gq_counts["Fraction"] = gq_counts["Count"] / gq_counts["Count"].sum()
        gq_counts.to_csv(sys.stdout, index_label="GQ")
    elif args.command == "list":
        list_table = table.sort_values(by=["MIN_GQ"], ascending=False).reset_index(drop=True)
        # Flag the two GIAB call IDs singled out in this analysis
        giab_ids = ("HG2_Ill_SpiralSDKrefine_6835", "HG2_PB_SVrefine2PB10Xhap12_10613")
        list_table["GIAB"] = list_table["ID"].isin(giab_ids)
        list_table.to_csv(sys.stdout, index=False)
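
The PL fallback above relies on PL values being normalized so the most likely genotype has PL 0; the second-smallest PL is then the Phred-scaled gap to the next-best genotype, i.e. a GQ-like quantity. A small check of that idea with made-up PL values:

import numpy as np

pl = np.array([60, 0, 23])        # Phred-scaled genotype likelihoods; best genotype has PL == 0
gq_like = np.partition(pl, 1)[1]  # second-smallest value, without a full sort
assert gq_like == 23              # distance from the best to the next-best genotype
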
Example #5
def main(args):
    # Construct pandas table from all VCF records
    rows = []
    for record in vcf.Reader(filename=args.vcf):
        call = record.genotype(args.sample)
        if "GQ" in call.data._fields:
            gq = call.data.GQ
        elif "PL" in call.data._fields and None not in call.data.PL:
            gq = np.partition(call.data.PL, 1)[1]
        elif "SQ" in call.data._fields:
            gq = call.data.SQ
        else:
            gq = None

        # SVLEN may be reported as a scalar or a per-allele list depending on the caller
        svlen = record.INFO["SVLEN"]
        if isinstance(svlen, collections.abc.Sequence):  # collections.Sequence was removed in Python 3.10
            svlen = svlen[0]

        rows.append((
            record.ID or variant_descriptor(record),
            record.INFO["SVTYPE"],
            abs(svlen),
            record.INFO["MatchGT"] == "TRUE",
            gq,
            record.INFO.get("TRall") == "TRUE",
            record.INFO.get("TRgt100") == "TRUE",
            record.INFO.get("TRgt10k") == "TRUE",
            record.INFO.get("segdup") == "TRUE",
            sum(map(abs, record.INFO.get("CIPOS", [0, 0]))),
            sum(map(abs, record.INFO.get("CIEND", [0, 0]))),
        ))

    table = pd.DataFrame.from_records(
        rows,
        columns=["ID", "TYPE", "SVLEN", "MatchGT", "GQ",
                 "TRall", "TRgt100", "TRgt10k", "SegDup", "CIPOS", "CIEND"],
    )

    if args.command == "gq":
        gq_bins = pd.cut(table.GQ, [0, 10, 20, 30, 40, 100], right=False)
        gq_table = table.groupby(gq_bins, observed=True).agg({"MatchGT": "value_counts"}).groupby(
            level=0).transform(lambda x: x / x.sum())

        idx = pd.IndexSlice
        gq_table.loc[idx[:, True], :].reset_index(level=1, drop=True).rename(
            columns={"MatchGT": "Concordance"}).to_csv(sys.stdout, index_label="GQ")
    elif args.command == "vntr":    
        enrichment = table.groupby("MatchGT").agg({
            "TRall": "value_counts",
            "TRgt100": "value_counts",
            "TRgt10k": "value_counts",
            "SegDup": "value_counts",
        }, normalize=True).dropna(axis=1, how="any").groupby(level=0).transform(lambda x: x/x.sum())
        
        idx = pd.IndexSlice
        enrichment.loc[idx[:, True], :].reset_index(level=1, drop=True).to_csv(
            sys.stdout, index_label=["Concordant"])
    elif args.command == "len":
        len_bins = pd.cut(
            table.SVLEN, [50, 100, 300, 1000, np.iinfo(np.int32).max], right=False)
        len_table = table.groupby(len_bins).agg({"MatchGT": "value_counts"}).groupby(
            level=0).transform(lambda x: x / x.sum())
        idx = pd.IndexSlice
        len_table.loc[idx[:, True], :].reset_index(level=1, drop=True).rename(
            columns={"MatchGT": "Concordance"}).to_csv(sys.stdout)
    elif args.command == "list":
        table.to_csv(sys.stdout, index=False)
    elif args.command == "vntr-conc":
        table["SVLEN_RANGE"] = pd.cut(table.SVLEN, [50, 100, 300, 1000, np.iinfo(np.int32).max], right=False)
        vntr_table = table.groupby(["SVLEN_RANGE","TRgt100"]).agg({"MatchGT": "value_counts"})
        
        vntr_table["Count"] = vntr_table.groupby(level=[0, 1]).transform(lambda x: x.sum())
        vntr_table["Concordance"] = vntr_table["MatchGT"] / vntr_table["Count"]

        idx = pd.IndexSlice
        vntr_table.loc[idx[:, :, True], :].reset_index(level=2, drop=True).drop(columns="MatchGT").to_csv(sys.stdout)
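
This script is driven by args.vcf, args.sample, and args.command; a minimal argument parser consistent with how those attributes are used above (an assumed sketch, not the original CLI) might look like:

import argparse

def make_parser():
    parser = argparse.ArgumentParser(description="Summarize SV genotype concordance from an annotated VCF")
    parser.add_argument("--vcf", required=True, help="VCF annotated with MatchGT, TRall/TRgt100/TRgt10k, segdup INFO fields")
    parser.add_argument("--sample", required=True, help="Sample whose genotype quality is reported")
    parser.add_argument("command", choices=["gq", "vntr", "len", "list", "vntr-conc"], help="Report to generate")
    return parser

# Example (hypothetical file name): main(make_parser().parse_args(["--vcf", "calls.vcf", "--sample", "HG002", "gq"]))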