Exemple #1
0
    def het_check(self,
                  vcf_path,
                  plot=False,
                  ncpus=1,
                  min_depth=8,
                  sites=op.join(op.dirname(__file__), 'GRCH37.sites'),
                  **kwargs):
        """
        Build a per-sample table of heterozygosity, depth and call-rate
        metrics for the pedigree samples found in a VCF.

        kwargs is not used, but added here to allow same args as ped_check

        Parameters
        ----------
        vcf_path : path of the VCF, handed to cyvcf2.VCF and cyvcf2.par_het.
        plot : False for no figures, otherwise the file name the
            depth-vs-het-ratio scatter plot is saved to. The PCA figure name
            is derived from it ('het_'/'het-' become 'pca_'/'pca-'; if
            neither occurs, '.pca' is inserted before the extension).
        ncpus : worker count passed to cyvcf2.par_het; capped at 16.
        min_depth : forwarded to cyvcf2.par_het as ``min_depth``.
        sites : sites file passed to cyvcf2.par_het and to pca().

        Returns
        -------
        (df, background_pca_df) : ``df`` is a pandas DataFrame indexed by
            sample_id holding the per-sample values from cyvcf2.par_het
            (with 'range' renamed to 'idr_baf'), plus 'depth_outlier',
            'call_rate' and, when pca() returns a frame, its columns.
            ``background_pca_df`` is returned from pca() unchanged.

        Raises
        ------
        Exception : if no sample of the VCF is present in the ped.
        """
        import cyvcf2
        import numpy as np
        import pandas as pd
        from .pca import pca

        ncpus = min(ncpus, 16)

        # `sites` is rebound to par_het's output below; keep the path for pca.
        sitesfile = sites
        samps = [x.sample_id for x in self.samples()]
        vcf = cyvcf2.VCF(vcf_path, gts012=True, samples=samps)
        vcf_samples, ped_samples = set(vcf.samples), set(samps)
        if sorted(vcf.samples) != sorted(samps):
            log.warning(
                "sample overlap issues\n\tin vcf, not in ped: %s\n\tin ped, not in vcf: %s"
                % (",".join(vcf_samples - ped_samples),
                   ",".join(ped_samples - vcf_samples)))
        if vcf_samples.isdisjoint(ped_samples):
            raise Exception("error: no samples from VCF found in ped")

        # From here on use the VCF's sample list and ordering.
        samps = vcf.samples
        sample_ranges, sites, gt_types = cyvcf2.par_het(vcf_path,
                                                        samps,
                                                        ncpus,
                                                        sites,
                                                        min_depth=min_depth)

        # Fraction of entries per row that are not 3.
        # NOTE(review): 3 is presumably the "unknown genotype" code and rows
        # are presumably samples in `samps` order -- confirm against par_het.
        call_rate = (gt_types != 3).mean(axis=1)

        if plot:
            pca_plot = plot.replace('het_', 'pca_').replace('het-', 'pca-')
            if pca_plot == plot:
                # No 'het' marker to swap, so insert '.pca' before the
                # extension. splitext also copes with names that have no
                # extension or that only have dots in a directory part.
                stem, ext = op.splitext(plot)
                pca_plot = "%s.pca%s" % (stem, ext)
        else:
            pca_plot = False
        pca_df, background_pca_df = pca(pca_plot, sitesfile, gt_types, sites)

        # now find outliers.
        depth = np.array([v['median_depth'] for v in sample_ranges.values()])
        ratios = np.array([v['het_ratio'] for v in sample_ranges.values()])

        # Lower bound: drop samples more than 2 SD below the mean, then
        # re-compute on what is left and allow 5 SD below that mean.
        first_pass = depth.mean() - 2 * depth.std()
        kept = depth[depth > first_pass]
        if kept.size:
            bot = kept.mean() - 5 * kept.std()
        else:
            # Nothing survives (e.g. one sample or identical depths): there
            # is no meaningful lower bound, so flag nothing as too low.
            bot = -np.inf
        # care less if we have really high samples so allow 8 SD above.
        top = depth.mean() + 8 * depth.std()

        depth_outlier = ((depth < bot) | (depth > top))

        for (sample_id, metrics), is_outlier in zip(sample_ranges.items(),
                                                    depth_outlier):
            metrics['sample_id'] = sample_id
            metrics['depth_outlier'] = is_outlier
            metrics['idr_baf'] = metrics.pop('range')

        df = pd.DataFrame(list(sample_ranges.values()))
        cols = ['sample_id'] + sorted(
            x for x in df.columns if x != 'sample_id')
        df = df[cols]

        df.index = df['sample_id']
        # Position of each sample in `samps`, used to index call_rate.
        sample_index = {s: i for i, s in enumerate(samps)}
        df['call_rate'] = [call_rate[sample_index[s]] for s in df.index]

        if pca_df is not None:
            # merge the 2 dataframes.
            pca_df.index = samps
            df = pd.concat((df, pca_df), axis=1)

        if not plot:
            return df, background_pca_df

        from matplotlib import pyplot as plt
        import seaborn as sns
        colors = sns.color_palette('Set1', 4)

        # colors[0] marks depth outliers, colors[1] everything else.
        cs = [
            colors[1 - int(v['depth_outlier'])]
            for v in sample_ranges.values()
        ]
        ecs = ['none'] * len(sample_ranges)

        # get_s is defined elsewhere; hand it its own copy of the depths.
        s = get_s(np.array(depth))

        # Draw on a dedicated figure and always close it, so repeated calls
        # do not paint over each other or leave figures open.
        fig = plt.figure()
        try:
            plt.scatter(depth, ratios, c=cs, edgecolors=ecs, s=s)

            for k, v in sample_ranges.items():
                if v['depth_outlier']:
                    plt.text(v['median_depth'],
                             v['het_ratio'],
                             k,
                             color=colors[1],
                             fontsize=7)

            plt.xlabel('median depth')
            plt.ylabel('proportion het calls')
            plt.savefig(plot)
        finally:
            plt.close(fig)
        return df, background_pca_df
Exemple #2
0
    def het_check(self, vcf_path, plot=False, ncpus=1, min_depth=8,
                  sites=op.join(op.dirname(__file__), '1kg.sites'),
                  **kwargs):
        """
        Build a per-sample table of heterozygosity, depth and call-rate
        metrics for the pedigree samples found in a VCF.

        kwargs is not used, but added here to allow same args as ped_check

        Parameters
        ----------
        vcf_path : path of the VCF, handed to cyvcf2.VCF and cyvcf2.par_het.
        plot : False for no figures, otherwise the file name the
            depth-vs-het-ratio scatter plot is saved to. The PCA figure name
            is derived from it ('het_'/'het-' become 'pca_'/'pca-'; if
            neither occurs, '.pca' is inserted before the extension).
        ncpus : worker count passed to cyvcf2.par_het; capped at 16.
        min_depth : forwarded to cyvcf2.par_het as ``min_depth``.
        sites : sites file passed to cyvcf2.par_het.

        Returns
        -------
        (df, background_pca_df) : ``df`` is a pandas DataFrame indexed by
            sample_id holding the per-sample values from cyvcf2.par_het
            (with 'range' renamed to 'idr_baf'), plus 'ratio_outlier',
            'depth_outlier', 'call_rate' and, when pca() returns a frame,
            its columns. ``background_pca_df`` is returned from pca()
            unchanged.

        Raises
        ------
        Exception : if no sample of the VCF is present in the ped.
        """
        import cyvcf2
        import numpy as np
        import pandas as pd
        from .pca import pca

        ncpus = min(ncpus, 16)

        samps = [x.sample_id for x in self.samples()]
        vcf = cyvcf2.VCF(vcf_path, gts012=True, samples=samps)
        vcf_samples, ped_samples = set(vcf.samples), set(samps)
        if sorted(vcf.samples) != sorted(samps):
            print("warning: sample overlap issues\n\tin vcf, not in ped: %s\n\tin ped, not in vcf: %s" % (
                  ",".join(vcf_samples - ped_samples),
                  ",".join(ped_samples - vcf_samples)), file=sys.stderr)
        if vcf_samples.isdisjoint(ped_samples):
            raise Exception("error: no samples from VCF found in ped")

        # From here on use the VCF's sample list and ordering.
        samps = vcf.samples
        # Note: `sites` is rebound to par_het's output here.
        sample_ranges, sites, gt_types = cyvcf2.par_het(
            vcf_path, samps, ncpus, sites, min_depth=min_depth)

        # Fraction of entries per row that are not 3.
        # NOTE(review): 3 is presumably the "unknown genotype" code and rows
        # are presumably samples in `samps` order -- confirm against par_het.
        call_rate = (gt_types != 3).mean(axis=1)

        if plot:
            pca_plot = plot.replace('het_', 'pca_').replace('het-', 'pca-')
            if pca_plot == plot:
                # No 'het' marker to swap, so insert '.pca' before the
                # extension. splitext also copes with names that have no
                # extension or that only have dots in a directory part.
                stem, ext = op.splitext(plot)
                pca_plot = "%s.pca%s" % (stem, ext)
        else:
            pca_plot = False
        pca_df, background_pca_df = pca(pca_plot, gt_types, sites)

        # now find outliers.
        depth = np.array([v['median_depth'] for v in sample_ranges.values()])
        ratios = np.array([v['het_ratio'] for v in sample_ranges.values()])
        # Fixed bounds on the het ratio; anything outside is flagged.
        ratios_outlier = ((ratios < 0.305) | (ratios > 0.41))

        # Lower bound: drop samples more than 2 SD below the mean, then
        # re-compute on what is left and again allow 2 SD below that mean.
        first_pass = depth.mean() - 2 * depth.std()
        kept = depth[depth > first_pass]
        if kept.size:
            bot = kept.mean() - 2 * kept.std()
        else:
            # Nothing survives (e.g. one sample or identical depths): there
            # is no meaningful lower bound, so flag nothing as too low.
            bot = -np.inf
        # care less if we have really high samples so make it 5.
        top = depth.mean() + 5 * depth.std()

        depth_outlier = ((depth < bot) | (depth > top))

        for (sample_id, metrics), depth_o, ratio_o in zip(
                sample_ranges.items(), depth_outlier, ratios_outlier):
            metrics['sample_id'] = sample_id
            metrics['ratio_outlier'] = ratio_o
            metrics['depth_outlier'] = depth_o
            metrics['idr_baf'] = metrics.pop('range')

        df = pd.DataFrame(list(sample_ranges.values()))
        cols = ['sample_id'] + sorted(x for x in df.columns if x != 'sample_id')
        df = df[cols]

        df.index = df['sample_id']
        # Position of each sample in `samps`, used to index call_rate.
        sample_index = {s: i for i, s in enumerate(samps)}
        df['call_rate'] = [call_rate[sample_index[s]] for s in df.index]

        if pca_df is not None:
            # merge the 2 dataframes.
            pca_df.index = samps
            df = pd.concat((df, pca_df), axis=1)

        if not plot:
            return df, background_pca_df

        from matplotlib import pyplot as plt
        import seaborn as sns
        colors = sns.color_palette('Set1', 4)

        # Fill: colors[0] marks depth outliers, colors[1] everything else.
        cs = [colors[1 - int(v['depth_outlier'])]
              for v in sample_ranges.values()]
        # Edge: black outline marks het-ratio outliers.
        ecs = ['k' if v['ratio_outlier'] else 'none'
               for v in sample_ranges.values()]

        # get_s is defined elsewhere; hand it its own copy of the depths.
        s = get_s(np.array(depth))

        # Draw on a dedicated figure and always close it, so repeated calls
        # do not paint over each other or leave figures open.
        fig = plt.figure()
        try:
            plt.scatter(depth, ratios, c=cs, edgecolors=ecs, s=s)

            for k, v in sample_ranges.items():
                if v['ratio_outlier'] or v['depth_outlier']:
                    plt.text(v['median_depth'], v['het_ratio'], k,
                             color=colors[1], fontsize=7)

            plt.xlabel('median depth')
            plt.ylabel('proportion het calls')
            plt.savefig(plot)
        finally:
            plt.close(fig)
        return df, background_pca_df