Esempio n. 1
0
    def ped_check(self,
                  vcf,
                  ncpus=1,
                  plot=False,
                  min_depth=5,
                  each=1,
                  prefix='',
                  sites=op.join(op.dirname(__file__), 'GRCH37.sites')):
        """
        Given the current pedigree and a VCF of genotypes, find sample-pairs
        where the relationship reported in the pedigree file does not match
        the one inferred from the genotypes.

        The per-pair statistics come from ``cyvcf2.par_relatedness``; this
        method adds the following columns to them:

        * ``pedigree_parents``: True when one sample of the pair is listed as
          mom or dad of the other in the pedigree.
        * ``pedigree_relatedness``: ``self.relatedness_coefficient`` for the
          pair, floored at 0 (pairs from different families stay at 0).
        * ``predicted_parents``: ``ibs0 / n < 0.012``.
        * ``parent_error``: ``pedigree_parents != predicted_parents``.
        * ``sample_duplication_error``: ``ibs0 / n < 0.012`` and ``rel > 0.75``.
        * ``rel_difference``: ``pedigree_relatedness - rel``.
        * ``keep``: only when the pedigree has more than 200 samples; flags
          the rows considered worth reporting (rows are not removed here).

        Side effect: the global numpy random state is seeded with 42.

        :param vcf str: path to vcf
        :param ncpus int: forwarded to ``cyvcf2.par_relatedness``.
        :param plot: False to skip plotting, True to show the figure, or a
            path to save the figure to. With a path, a
            ``<path without extension>.rel-difference.csv`` file holding the
            summed absolute ``rel_difference`` per sample is written too.
        :param min_depth int: minimum required depth.
        :param each int: forwarded to ``cyvcf2.par_relatedness``.
        :param prefix str: figure title; ignored when empty.
        :param sites str: sites file forwarded to ``cyvcf2.par_relatedness``.
        :return: pandas.DataFrame
        """
        import array
        import gc

        import cyvcf2
        import numpy as np
        import pandas as pd
        vcf_str = vcf
        np.random.seed(42)

        ped_samples = list(self.samples())
        sample_ids = [x.sample_id for x in ped_samples]
        if isinstance(vcf, basestring):
            # The handle is not used below: par_relatedness receives the
            # original argument (vcf_str).
            vcf = cyvcf2.VCF(vcf, gts012=True, samples=sample_ids)

        d = cyvcf2.par_relatedness(vcf_str, sample_ids,
                                   ncpus,
                                   sites,
                                   min_depth=min_depth,
                                   each=each)

        # Column order: sample ids first, then the statistics, then any
        # '*error' columns last.
        cols = ['sample_a', 'sample_b']
        cols += [
            c for c in d
            if c not in ('sample_a', 'sample_b') and not c.endswith('error')
        ]
        cols += [c for c in d if c.endswith('error')]
        if len(ped_samples) > 200:
            log.info(
                "large dataset: only reporting pedigree checks where in same" +
                " family or relationship does not match expected")
        df = pd.DataFrame(d, columns=cols)
        del d
        gc.collect()

        # most of these 2 will be false and 0, respectively
        df['pedigree_parents'] = np.zeros(len(df), dtype=bool)
        df['pedigree_relatedness'] = np.zeros(len(df), dtype=np.float32)

        fam_lookup = {s.sample_id: s.family_id for s in ped_samples}
        same_fam = np.array([
            fam_lookup[a] == fam_lookup[b]
            for a, b in zip(df.sample_a, df.sample_b)
        ])

        asample_ids, bsample_ids = df.sample_a[same_fam], df.sample_b[same_fam]
        idxs, = np.where(same_fam)

        # Setting cells one at a time through df.loc is expensive, so the
        # row indexes / values are collected in compact arrays and written
        # to the frame in two bulk assignments after the loop.
        parent_is = array.array('L', [])
        rels = array.array('f', [])

        # if they aren't in the same fam, cant be related.
        for (i, aid, bid) in zip(idxs, asample_ids, bsample_ids):
            a_sample = self.get(aid, samples=ped_samples)
            b_sample = self.get(bid, samples=ped_samples)
            if (b_sample in (a_sample.mom, a_sample.dad)
                    or a_sample in (b_sample.mom, b_sample.dad)):
                parent_is.append(i)

            rels.append(max(0, self.relatedness_coefficient(aid, bid)))

        df.loc[np.asarray(parent_is), 'pedigree_parents'] = True
        df.loc[idxs, 'pedigree_relatedness'] = np.asarray(rels, np.float32)

        df['tmpibs0'] = (df['ibs0'] / df['n'].astype(np.float32)).astype(
            np.float32)
        df["predicted_parents"] = df['tmpibs0'] < 0.012
        df["parent_error"] = df['pedigree_parents'] != df['predicted_parents']
        df["sample_duplication_error"] = (df['tmpibs0'] < 0.012) & (df['rel'] >
                                                                    0.75)

        pr = df['pedigree_relatedness']
        df["rel_difference"] = (pr - df['rel']).astype(np.float32)

        if len(ped_samples) > 200:
            # NOTE: `rd` and `ru` are looked up by name from this frame by
            # df.eval (the '@rd' / '@ru' references); do not rename them.
            rd = np.abs(df['rel_difference']) > 0.17

            # random subsample so some unremarkable pairs are flagged too
            sampling_rate = 1 / (len(ped_samples)**0.6)
            ru = (np.random.uniform(size=df.shape[0]) < sampling_rate)
            df['keep'] = df.eval(
                'parent_error | sample_duplication_error | predicted_parents| @rd | @ru '
                +
                '| (rel > 0.17) | (tmpibs0 < 0.04) | (pedigree_relatedness > 0)'
            )

            df['keep'] |= same_fam

        if not plot:
            df.drop('tmpibs0', axis=1, inplace=True)
            return df

        def asum(a):
            # sum of absolute values
            return np.abs(a).sum()

        from matplotlib import pyplot as plt
        plt.close()
        import seaborn as sns
        sns.set_style('whitegrid')

        # The CSV name is derived from the figure path, so it can only be
        # written when `plot` is a path. With plot=True there is no path
        # (calling .rsplit on the bool used to raise AttributeError).
        if plot is not True:
            # get total rel_difference by sample. large values indicate the
            # likely problem
            sub = df.eval(
                '((rel > 0.1) & (pedigree_relatedness < 0.05)) | ((rel < 0.05) & (pedigree_relatedness > 0.1)) | (abs(rel_difference) > 0.1)'
            )
            da = df[sub].groupby('sample_a')['rel_difference'].agg(asum)
            db = df[sub].groupby('sample_b')['rel_difference'].agg(asum)
            diff = da.add(db, fill_value=0)
            diff.sort_values(inplace=True, ascending=False)

            diff.to_csv(plot.rsplit(".", 1)[0] + ".rel-difference.csv",
                        index=True,
                        index_label="sample",
                        header=True)
            del diff
            del da
            del db
            del sub

        # one colour per distinct pedigree coefficient; the first (lowest)
        # coefficient gets grey, darker when there are few pairs.
        coefs = sorted(set(df['pedigree_relatedness']))
        colors = [(0.85, 0.85, 0.85)] + sns.color_palette('Set1', len(coefs))
        n = df['n'] / df['n'].mean()
        if len(df) < 100:
            colors[0] = (0.3, 0.3, 0.3)

        mult = 24 if len(df) < 50 else 12

        def scatter_row(axes_row, x_col, y_cols, xlabel, ylabels):
            # Draw one row of panels: for each y column, one scatter series
            # per pedigree coefficient, with parent-child pairs outlined.
            for ax, y_col, ylabel in zip(axes_row, y_cols, ylabels):
                for i, rc in enumerate(coefs):
                    sel = df['pedigree_relatedness'] == rc
                    src = ("%.3f" % rc).rstrip('0')
                    # outline parent kid relationships
                    ec = np.where(df.loc[sel, 'pedigree_parents'], 'k',
                                  'none')
                    ax.scatter(df.loc[sel, x_col],
                               df.loc[sel, y_col],
                               c=colors[i],
                               linewidth=1,
                               edgecolors=ec,
                               s=((mult * (i > 0 or len(df) < 36)) +
                                  mult * n[sel]),
                               zorder=i,
                               alpha=0.80,
                               label="ped coef: %s" % src)
                    ax.set_xlabel(xlabel)
                    ax.set_ylabel(ylabel)

        fig, axesb = plt.subplots(2, 2, figsize=(12, 12))
        df['tmpibs2'] = df['ibs2'] / df['n'].astype(float)
        log.info("plotting")

        # top row: relatedness vs. the ibs0 / ibs2 fractions
        axes = axesb[0]
        scatter_row(axes, 'rel', ('tmpibs0', 'tmpibs2'),
                    'coefficient of relatedness', ('ibs0', 'ibs2'))

        if prefix:
            fig.suptitle(prefix)
        # clamp the x-range of the first panel
        xmin, xmax = axes[0].get_xlim()
        if xmin < -0.3:
            axes[0].set_xlim(xmin=-0.3)
        if xmax > 1.25:
            axes[0].set_xlim(xmax=1.25)
        # the temporary fraction columns are not part of the returned frame
        df.drop(['tmpibs0', 'tmpibs2'], axis=1, inplace=True)

        # bottom row: raw ibs0 counts vs. ibs2 and shared_hets
        scatter_row(axesb[1], 'ibs0', ('ibs2', 'shared_hets'), 'ibs0',
                    ('ibs2', 'shared_hets'))

        plt.legend()

        if plot is True:
            plt.show()
        else:
            plt.savefig(plot)
        plt.close()
        return df
Esempio n. 2
0
    def ped_check(self, vcf, ncpus=1, plot=False, min_depth=5, each=1,
            prefix='',
            sites=op.join(op.dirname(__file__), '1kg.sites')):
        """
        Given the current pedigree and a VCF of genotypes, find sample-pairs
        where the relationship reported in the pedigree file does not match
        the one inferred from the genotypes.

        The per-pair statistics come from ``cyvcf2.par_relatedness``; this
        method adds the following columns to them:

        * ``pedigree_parents``: True when one sample of the pair is listed as
          mom or dad of the other in the pedigree.
        * ``pedigree_relatedness``: ``self.relatedness_coefficient`` for the
          pair, floored at 0 (pairs from different families stay at 0).
        * ``predicted_parents``: ``ibs0 / n < 0.012``.
        * ``parent_error``: ``pedigree_parents != predicted_parents``.
        * ``sample_duplication_error``: ``ibs0 / n < 0.012`` and ``rel > 0.75``.
        * ``rel_difference``: ``pedigree_relatedness - rel``.
        * ``keep``: only when the pedigree has more than 200 samples; flags
          the rows considered worth reporting (rows are not removed here).

        Side effect: the global numpy random state is seeded with 42.

        :param vcf str: path to vcf
        :param ncpus int: forwarded to ``cyvcf2.par_relatedness``.
        :param plot: False to skip plotting, True to show the figure, or a
            path to save the figure to. With a path, a
            ``<path without extension>.rel-difference.csv`` file holding the
            summed absolute ``rel_difference`` per sample is written too.
        :param min_depth int: minimum required depth.
        :param each int: forwarded to ``cyvcf2.par_relatedness``.
        :param prefix str: figure title; ignored when empty.
        :param sites str: sites file forwarded to ``cyvcf2.par_relatedness``.
        :return: pandas.DataFrame
        """
        import array
        import gc

        import cyvcf2
        import numpy as np
        import pandas as pd
        vcf_str = vcf
        np.random.seed(42)

        ped_samples = list(self.samples())
        sample_ids = [x.sample_id for x in ped_samples]
        if isinstance(vcf, basestring):
            # The handle is not used below: par_relatedness receives the
            # original argument (vcf_str).
            vcf = cyvcf2.VCF(vcf, gts012=True, samples=sample_ids)

        d = cyvcf2.par_relatedness(vcf_str,
                                   sample_ids,
                                   ncpus,
                                   sites,
                                   min_depth=min_depth, each=each)

        # Column order: sample ids first, then the statistics, then any
        # '*error' columns last.
        cols = ['sample_a', 'sample_b']
        cols += [c for c in d
                 if c not in ('sample_a', 'sample_b') and not c.endswith('error')]
        cols += [c for c in d if c.endswith('error')]
        if len(ped_samples) > 200:
            print("large dataset: only reporting pedigree checks where in same"
                  + " family or relationship does not match expected",
                  file=sys.stderr)
        df = pd.DataFrame(d, columns=cols)
        del d
        gc.collect()

        # most of these 2 will be false and 0, respectively
        df['pedigree_parents'] = np.zeros(len(df), dtype=bool)
        df['pedigree_relatedness'] = np.zeros(len(df), dtype=np.float32)

        fam_lookup = {s.sample_id: s.family_id for s in ped_samples}
        same_fam = np.array([fam_lookup[a] == fam_lookup[b]
                             for a, b in zip(df.sample_a, df.sample_b)])

        asample_ids, bsample_ids = df.sample_a[same_fam], df.sample_b[same_fam]
        idxs, = np.where(same_fam)

        # Setting cells one at a time through df.loc is expensive, so the
        # row indexes / values are collected in compact arrays and written
        # to the frame in two bulk assignments after the loop.
        parent_is = array.array('L', [])
        rels = array.array('f', [])

        # if they aren't in the same fam, cant be related.
        for (i, aid, bid) in zip(idxs, asample_ids, bsample_ids):
            a_sample = self.get(aid, samples=ped_samples)
            b_sample = self.get(bid, samples=ped_samples)
            if (b_sample in (a_sample.mom, a_sample.dad)
                    or a_sample in (b_sample.mom, b_sample.dad)):
                parent_is.append(i)

            rels.append(max(0, self.relatedness_coefficient(aid, bid)))

        df.loc[np.asarray(parent_is), 'pedigree_parents'] = True
        df.loc[idxs, 'pedigree_relatedness'] = np.asarray(rels, np.float32)

        df['tmpibs0'] = (df['ibs0'] / df['n'].astype(np.float32)).astype(np.float32)
        df["predicted_parents"] = df['tmpibs0'] < 0.012
        df["parent_error"] = df['pedigree_parents'] != df['predicted_parents']
        df["sample_duplication_error"] = (df['tmpibs0'] < 0.012) & (df['rel'] > 0.75)

        pr = df['pedigree_relatedness']
        df["rel_difference"] = (pr - df['rel']).astype(np.float32)

        if len(ped_samples) > 200:
            # NOTE: `rd` and `ru` are looked up by name from this frame by
            # df.eval (the '@rd' / '@ru' references); do not rename them.
            rd = np.abs(df['rel_difference']) > 0.17

            # random subsample so some unremarkable pairs are flagged too
            sampling_rate = 1 / (len(ped_samples)**0.6)
            ru = (np.random.uniform(size=df.shape[0]) < sampling_rate)
            df['keep'] = df.eval('parent_error | sample_duplication_error | predicted_parents| @rd | @ru' +
                    '| (rel > 0.17) | (tmpibs0 < 0.04) | (pedigree_relatedness > 0)')

            df['keep'] |= same_fam

        if not plot:
            df.drop('tmpibs0', axis=1, inplace=True)
            return df

        def asum(a):
            # sum of absolute values
            return np.abs(a).sum()

        from matplotlib import pyplot as plt
        plt.close()
        import seaborn as sns
        sns.set_style('whitegrid')

        # The CSV name is derived from the figure path, so it can only be
        # written when `plot` is a path. With plot=True there is no path
        # (calling .rsplit on the bool used to raise AttributeError).
        if plot is not True:
            # get total rel_difference by sample. large values indicate the
            # likely problem
            sub = df.eval('((rel > 0.1) & (pedigree_relatedness < 0.05)) | ((rel < 0.05) & (pedigree_relatedness > 0.1)) | (rel_difference > 0.1) | (rel_difference < -0.1)')
            da = df[sub].groupby('sample_a')['rel_difference'].agg(asum)
            db = df[sub].groupby('sample_b')['rel_difference'].agg(asum)
            diff = da.add(db, fill_value=0)
            diff.sort_values(inplace=True, ascending=False)

            diff.to_csv(plot.rsplit(".", 1)[0] + ".rel-difference.csv", index=True,
                    index_label="sample", header=True)
            del diff
            del da
            del db
            del sub

        # one colour per distinct pedigree coefficient; the first (lowest)
        # coefficient gets grey, darker when there are few pairs.
        coefs = sorted(set(df['pedigree_relatedness']))
        colors = [(0.85, 0.85, 0.85)] + sns.color_palette('Set1', len(coefs))
        n = df['n'] / df['n'].mean()
        if len(df) < 100:
            colors[0] = (0.3, 0.3, 0.3)

        mult = 24 if len(df) < 50 else 12

        def scatter_row(axes_row, x_col, y_cols, xlabel, ylabels):
            # Draw one row of panels: for each y column, one scatter series
            # per pedigree coefficient, with parent-child pairs outlined.
            for ax, y_col, ylabel in zip(axes_row, y_cols, ylabels):
                for i, rc in enumerate(coefs):
                    sel = df['pedigree_relatedness'] == rc
                    src = ("%.3f" % rc).rstrip('0')
                    # outline parent kid relationships
                    ec = np.where(df.loc[sel, 'pedigree_parents'], 'k', 'none')
                    ax.scatter(df.loc[sel, x_col], df.loc[sel, y_col],
                            c=colors[i], linewidth=1, edgecolors=ec,
                            s=((mult * (i > 0 or len(df) < 36)) + mult * n[sel]),
                            zorder=i,
                            alpha=0.80,
                            label="ped coef: %s" % src)
                    ax.set_xlabel(xlabel)
                    ax.set_ylabel(ylabel)

        fig, axesb = plt.subplots(2, 2, figsize=(12, 12))
        df['tmpibs2'] = df['ibs2'] / df['n'].astype(float)
        print("plotting")

        # top row: relatedness vs. the ibs0 / ibs2 fractions
        axes = axesb[0]
        scatter_row(axes, 'rel', ('tmpibs0', 'tmpibs2'),
                    'coefficient of relatedness', ('ibs0', 'ibs2'))

        if prefix:
            fig.suptitle(prefix)
        # clamp the x-range of the first panel
        xmin, xmax = axes[0].get_xlim()
        if xmin < -0.3:
            axes[0].set_xlim(xmin=-0.3)
        if xmax > 1.25:
            axes[0].set_xlim(xmax=1.25)
        # the temporary fraction columns are not part of the returned frame
        df.drop(['tmpibs0', 'tmpibs2'], axis=1, inplace=True)

        # bottom row: raw ibs0 counts vs. ibs2 and shared_hets
        scatter_row(axesb[1], 'ibs0', ('ibs2', 'shared_hets'), 'ibs0',
                    ('ibs2', 'shared_hets'))

        plt.legend()

        if plot is True:
            plt.show()
        else:
            plt.savefig(plot)
        plt.close()
        return df