def ped_check(self, vcf, ncpus=1, plot=False, min_depth=5, each=1,
              prefix='',
              sites=op.join(op.dirname(__file__), 'GRCH37.sites')):
    """
    Given the current pedigree and a VCF of genotypes, find sample-pairs where
    the relationship reported in the pedigree file do not match those inferred
    from the genotypes. Returns a dataframe containing all sample-pairs with
    columns for IBS0, IBS2, rel, IBS2*, pedigree_relatedness (relatedness
    coefficient expected)

    On top of the columns returned by ``cyvcf2.par_relatedness`` this adds
    ``pedigree_parents``, ``pedigree_relatedness``, ``predicted_parents``,
    ``parent_error``, ``sample_duplication_error`` and ``rel_difference``.
    When the pedigree has more than 200 samples a boolean ``keep`` column is
    added as well.

    :param vcf str: path to vcf
    :param ncpus int: forwarded to ``cyvcf2.par_relatedness``.
    :param plot: falsy to skip plotting and return early; ``True`` to show
        the figure with ``plt.show()``; otherwise a path handed to
        ``plt.savefig``, in which case a ``<stem>.rel-difference.csv``
        per-sample summary is written next to it.
    :param min_depth int: minimum required depth.
    :param each int: forwarded to ``cyvcf2.par_relatedness``.
    :param prefix str: used as the figure title when non-empty.
    :param sites str: path to the sites file forwarded to
        ``cyvcf2.par_relatedness``.
    :return: pandas.DataFrame
    """
    import cyvcf2
    import numpy as np
    import pandas as pd
    vcf_str = vcf
    # fixed seed so the random sub-sampling for large cohorts is repeatable.
    np.random.seed(42)

    ped_samples = list(self.samples())
    sample_ids = [x.sample_id for x in ped_samples]
    if isinstance(vcf, basestring):
        # NOTE(review): the opened handle is not used below (par_relatedness
        # receives the original argument); presumably kept as an early check
        # that the file opens with these samples -- confirm.
        vcf = cyvcf2.VCF(vcf, gts012=True, samples=sample_ids)

    d = cyvcf2.par_relatedness(vcf_str, sample_ids, ncpus, sites,
                               min_depth=min_depth, each=each)
    # column order: the pair ids, then the metrics, then any '*error' columns.
    cols = ['sample_a', 'sample_b']
    cols += [
        c for c in d
        if c not in ('sample_a', 'sample_b') and not c.endswith('error')
    ]
    cols += [c for c in d if c.endswith('error')]

    if len(ped_samples) > 200:
        log.info(
            "large dataset: only reporting pedigree checks where in same" +
            " family or relationship does not match expected")
    df = pd.DataFrame(d, columns=cols)
    # drop the raw dict before building more columns to keep peak memory down.
    del d
    import gc
    gc.collect()
    # most of these 2 will be false and 0, respectively
    df['pedigree_parents'] = np.zeros(len(df), dtype=bool)
    df['pedigree_relatedness'] = np.zeros(len(df), dtype=np.float32)

    fam_lookup = {s.sample_id: s.family_id for s in ped_samples}
    same_fam = np.array([
        fam_lookup[a] == fam_lookup[b]
        for a, b in zip(df.sample_a, df.sample_b)
    ])

    asample_ids, bsample_ids = df.sample_a[same_fam], df.sample_b[same_fam]
    idxs, = np.where(same_fam)
    import array
    parent_is = array.array('L', [])
    rels = array.array('f', [])
    # if they aren't in the same fam, cant be related.
    for (i, aid, bid) in zip(idxs, asample_ids, bsample_ids):
        a_sample = self.get(aid, samples=ped_samples)
        b_sample = self.get(bid, samples=ped_samples)
        if (b_sample in (a_sample.mom, a_sample.dad)
                or a_sample in (b_sample.mom, b_sample.dad)):
            parent_is.append(i)
        # setting df.loc[i, ...] directly is expensive. Parent pairs are
        # expected to be rare for big cohorts (and default to False), but
        # relatedness is needed for every same-family pair, so values are
        # collected in an array.array and assigned in one go after the loop.
        rels.append(max(0, self.relatedness_coefficient(aid, bid)))

    df.loc[np.asarray(parent_is), 'pedigree_parents'] = True
    df.loc[idxs, 'pedigree_relatedness'] = np.asarray(rels, np.float32)

    # fraction of sites with IBS0; kept only as a temporary helper column.
    df['tmpibs0'] = (df['ibs0'] / df['n'].astype(np.float32)).astype(
        np.float32)
    df["predicted_parents"] = df['tmpibs0'] < 0.012
    df["parent_error"] = df['pedigree_parents'] != df['predicted_parents']
    df["sample_duplication_error"] = ((df['tmpibs0'] < 0.012)
                                      & (df['rel'] > 0.75))
    pr = df['pedigree_relatedness']
    df["rel_difference"] = (pr - df['rel']).astype(np.float32)

    if len(ped_samples) > 200:
        # flag rows worth reporting: anything suspicious or related, every
        # same-family pair, plus a random sample of the remaining pairs.
        rd = np.abs(df['rel_difference']) > 0.17
        sampling_rate = 1 / (len(ped_samples)**0.6)
        ru = (np.random.uniform(size=df.shape[0]) < sampling_rate)
        df['keep'] = df.eval(
            'parent_error | sample_duplication_error | predicted_parents| @rd | @ru '
            + '| (rel > 0.17) | (tmpibs0 < 0.04) | (pedigree_relatedness > 0)'
        )
        df['keep'] |= same_fam

    if not plot:
        df.drop('tmpibs0', axis=1, inplace=True)
        return df

    def asum(a):
        # sum of absolute values of a group's rel_difference.
        return np.abs(a).sum()

    from matplotlib import pyplot as plt
    plt.close()
    import seaborn as sns
    sns.set_style('whitegrid')
    # get total rel_difference by sample. large values indicate the likely
    # problem
    sub = df.eval(
        '((rel > 0.1) & (pedigree_relatedness < 0.05)) | ((rel < 0.05) & (pedigree_relatedness > 0.1)) | (abs(rel_difference) > 0.1)'
    )
    da = df[sub].groupby('sample_a')['rel_difference'].agg(asum)
    db = df[sub].groupby('sample_b')['rel_difference'].agg(asum)
    diff = da.add(db, fill_value=0)
    diff.sort_values(inplace=True, ascending=False)
    if plot is not True:
        # `plot` is an output path here; with plot=True (interactive mode)
        # there is no path to derive the csv name from, so it is skipped.
        diff.to_csv(plot.rsplit(".", 1)[0] + ".rel-difference.csv",
                    index=True, index_label="sample", header=True)
    del diff
    del da
    del db
    del sub

    # one colour per distinct expected relatedness; index 0 is the grey used
    # for the lowest value.
    rel_levels = sorted(set(df['pedigree_relatedness']))
    colors = [(0.85, 0.85, 0.85)] + sns.color_palette('Set1', len(rel_levels))
    # marker size scales with the number of sites used relative to the mean.
    n = df['n'] / df['n'].mean()
    if len(df) < 100:
        colors[0] = (0.3, 0.3, 0.3)
    mult = 24 if len(df) < 50 else 12

    fig, axesb = plt.subplots(2, 2, figsize=(12, 12))
    df['tmpibs2'] = df['ibs2'] / df['n'].astype(float)

    # top row: relatedness vs. ibs0 / ibs2 proportions.
    axes = axesb[0]
    log.info("plotting")
    for k, key in enumerate(('tmpibs0', 'tmpibs2')):
        for i, rc in enumerate(rel_levels):
            sel = df['pedigree_relatedness'] == rc
            src = ("%.3f" % rc).rstrip('0')
            # outline parent kid relationships
            ec = np.where(df.loc[sel, 'pedigree_parents'], 'k', 'none')
            axes[k].scatter(df.loc[sel, 'rel'],
                            df.loc[sel, key],
                            c=colors[i],
                            linewidth=1,
                            edgecolors=ec,
                            s=((mult * (i > 0 or len(df) < 36))
                               + mult * n[sel]),
                            zorder=i,
                            alpha=0.80,
                            label="ped coef: %s" % src)
        axes[k].set_xlabel('coefficient of relatedness')
        # strip the 'tmp' prefix for the axis label.
        axes[k].set_ylabel(key[3:])
    if prefix:
        fig.suptitle(prefix)
    # clamp the x-range so extreme values do not squash the plot.
    xmin, xmax = axes[0].get_xlim()
    if xmin < -0.3:
        axes[0].set_xlim(xmin=-0.3)
    if xmax > 1.25:
        axes[0].set_xlim(xmax=1.25)
    df.drop(['tmpibs0', 'tmpibs2'], axis=1, inplace=True)

    # bottom row: raw ibs0 counts vs. ibs2 / shared_hets counts.
    axes = axesb[1]
    for k, key in enumerate(('ibs2', 'shared_hets')):
        for i, rc in enumerate(rel_levels):
            sel = df['pedigree_relatedness'] == rc
            src = ("%.3f" % rc).rstrip('0')
            # outline parent kid relationships
            ec = np.where(df.loc[sel, 'pedigree_parents'], 'k', 'none')
            axes[k].scatter(df.loc[sel, 'ibs0'],
                            df.loc[sel, key],
                            c=colors[i],
                            linewidth=1,
                            edgecolors=ec,
                            s=((mult * (i > 0 or len(df) < 36))
                               + mult * n[sel]),
                            zorder=i,
                            alpha=0.80,
                            label="ped coef: %s" % src)
        axes[k].set_xlabel('ibs0')
        axes[k].set_ylabel(key)
    plt.legend()
    if plot is True:
        plt.show()
    else:
        plt.savefig(plot)
    plt.close()
    return df
def ped_check(self, vcf, ncpus=1, plot=False, min_depth=5, each=1,
              prefix='',
              sites=op.join(op.dirname(__file__), '1kg.sites')):
    """
    Given the current pedigree and a VCF of genotypes, find sample-pairs where
    the relationship reported in the pedigree file do not match those inferred
    from the genotypes. Returns a dataframe containing all sample-pairs with
    columns for IBS0, IBS2, rel, IBS2*, pedigree_relatedness (relatedness
    coefficient expected)

    On top of the columns returned by ``cyvcf2.par_relatedness`` this adds
    ``pedigree_parents``, ``pedigree_relatedness``, ``predicted_parents``,
    ``parent_error``, ``sample_duplication_error`` and ``rel_difference``.
    When the pedigree has more than 200 samples a boolean ``keep`` column is
    added as well.

    :param vcf str: path to vcf
    :param ncpus int: forwarded to ``cyvcf2.par_relatedness``.
    :param plot: falsy to skip plotting and return early; ``True`` to show
        the figure with ``plt.show()``; otherwise a path handed to
        ``plt.savefig``, in which case a ``<stem>.rel-difference.csv``
        per-sample summary is written next to it.
    :param min_depth int: minimum required depth.
    :param each int: forwarded to ``cyvcf2.par_relatedness``.
    :param prefix str: used as the figure title when non-empty.
    :param sites str: path to the sites file forwarded to
        ``cyvcf2.par_relatedness``.
    :return: pandas.DataFrame
    """
    import cyvcf2
    import numpy as np
    import pandas as pd
    vcf_str = vcf
    # fixed seed so the random sub-sampling for large cohorts is repeatable.
    np.random.seed(42)

    ped_samples = list(self.samples())
    sample_ids = [x.sample_id for x in ped_samples]
    if isinstance(vcf, basestring):
        # NOTE(review): the opened handle is not used below (par_relatedness
        # receives the original argument); presumably kept as an early check
        # that the file opens with these samples -- confirm.
        vcf = cyvcf2.VCF(vcf, gts012=True, samples=sample_ids)

    d = cyvcf2.par_relatedness(vcf_str, sample_ids, ncpus, sites,
                               min_depth=min_depth, each=each)
    # column order: the pair ids, then the metrics, then any '*error' columns.
    cols = ['sample_a', 'sample_b']
    cols += [
        c for c in d
        if c not in ('sample_a', 'sample_b') and not c.endswith('error')
    ]
    cols += [c for c in d if c.endswith('error')]

    if len(ped_samples) > 200:
        print("large dataset: only reporting pedigree checks where in same" +
              " family or relationship does not match expected",
              file=sys.stderr)
    df = pd.DataFrame(d, columns=cols)
    # drop the raw dict before building more columns to keep peak memory down.
    del d
    import gc
    gc.collect()
    # most of these 2 will be false and 0, respectively
    df['pedigree_parents'] = np.zeros(len(df), dtype=bool)
    df['pedigree_relatedness'] = np.zeros(len(df), dtype=np.float32)

    fam_lookup = {s.sample_id: s.family_id for s in ped_samples}
    same_fam = np.array([
        fam_lookup[a] == fam_lookup[b]
        for a, b in zip(df.sample_a, df.sample_b)
    ])

    asample_ids, bsample_ids = df.sample_a[same_fam], df.sample_b[same_fam]
    idxs, = np.where(same_fam)
    import array
    parent_is = array.array('L', [])
    rels = array.array('f', [])
    # if they aren't in the same fam, cant be related.
    for (i, aid, bid) in zip(idxs, asample_ids, bsample_ids):
        a_sample = self.get(aid, samples=ped_samples)
        b_sample = self.get(bid, samples=ped_samples)
        if (b_sample in (a_sample.mom, a_sample.dad)
                or a_sample in (b_sample.mom, b_sample.dad)):
            parent_is.append(i)
        # setting df.loc[i, ...] directly is expensive. Parent pairs are
        # expected to be rare for big cohorts (and default to False), but
        # relatedness is needed for every same-family pair, so values are
        # collected in an array.array and assigned in one go after the loop.
        rels.append(max(0, self.relatedness_coefficient(aid, bid)))

    df.loc[np.asarray(parent_is), 'pedigree_parents'] = True
    df.loc[idxs, 'pedigree_relatedness'] = np.asarray(rels, np.float32)

    # fraction of sites with IBS0; kept only as a temporary helper column.
    df['tmpibs0'] = (df['ibs0'] / df['n'].astype(np.float32)).astype(
        np.float32)
    df["predicted_parents"] = df['tmpibs0'] < 0.012
    df["parent_error"] = df['pedigree_parents'] != df['predicted_parents']
    df["sample_duplication_error"] = ((df['tmpibs0'] < 0.012)
                                      & (df['rel'] > 0.75))
    pr = df['pedigree_relatedness']
    df["rel_difference"] = (pr - df['rel']).astype(np.float32)

    if len(ped_samples) > 200:
        # flag rows worth reporting: anything suspicious or related, every
        # same-family pair, plus a random sample of the remaining pairs.
        rd = np.abs(df['rel_difference']) > 0.17
        sampling_rate = 1 / (len(ped_samples)**0.6)
        ru = (np.random.uniform(size=df.shape[0]) < sampling_rate)
        df['keep'] = df.eval(
            'parent_error | sample_duplication_error | predicted_parents| @rd | @ru'
            + '| (rel > 0.17) | (tmpibs0 < 0.04) | (pedigree_relatedness > 0)'
        )
        df['keep'] |= same_fam

    if not plot:
        df.drop('tmpibs0', axis=1, inplace=True)
        return df

    def asum(a):
        # sum of absolute values of a group's rel_difference.
        return np.abs(a).sum()

    from matplotlib import pyplot as plt
    plt.close()
    import seaborn as sns
    sns.set_style('whitegrid')
    # get total rel_difference by sample. large values indicate the likely
    # problem
    sub = df.eval(
        '((rel > 0.1) & (pedigree_relatedness < 0.05)) | ((rel < 0.05) & (pedigree_relatedness > 0.1)) | (rel_difference > 0.1) | (rel_difference < -0.1)'
    )
    da = df[sub].groupby('sample_a')['rel_difference'].agg(asum)
    db = df[sub].groupby('sample_b')['rel_difference'].agg(asum)
    diff = da.add(db, fill_value=0)
    diff.sort_values(inplace=True, ascending=False)
    if plot is not True:
        # `plot` is an output path here; with plot=True (interactive mode)
        # there is no path to derive the csv name from, so it is skipped.
        diff.to_csv(plot.rsplit(".", 1)[0] + ".rel-difference.csv",
                    index=True, index_label="sample", header=True)
    del diff
    del da
    del db
    del sub

    # one colour per distinct expected relatedness; index 0 is the grey used
    # for the lowest value.
    rel_levels = sorted(set(df['pedigree_relatedness']))
    colors = [(0.85, 0.85, 0.85)] + sns.color_palette('Set1', len(rel_levels))
    # marker size scales with the number of sites used relative to the mean.
    n = df['n'] / df['n'].mean()
    if len(df) < 100:
        colors[0] = (0.3, 0.3, 0.3)
    mult = 24 if len(df) < 50 else 12

    fig, axesb = plt.subplots(2, 2, figsize=(12, 12))
    df['tmpibs2'] = df['ibs2'] / df['n'].astype(float)

    # top row: relatedness vs. ibs0 / ibs2 proportions.
    axes = axesb[0]
    print("plotting")
    for k, key in enumerate(('tmpibs0', 'tmpibs2')):
        for i, rc in enumerate(rel_levels):
            sel = df['pedigree_relatedness'] == rc
            src = ("%.3f" % rc).rstrip('0')
            # outline parent kid relationships
            ec = np.where(df.loc[sel, 'pedigree_parents'], 'k', 'none')
            axes[k].scatter(df.loc[sel, 'rel'],
                            df.loc[sel, key],
                            c=colors[i],
                            linewidth=1,
                            edgecolors=ec,
                            s=((mult * (i > 0 or len(df) < 36))
                               + mult * n[sel]),
                            zorder=i,
                            alpha=0.80,
                            label="ped coef: %s" % src)
        axes[k].set_xlabel('coefficient of relatedness')
        # strip the 'tmp' prefix for the axis label.
        axes[k].set_ylabel(key[3:])
    if prefix:
        fig.suptitle(prefix)
    # clamp the x-range so extreme values do not squash the plot.
    xmin, xmax = axes[0].get_xlim()
    if xmin < -0.3:
        axes[0].set_xlim(xmin=-0.3)
    if xmax > 1.25:
        axes[0].set_xlim(xmax=1.25)
    df.drop(['tmpibs0', 'tmpibs2'], axis=1, inplace=True)

    # bottom row: raw ibs0 counts vs. ibs2 / shared_hets counts.
    axes = axesb[1]
    for k, key in enumerate(('ibs2', 'shared_hets')):
        for i, rc in enumerate(rel_levels):
            sel = df['pedigree_relatedness'] == rc
            src = ("%.3f" % rc).rstrip('0')
            # outline parent kid relationships
            ec = np.where(df.loc[sel, 'pedigree_parents'], 'k', 'none')
            axes[k].scatter(df.loc[sel, 'ibs0'],
                            df.loc[sel, key],
                            c=colors[i],
                            linewidth=1,
                            edgecolors=ec,
                            s=((mult * (i > 0 or len(df) < 36))
                               + mult * n[sel]),
                            zorder=i,
                            alpha=0.80,
                            label="ped coef: %s" % src)
        axes[k].set_xlabel('ibs0')
        axes[k].set_ylabel(key)
    plt.legend()
    if plot is True:
        plt.show()
    else:
        plt.savefig(plot)
    plt.close()
    return df