def cleaner():
    """Return the BMI table restricted to patients present in the genotype data.

    Reads the BMI CSV (space-separated, first column used as the index) and
    opens the PLINK bed file named by the module-level ``args.snps``.  Every
    patient id from the table's index is looked up in the bed file as the
    ASCII-encoded pair ``[id, id]``; patients whose lookup fails are removed.
    The shape of the filtered table is printed before it is returned.

    :return: the BMI table without patients missing from the genotype data
    :rtype: pandas.DataFrame
    """
    bmi_path = '/Users/ioneliabuzatu/PycharmProjects/biobank/obesity/data/bmi_clean.csv'
    snps = Bed(args.snps, count_A1=False)
    # The index already holds the first CSV column, so one read is enough.
    patients = pd.read_csv(bmi_path, sep=' ', index_col=0)

    missing = []
    for patient_id in patients.index:
        search = str(patient_id).encode('ascii')
        try:
            snps.iid_to_index([[search, search]])
        except KeyError:
            # Unknown iid: this patient has no genotype record.
            missing.append(patient_id)

    # Drop all missing patients in one pass instead of once per failure.
    patients = patients.drop(missing)
    print(patients.shape)
    return patients
#[[ 5 4000 4000] # [ 5 4001 4001] # [ 5 4002 4002] # ..., # [ 5 4997 4997] # [ 5 4998 4998] # [ 5 4999 4999]] #In one-line: chr5data = Bed("all.bed")[:, snpreader.pos[:, 0] == 5].read() # You can turn iid or sid names into indexes snpreader = Bed("all.bed") iid0 = [['cid499P1', 'cid499P1'], ['cid489P1', 'cid489P1'], ['cid479P1', 'cid479P1']] indexes0 = snpreader.iid_to_index(iid0) print indexes0 #array([499, 489, 479]) snpreader0 = snpreader[indexes0, :] print snpreader0.iid #[['cid499P1' 'cid499P1'] # ['cid489P1' 'cid489P1'] # ['cid479P1' 'cid479P1']] # more condensed snpreader0 = snpreader[snpreader.iid_to_index(iid0), :] #both at once snpdata0chr5 = snpreader[snpreader.iid_to_index(iid0), snpreader.pos[:, 0] == 5].read() print np.mean(snpdata0chr5.val)
class Mapping():
    """IBD and ROH mapping over PLINK bed genotypes for a set of case samples.

    The constructor loads ``<prefix>.bed`` and a case list; samples in the bed
    file that are not in the case list are treated as controls.  The mapping
    methods delegate detection to ``LOCH_MappingTools``, write a text report
    to ``<out>.txt`` and store the overlap summaries on the instance
    (``ibd_regions``, ``roh_case_regions``, ``roh_control_regions``), which
    ``draw_diagram`` then plots.

    NOTE(review): this class was reconstructed from a whitespace-collapsed
    source, so the statement nesting below is inferred from the logic --
    confirm it against the upstream file.
    """

    def __init__(self, prefix, case_file):
        """Load genotypes and split the samples into cases and controls.

        :param prefix: path prefix; ``f"{prefix}.bed"`` is opened with ``Bed``
        :param case_file: text file read with ``np.loadtxt``; its first two
            columns are used as the identifiers of the case samples
        """
        self.prefix = prefix
        self.case_file = case_file
        self.snpreader = Bed(f"{prefix}.bed", count_A1=False)
        if self.snpreader.pos.dtype != 'int64':
            # `replace` is defined outside this class -- presumably it maps
            # chromosome labels to numbers; verify against its definition.
            self.snpreader.pos[:,0] = np.vectorize(replace)(self.snpreader.pos[:,0])
        # Overwrite column 1 with chromosome * 1e11 + column 2, i.e. a single
        # coordinate that orders markers across chromosomes.
        self.snpreader.pos[:,1] = self.snpreader.pos[:,0] * 100000000000 + self.snpreader.pos[:,2]
        self.snpdata = self.snpreader.read()
        print('SNP data loaded.')
        self.chr_list = list(set(self.snpreader.pos[:,0]))
        self.Chr = self.snpreader.pos[:,0]
        self.Position = self.snpreader.pos[:,1]
        self.bp = self.snpreader.pos[:,2]
        self.SNPID = self.snpreader.sid
        self.case = np.loadtxt(case_file, dtype=self.snpreader.iid.dtype)[:,:2]
        self.case_list = list(self.case)
        self.all_list = list([tuple(x) for x in self.snpreader.iid])
        self.caseset = set([tuple(x) for x in self.case])
        # Controls are all samples of the bed file that are not listed as cases.
        self.control_list = [list(x) for x in self.all_list if x not in self.caseset]
        self.numSNP = self.snpreader.sid_count
        self.numSample = len(self.all_list)
        self.numCase = len(self.case_list)
        self.numControl = len(self.control_list)
        self.case_geno = self.snpdata.val[self.snpreader.iid_to_index(self.case)]
        # Second identifier of every case, decoded for printing and reports
        # (the ids are decoded from bytes here).
        L = []
        for i in self.case_list:
            L.append(i[1].decode('utf-8'))
        self.case_list_print = '\n'.join(L)
        print('Case individuals are: \n')
        print(self.case_list_print)
        print('\n')

    def ibdmapping_gw(self, Windowkb, Stretchkb, numGapSNP, numMinSNP, WindowGap, out, point):
        '''Performs genome-wide IBD mapping based on SNP streak.

        Runs ``LOCH_MappingTools.LOCHMappingAll`` once on the cases alone and
        once per control on "all cases + that control", writes every stretch
        to ``<out>.txt`` and stores the overlap summary in ``self.ibd_regions``
        as ``[marker_index, state]`` entries, where ``state[0]`` counts open
        case stretches and ``state[k]`` open stretches of control ``k``.

        :param Windowkb, Stretchkb, numGapSNP, numMinSNP, WindowGap: forwarded
            unchanged to ``LOCH_MappingTools`` and echoed in the report header
        :param out: output path prefix; the report is written to ``<out>.txt``
        :param point: if truthy, additionally writes the per-marker table
            obtained from ``LOCH_MappingTools.PointHitonStretch``
        '''
        print("********************\n"
              f"IBD mapping started.\ninput file prefix: {self.prefix}\nWindowkb: {Windowkb}\nStretchkb: "
              f"{Stretchkb}\nnumGapSNP: {numGapSNP}\nnumMinSNP: {numMinSNP}\nWindowGap: {WindowGap}\noutput file prefix: {out}")
        with open(f"{out}.txt", 'w') as f:
            # From here on `out` is reused as a scratch buffer for report text.
            out = (f"Log_for_NonparametricIBDmapping\n\nInput_Genotype_File:\t{self.prefix}.bim/fam/bed\nInput_Case_File:\t{self.case_file}"
                   f"\nNo.SNP:\t{self.numSNP}\nWindow_kb:\t{Windowkb}\nStretch_kb:\t{Stretchkb}\nWindowGap_kb:\t{WindowGap}"
                   f"\nNo.InconsistentSNP:\t{numGapSNP}\nNo.MinSNP:\t{numMinSNP}\n\nNo.Samples:\t{self.numSample}\nNo.Cases:\t{self.numCase}\n"
                   f"\nCase individuals are: \n{self.case_list_print}\n\n")
            f.write(out)

            # --- stretches shared by all cases ---
            StretchLong = LOCH_MappingTools.LOCHMappingAll(self.case_geno, self.Position, Windowkb, numGapSNP, numMinSNP, Stretchkb, WindowGap)
            if point:
                PointHitFlag = LOCH_MappingTools.PointHitonStretch(StretchLong, self.Position)
            # An empty first entry is treated as "no stretch found".
            numStretchLong = len(StretchLong) if len(StretchLong[0]) else 0
            out = f'No.IBD stretch CaseOnly:\t{numStretchLong}'
            print(out)
            out = (f"\nNo.IBD_Stretch_in_All_Cases:\t{numStretchLong}\nIBD_stretch\tChr\tStart_SNP\tEnd_SNP"
                   f"\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
            f.write(out)
            for i in range(numStretchLong):
                # Each stretch is a (start, end) pair of marker indices.
                start = StretchLong[i][0]
                end = StretchLong[i][1]
                L = [str(i+1), str(self.Chr[start]), self.SNPID[start].decode('utf-8'), self.SNPID[end].decode('utf-8'),
                     str(self.bp[start]), str(self.bp[end]), f"{(self.bp[end] - self.bp[start])}\n"]
                out = '\t'.join(L)
                f.write(out)

            # --- stretches shared by all cases plus one control at a time ---
            if point:
                PointHitResult = [[0] * self.numSNP for i in range(self.numControl)]
            StretchLongControl = [None] * self.numControl
            if self.numControl:
                for i in range(self.numControl):
                    case_control_list = self.case_list + [self.control_list[i]]
                    case_control_geno = self.snpdata.val[self.snpreader.iid_to_index(case_control_list)]
                    StretchLongControl[i] = LOCH_MappingTools.LOCHMappingAll(case_control_geno, self.Position, Windowkb, numGapSNP, numMinSNP, Stretchkb, WindowGap)
                    if point:
                        PointHitFlagControl = LOCH_MappingTools.PointHitonStretch(StretchLongControl[i], self.Position)
                        PointHitResult[i] = PointHitFlagControl
                    numStretchLongControl = len(StretchLongControl[i]) if len(StretchLongControl[i][0]) else 0
                    out = (f"\nNo.IBD_stretch_in_All_Cases_and_1_Control({self.control_list[i][1].decode('utf-8')}):\t{numStretchLongControl}"
                           f"\nIBD_stretch\tChr\tStart_SNP\tEnd_SNP\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
                    f.write(out)
                    # NOTE(review): recomputes the count obtained just above.
                    numStretchLongControl = len(StretchLongControl[i])
                    if not StretchLongControl[i][0]:
                        numStretchLongControl = 0
                    for j in range(numStretchLongControl):
                        start = StretchLongControl[i][j][0]
                        end = StretchLongControl[i][j][1]
                        L = [str(j+1), str(self.Chr[start]), self.SNPID[start].decode('utf-8'), self.SNPID[end].decode('utf-8'),
                             str(self.bp[start]), str(self.bp[end]), f"{(self.bp[end] - self.bp[start])}\n"]
                        out = '\t'.join(L)
                        f.write(out)
            print('IBD mapping for Cases and Controls finished!!')

            # --- optional per-marker table ---
            if point:
                # Column-wise sum of the per-control flags.
                PointHitSumControl = [0] * len(self.Position)
                for i in range(len(PointHitResult)):
                    for j in range(len(PointHitResult[0])):
                        PointHitSumControl[j] += PointHitResult[i][j]
                out = ("\n\nIndividual_Marker/Sample_IBDstatus_in_IBDregions\n\nSNP\tChr\tbp\tNo.Cases_in_IBD"
                       "\tNo.Controls_in_IBD\tAll_Cases_in_IBD(Yes:1/No:0)")
                f.write(out)
                # One extra header column per control.
                L = []
                for i in range(len(self.control_list)):
                    L.append("\t{0}_in_IBD(Yes:1/No:0)".format(self.control_list[i][1].decode('utf-8')))
                L.append('\n')
                out = ''.join(L)
                f.write(out)
                # Only markers flagged for the case-only stretches are listed.
                for j in range(len(PointHitFlag)):
                    if PointHitFlag[j]:
                        L = [f"{self.SNPID[j].decode('utf-8')}\t{str(self.Chr[j])}\t{str(self.bp[j])}"
                             f"\t{str(len(self.case_list))}\t{str(PointHitSumControl[j])}\t{PointHitFlag[j]}"]
                        for i in range(len(PointHitResult)):
                            L.append(f'\t{PointHitResult[i][j]}')
                        L.append('\n')
                        out = ''.join(L)
                        f.write(out)

            # --- overlap summary: sweep over sorted stretch boundaries ---
            case_regions = np.asarray(StretchLong)
            if self.numControl:
                ctrl_regions = [np.asarray(i) for i in StretchLongControl]
                num_ctrl_regions = len(ctrl_regions)
                # NOTE(review): when the FIRST control has no stretch, every
                # control is left out of the summary below -- confirm intended.
                if not len(ctrl_regions[0][0]):
                    num_ctrl_regions = 0
            else:
                num_ctrl_regions = 0
            num_case_regions = len(case_regions)
            if not len(case_regions[0]):
                num_case_regions = 0
            # Edge = [marker_index, owner, kind]; owner 0 is the case-only
            # run, owner k >= 1 is control k; kind 0 opens, kind 1 closes.
            edges = []
            for i in range(num_case_regions):
                edges.append([case_regions[i][0], 0, 0])
                edges.append([case_regions[i][1], 0, 1])
            for i in range(num_ctrl_regions):
                num_ctrl_region = len(ctrl_regions[i])
                if not len(ctrl_regions[i][0]):
                    num_ctrl_region = 0
                for j in range(num_ctrl_region):
                    edges.append([ctrl_regions[i][j][0], i+1, 0])
                    edges.append([ctrl_regions[i][j][1], i+1, 1])
            state = [0] * (num_ctrl_regions+1)
            edges = sorted(edges)
            self.ibd_regions = []
            for i in range(len(edges)):
                if not edges[i][2]:
                    state[edges[i][1]] += 1
                else:
                    state[edges[i][1]] -= 1
                # Snapshot the state once all edges at this marker are applied.
                if i+1 == len(edges) or edges[i][0] != edges[i+1][0]:
                    self.ibd_regions.append([edges[i][0], copy.copy(state)])
            L = ["\n\nChr\tStart\tEnd\tIBD_in_Controls\tIBD_Case_specificity"]
            for i in range(len(self.ibd_regions) - 1):
                # Report only intervals inside a case stretch.
                if self.ibd_regions[i][1][0]!=0:
                    # Specificity = 1 - (open control stretches / number of controls).
                    prop = round(1.0-sum(self.ibd_regions[i][1][1:])/self.numControl, 3) if self.numControl else 1.0
                    L.append(f"{self.Chr[self.ibd_regions[i][0]]}\t{self.bp[self.ibd_regions[i][0]]}"
                             f"\t{self.bp[self.ibd_regions[i+1][0]]}\t{sum(self.ibd_regions[i][1][1:])}\t{prop}")
            L.append('\nCalculation_finished_at:\t{}\n'.format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
            out = '\n'.join(L)
            f.write(out)

    def rohmapping_gw(self, Windowkb, Stretchkb, numGapSNP, numMinSNP, WindowGap, out):
        '''Performs genome-wide runs of homozygosity mapping.

        Detects ROH per sample with ``LOCH_MappingTools.MakeROHonWindowMulti``
        and ``DecideROHStretchMulti`` (cases first, then controls if any),
        writes them to ``<out>.txt`` and stores per-interval ROH counts in
        ``self.roh_case_regions`` (and ``self.roh_control_regions`` when
        controls exist) as ``[marker_index, state]`` entries with one state
        slot per sample.

        :param Windowkb, Stretchkb, numGapSNP, numMinSNP, WindowGap: forwarded
            unchanged to ``LOCH_MappingTools`` and echoed in the report header
        :param out: output path prefix; the report is written to ``<out>.txt``
        '''
        print("********************\n"
              f"ROH mapping started.\ninput file prefix: {self.prefix}\nWindowkb: {Windowkb}\nStretchkb: {Stretchkb}"
              f"\nnumGapSNP: {numGapSNP}\nnumMinSNP: {numMinSNP}\nWindowGap: {WindowGap}\noutput file prefix: {out}")
        with open(f"{out}.txt", 'w') as f:
            # From here on `out` is reused as a scratch buffer for report text.
            out = (f"Log_for_ROHmapping\n\nInput_File:\t{self.prefix}.ped/map/info/case\nNo.SNP:\t{self.numSNP}\nWindow_kb:\t{Windowkb}"
                   f"\nStretch_kb:\t{Stretchkb}\nWindowGap_kb:\t{WindowGap}\nNo.InconsistentSNP:\t{numGapSNP}"
                   f"\nNo.MinSNP:\t{numMinSNP}\n\nNo.Samples:\t{self.numSample}\nNo.Cases:\t{self.numCase}\n\n")
            f.write(out)

            # --- ROH per case ---
            ROHwin = LOCH_MappingTools.MakeROHonWindowMulti(self.case_geno, self.Position, Windowkb, numGapSNP, numMinSNP, Stretchkb, WindowGap)
            StretchLongArray = LOCH_MappingTools.DecideROHStretchMulti(ROHwin, self.Position, Stretchkb, WindowGap)
            for i in range(self.numCase):
                # A zero end index in the first entry is counted as "no ROH".
                numStretchLongCase = len(StretchLongArray[i]) if (StretchLongArray[i][0][1]) else 0
                out = (f"\n\nNo.ROH_in_1_Case({self.case_list[i][1].decode('utf-8')}):\t{numStretchLongCase}"
                       f"\nROH\tChr\tStart_SNP\tEnd_SNP\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
                f.write(out)
                for j in range(len(StretchLongArray[i])):
                    start = StretchLongArray[i][j][0]
                    end = StretchLongArray[i][j][1]
                    # Entries ending at index 0 are skipped (presumably empty
                    # placeholders -- verify against LOCH_MappingTools).
                    if end == 0:
                        continue
                    L = [str(j+1), str(self.Chr[start]), self.SNPID[start].decode('utf-8'), self.SNPID[end].decode('utf-8'),
                         str(self.bp[start]), str(self.bp[end]), f"{(self.bp[end] - self.bp[start])}\n"]
                    out = '\t'.join(L)
                    f.write(out)

            # --- ROH per control ---
            if self.numControl:
                control_geno = self.snpdata.val[self.snpreader.iid_to_index(self.control_list)]
                ROHwinControl = LOCH_MappingTools.MakeROHonWindowMulti(control_geno, self.Position, Windowkb, numGapSNP, numMinSNP, Stretchkb, WindowGap)
                StretchLongControl = LOCH_MappingTools.DecideROHStretchMulti(ROHwinControl, self.Position, Stretchkb, WindowGap)
                for i in range(self.numControl):
                    numStretchLongControl = len(StretchLongControl[i]) if StretchLongControl[i][0][1] else 0
                    out = (f"\n\nNo.ROH_in_1_Control({self.control_list[i][1].decode('utf-8')}):\t{numStretchLongControl}"
                           f"\nROH\tChr\tStart_SNP\tEnd_SNP\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
                    f.write(out)
                    for j in range(len(StretchLongControl[i])):
                        start = StretchLongControl[i][j][0]
                        end = StretchLongControl[i][j][1]
                        if end == 0:
                            continue
                        L = [str(j+1), str(self.Chr[start]), self.SNPID[start].decode('utf-8'), self.SNPID[end].decode('utf-8'),
                             str(self.bp[start]), str(self.bp[end]), f"{(self.bp[end] - self.bp[start])}\n"]
                        out = '\t'.join(L)
                        f.write(out)
            print('ROH detection for Cases and Controls finished!!')

            # --- number of cases in ROH per interval (boundary sweep) ---
            case_regions = [np.asarray(i) for i in StretchLongArray]
            # Edge = [marker_index, sample_index, kind]; kind 0 opens, 1 closes.
            edges = []
            for i in range(len(case_regions)):
                for j in range(len(case_regions[i])):
                    edges.append([case_regions[i][j][0], i, 0])
                    edges.append([case_regions[i][j][1], i, 1])
            state = [0] * (len(case_regions))
            edges = sorted(edges)
            self.roh_case_regions = []
            for i in range(len(edges)):
                if not edges[i][2]:
                    state[edges[i][1]] += 1
                else:
                    state[edges[i][1]] -= 1
                # Snapshot the state once all edges at this marker are applied.
                if (i+1 == len(edges) or edges[i][0] != edges[i+1][0]):
                    self.roh_case_regions.append([edges[i][0], copy.copy(state)])
            L = ["\n\nChr\tStart\tEnd\tNumber_of_ROH_in_Cases\n"]
            for i in range(len(self.roh_case_regions) - 1):
                if np.sum(self.roh_case_regions[i][1]):
                    L.append(f"{self.Chr[self.roh_case_regions[i][0]]}\t{self.bp[self.roh_case_regions[i][0]]}"
                             f"\t{self.bp[self.roh_case_regions[i+1][0]]}\t{np.sum(self.roh_case_regions[i][1])}\n")
            out = ''.join(L)
            f.write(out)

            # --- same sweep for the controls ---
            if self.numControl:
                control_regions = [np.asarray(i) for i in StretchLongControl]
                edges = []
                for i in range(len(control_regions)):
                    for j in range(len(control_regions[i])):
                        edges.append([control_regions[i][j][0], i, 0])
                        edges.append([control_regions[i][j][1], i, 1])
                state = [0] * (len(control_regions))
                edges = sorted(edges)
                self.roh_control_regions = []
                for i in range(len(edges)):
                    if not edges[i][2]:
                        state[edges[i][1]] += 1
                    else:
                        state[edges[i][1]] -= 1
                    if (i+1 == len(edges) or edges[i][0] != edges[i+1][0]):
                        self.roh_control_regions.append([edges[i][0], copy.copy(state)])
                L = ["\n\nChr\tStart\tEnd\tNumber_of_ROH_in_Controls\n"]
                for i in range(len(self.roh_control_regions) - 1):
                    if np.sum(self.roh_control_regions[i][1]):
                        L.append(f"{self.Chr[self.roh_control_regions[i][0]]}\t{self.bp[self.roh_control_regions[i][0]]}"
                                 f"\t{self.bp[self.roh_control_regions[i+1][0]]}\t{np.sum(self.roh_control_regions[i][1])}\n")
                out = ''.join(L)
                f.write(out)
            out = '\nCalculation_finished_at:\t{}\n'.format(datetime.now().strftime("%Y/%m/%d %H:%M:%S"))
            f.write(out)

    def draw_diagram(self, fig_name):
        '''Draws diagrams of mapping results.

        Plots two stacked panels on a shared genome-wide x axis and saves the
        figure to ``fig_name``: the IBD case specificity (top) and the number
        of cases in ROH (bottom).  Reads ``self.ibd_regions`` and
        ``self.roh_case_regions``, so both mapping methods must have been run
        on this instance first.

        :param fig_name: output path handed to ``plt.savefig``
        '''
        # Hard-coded lengths for 22 chromosomes (presumably the human
        # autosomes of one specific reference build -- TODO confirm which).
        chr_length = [249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022,
                      141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753,
                      81195210, 78077248, 59128983, 63025520, 48129895, 51304566]
        chr_names = list(range(1,23))
        #chr_list = np.unique(self.Chr)
        #chr_length = pd.DataFrame(self.snpdata.pos).groupby(0, as_index=False).max()
        size = 22
        gap_length = 10 ** 7
        # Lower-triangular ones matrix: the dot product below yields the
        # cumulative sum of (chromosome length + gap).
        array = np.triu(np.ones((size,size))).T
        gap = np.full(size, gap_length)
        global_len = np.dot(array, chr_length + gap)
        global_len = np.insert(global_len, 0, 0)
        # global_len[i] is now the x offset at which chromosome i+1 starts.
        global_len += gap_length
        fig = plt.figure(figsize=(15,5))
        cmap = plt.get_cmap("tab10")
        ax1 = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        StChr, EdChr = 0, 22
        Stbp, Edbp = 0, global_len[EdChr]
        [i.set_xlim(Stbp, Edbp) for i in fig.get_axes()]

        # --- top panel: IBD case specificity ---
        ax1.set_ylim(-0.1, 1.0)
        ax1.set_xticklabels([])
        ax1.set_ylabel('IBD Case specificity')
        ax1.title.set_text('IBD mapping results based on SNP streak principle')
        ax1.spines['bottom'].set_visible(False)
        ax1.spines['top'].set_visible(False)
        ax1.tick_params(axis='both', which='both', length=0)
        ax1.axhline(0, color='k', linewidth=0.5)
        ax1.axhline(1, color='k', linewidth=0.5, ls='--')
        # Coloured chromosome bar below the axis.
        for i in range(size):
            ax1.add_patch(plt.Rectangle(xy=[global_len[i], -0.1], width=chr_length[i], height=0.05, color=cmap(i%10)))
        for i, reg in enumerate(self.ibd_regions):
            # Draw only intervals inside a case stretch; each interval runs
            # from this entry's marker to the next entry's marker.
            if reg[1][0]!=0:
                prop = 1 - sum(reg[1][1:])/self.numControl if self.numControl else 1.0
                ax1.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), 0],
                                            width=self.bp[self.ibd_regions[i+1][0]] - self.bp[reg[0]],
                                            height=prop, color=cmap((self.Chr[reg[0]] - 1) % 10)))
                # Red marker with a minimum width so short intervals stay visible.
                ax1.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), -0.04],
                                            width=max(self.bp[self.ibd_regions[i+1][0]] - self.bp[reg[0]], (Edbp - Stbp)*0.001),
                                            height=0.03, color='r'))

        # --- bottom panel: number of cases in ROH ---
        ax2.set_ylim(-self.numCase*0.1, self.numCase)
        # One tick per chromosome, centred between consecutive offsets.
        ax2.set_xticks([(global_len[i] + global_len[i+1])/2 for i in range(StChr, EdChr)])
        ax2.set_xticklabels([i for i in chr_names])
        ax2.set_ylabel('ROH in Cases')
        ax2.title.set_text('Runs of homozygosity detection results')
        ax2.spines['bottom'].set_visible(False)
        ax2.spines['top'].set_visible(False)
        ax2.tick_params(axis='both', which='both', length=0)
        ax2.axhline(0, color='k', linewidth=0.5)
        ax2.axhline(self.numCase, color='k', linewidth=0.5, ls='--')
        for i in range(size):
            ax2.add_patch(plt.Rectangle(xy=[global_len[i], -self.numCase * 0.1], width=chr_length[i],
                                        height=self.numCase * 0.05, color=cmap(i%10)))
        for i, reg in enumerate(self.roh_case_regions):
            numROH = np.sum(reg[1])
            if numROH:
                ax2.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), 0],
                                            width=self.bp[self.roh_case_regions[i+1][0]] - self.bp[reg[0]],
                                            height=numROH, color=cmap((self.Chr[reg[0]] - 1) % 10)))
                # Red marker where every case is in ROH.
                if numROH == self.numCase:
                    ax2.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), -0.03 * numROH],
                                                width=max(self.bp[self.roh_case_regions[i+1][0]] - self.bp[reg[0]], (Edbp - Stbp)*0.001),
                                                height=0.02 * numROH, color='r'))
        fig.align_labels()
        plt.savefig(fig_name)
class GRMLoaderSnpReader:
    """Constructs a background kernel :math:`K_0` from given binary PLINK 1 genotype files using the
    leave-one-out-chromosome (LOCO) strategy.

    Initially no background kernel is constructed, only the instance attributes are initialized.
    The kernel gets constructed when calling the method :func:`compute_background_kernel`, which should only be
    called after calling the :func:`update_individuals` method manually or the
    :func:`seak.data_loaders.intersect_and_update_datasets`.
    This way, individuals which are neither contained in the test nor in the background kernel data set are excluded.

    In full rank case, loads the SNPs in blocks to construct the kernel.
    In low rank case, loads all SNPs into memory at once.

    :param path_or_bed: path to the genotype PLINK bed file for background kernel construction,
        or an already constructed :class:`SnpReader` instance
    :param int blocksize: how many genotypes to load at once; should be chosen dependent on RAM available
    :param str/int LOCO_chrom_id: identifier of the chromosome/region that is used in the respective test set and
        should be excluded from the background kernel or None if all variants should be included
    :param bool forcelowrank: enforce low rank data loading behavior for testing purposes

    .. note:: The leave-one-chromosome-out (LOCO) strategy can be disabled with :attr:`LOCO_chrom_id`.
    """

    def __init__(self, path_or_bed, blocksize, LOCO_chrom_id=None, forcelowrank=False):
        """Constructor."""
        self.forcelowrank = forcelowrank  # only for testing purposes!
        if isinstance(path_or_bed, str):
            self.bed = Bed(path_or_bed, count_A1=True)
        else:
            assert isinstance(
                path_or_bed, SnpReader
            ), 'path_or_bed must either be a path to a bed-file, or an instance of SnpReader.'
            # Fix: the reader instance was validated but never stored, so the
            # attribute accesses below failed with AttributeError.
            self.bed = path_or_bed
        self.bed.pos[:, 0] = self.bed.pos[:, 0].astype(
            'str')  # chromosome should be str, stored positions are 1-based
        self.iid_fid = pd.DataFrame(self.bed.iid,
                                    index=self.bed.iid[:, 1].astype(str),
                                    columns=['fid', 'iid'])
        self.variants_to_include = self._get_LOCO_SNV_indices(LOCO_chrom_id)
        self.blocksize = blocksize
        self.nb_ind = None
        self.nb_SNVs_unf = None
        self.G0 = None
        self.K0 = None
        self.nb_SNVs_f = None
        self.samples_overlapped = False

    def _get_LOCO_SNV_indices(self, LOCO_chrom_id):
        """Returns the indices of the SNVs that should be included in the GRM.

        :param str/int LOCO_chrom_id: identifier of the chromosome/region that is used in the respective test set and
            should be excluded from the background kernel or None if all variants should be included
        :return: numerical indices of the SNVs to include in the background kernel computation
        :rtype: numpy.ndarray
        """
        if LOCO_chrom_id is None:
            return np.arange(self.bed.sid_count, dtype=int)
        chromosomes = self.bed.pos[:, 0].astype(str)
        # The chromosome column is compared as text, so an int identifier has
        # to be converted first; otherwise nothing would ever match.
        # NOTE(review): if ``pos`` is stored as float the labels look like
        # '1.0' -- confirm callers pass the identifier in that same format.
        return np.where(chromosomes != str(LOCO_chrom_id))[0]

    def update_individuals(self, iids):
        """Sets individuals to include into the background kernel data set based on individual ids (:attr:`iids`).

        :param iids: pandas.Series (or other label collection) of individual ids that should be retained for
            background kernel computation
        """
        iid_fid = self.iid_fid.loc[iids]
        self.bed = self.bed[self.bed.iid_to_index(iid_fid.values), :]
        self.samples_overlapped = True

    def get_iids(self):
        """Returns all individual ids.

        :return: individual ids of the genotype file, i.e. the index of the internal fid/iid table
        :rtype: numpy.ndarray
        """
        return self.iid_fid.index.values

    def _build_G0(self):
        """Low rank case: constructs :math:`G_0` from provided bed file (PLINK 1).

        :return: normalized genotypes :math:`G_0` and number of SNVs that where loaded
        :rtype: numpy.ndarray, int
        """
        temp_genotypes = self.bed[:, self.variants_to_include].read().standardize(Unit()).val
        # TODO: is invariant-filtering really necessary here?
        # A column is invariant when every individual equals the first one.
        invariant = (temp_genotypes == temp_genotypes[0, :]).all(0)
        n_filtered = (~invariant).sum()
        return temp_genotypes[:, ~invariant] / np.sqrt(n_filtered), n_filtered

    def _build_K0_blocked(self):
        """Full rank case: Builds background kernel :math:`K_0` by loading blocks of SNPs from provided bed file
        (PLINK 1).

        :return: normalized background kernel :math:`K_0` and number of SNVs that where used to built the kernel
        :rtype: numpy.ndarray, int
        """
        # TODO: make use of PySnpTools KernelReader functionality
        K0 = np.zeros([self.nb_ind, self.nb_ind], dtype=np.float32)
        nb_SNVs_filtered = 0
        # Fix: iterate over the variants actually included. With LOCO enabled
        # this list is shorter than the total SNV count, and looping up to the
        # total produced empty trailing blocks.
        stop = len(self.variants_to_include)
        for start in range(0, stop, self.blocksize):
            # Slicing clamps at the end, so the last block needs no special case.
            block = self.variants_to_include[start:start + self.blocksize]
            temp_genotypes = self.bed[:, block].read().standardize(Unit()).val
            # TODO: is invariant-filtering really necessary here?
            invariant = (temp_genotypes == temp_genotypes[0, :]).all(0)
            variable_genotypes = temp_genotypes[:, ~invariant]
            K0 += np.matmul(variable_genotypes, variable_genotypes.T)
            nb_SNVs_filtered += (~invariant).sum()
        return K0 / nb_SNVs_filtered, nb_SNVs_filtered

    def compute_background_kernel(self):
        """Computes background kernel :math:`K_0` for given set of genotypes (binary PLINK 1 files).

        Overlap with data of set to be tested should have been carried out before, such that individuals in both
        data sets match. Does not return anything but sets instance attributes for either the background kernel
        :math:`K_0` or the background kernel genotype matrix :math:`G_0`.
        """
        if not self.samples_overlapped:
            logging.warning(
                'Data to construct background kernel was not overlapped with data of set to be tested.'
            )
        self.nb_ind = self.bed.iid_count
        self.nb_SNVs_unf = self.bed.sid_count
        print('# of individuals for background kernel: {}'.format(self.nb_ind))
        print('# of (unfiltered) SNVs for background kernel: {}'.format(
            self.nb_SNVs_unf))
        # low rank
        if self.nb_ind > self.nb_SNVs_unf or self.forcelowrank:
            self.G0, self.nb_SNVs_f = self._build_G0()
            self.K0 = None
        # full rank
        else:
            self.G0 = None
            self.K0, self.nb_SNVs_f = self._build_K0_blocked()
        print('# of filtered SNVs for background kernel: {}'.format(
            self.nb_SNVs_f))

    def write_kernel(self, path, filetype='hdf5'):
        """Write constructed background kernel :math:`K_0` to file, using either pysnptools.kernelreader.KernelHdf5
        or pysnptools.kernelreader.KernelNpz.

        :param str path: Path to the output file to be created.
        :param str filetype: Either 'hdf5' or 'npz'
        :raises ValueError: if no kernel :math:`K_0` is available or :attr:`filetype` is unknown
        """
        if self.K0 is None:
            if self.G0 is not None:
                # Fix: the low rank path is taken when individuals OUTNUMBER
                # variants (or when forced); the old message said the opposite.
                raise ValueError(
                    'G0 is initialized: Number of individuals > number of variants (or forcelowrank is set). '
                    'In this case no kernel is constructed.'
                )
            raise ValueError(
                'K0 is not initialized, need to call compute_background_kernel() first'
            )
        if filetype == 'hdf5':
            writer = KernelHdf5
        elif filetype == 'npz':
            writer = KernelNpz
        else:
            raise ValueError(
                'filetype has to be either "npz" or "hdf5", got {}'.format(
                    filetype))
        # Fix: label the kernel with the individuals of the (possibly
        # subsetted and reordered) reader it was computed from; the unfiltered
        # fid/iid table no longer matches K0 after update_individuals().
        writer.write(path, KernelData(self.bed.iid, val=self.K0))
# [ 5 4001 4001] # [ 5 4002 4002] # ..., # [ 5 4997 4997] # [ 5 4998 4998] # [ 5 4999 4999]] #In one-line: chr5data = Bed("all.bed")[:,snpreader.pos[:,0] == 5].read() # You can turn iid or sid names into indexes snpreader = Bed("all.bed") iid0 =[['cid499P1','cid499P1'], ['cid489P1','cid489P1'], ['cid479P1','cid479P1']] indexes0 = snpreader.iid_to_index(iid0) print indexes0 #array([499, 489, 479]) snpreader0 = snpreader[indexes0,:] print snpreader0.iid #[['cid499P1' 'cid499P1'] # ['cid489P1' 'cid489P1'] # ['cid479P1' 'cid479P1']] # more condensed snpreader0 = snpreader[snpreader.iid_to_index(iid0),:] #both at once snpdata0chr5 = snpreader[snpreader.iid_to_index(iid0),snpreader.pos[:,0] == 5].read() print np.mean(snpdata0chr5.val) # 1.493