def read_with_specification(snpset_withbbed, order="F", dtype=SP.float64, force_python_only=False):
    """Read the SNP values selected by ``snpset_withbbed`` from its Bed file.

    Parameters:
        snpset_withbbed: object exposing a ``bed`` attribute; iterating over it
            yields the index (within the file) of every SNP to read.
        order: memory layout of the result, ``"F"`` or ``"C"``.
        dtype: ``SP.float64`` or ``SP.float32``.
        force_python_only: when true, skip the wrapped native parser even if
            it is available and decode the file in pure Python.

    Returns:
        dict with keys ``'rs'``, ``'pos'``, ``'snps'`` and ``'iid'``, where
        ``'snps'`` is the (individual x SNP) value matrix.

    Raises:
        Exception: on the native path, when ``order`` is not ``"F"``/``"C"``
            or ``dtype`` is not float64/float32.
    """
    # doesn't need to self.run_once() because it is static
    decide_once_on_plink_reader()
    global WRAPPED_PLINK_PARSER_PRESENT

    bed = snpset_withbbed.bed
    (iid_count_in, iid_count_out, iid_index_out,
     snp_count_in, snp_count_out, snp_index_out) = bed.counts_and_indexes(snpset_withbbed)

    if WRAPPED_PLINK_PARSER_PRESENT and not force_python_only:
        from pysnptools.snpreader import wrap_plink_parser
        SNPs = SP.zeros((iid_count_out, snp_count_out), order=order, dtype=dtype)
        bed_fn = bed.basefilename + ".bed"

        # Both the dtype and the order are validated for every combination, so
        # an unsupported request can never fall through and return the
        # untouched all-zero matrix.
        if dtype == SP.float64:
            if order == "F":
                wrap_plink_parser.readPlinkBedFiledoubleFAAA(
                    bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFiledoubleCAAA(
                    bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        elif dtype == SP.float32:
            if order == "F":
                wrap_plink_parser.readPlinkBedFilefloatFAAA(
                    bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFilefloatCAAA(
                    bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
    else:
        # An earlier version of this code had a way to read consecutive SNPs in
        # one read. May want to add that ability back to the code.
        # Also, note that reading with python will often result in
        # non-contiguous memory, so the python standardizers will automatically
        # be used, too.
        logging.warning("using pure python plink parser (might be much slower!!)")

        # Each byte packs four 2-bit codes, so one SNP occupies
        # ceil(iid_count_in / 4) bytes.
        nbyte = int((iid_count_in + 3) // 4)
        # Allocate a little big (a multiple of four rows); trimmed below.
        SNPs = SP.zeros((nbyte * 4, snp_count_out), order=order, dtype=dtype)
        # Value stored for each 2-bit code: 0 -> 0, 1 -> NaN, 2 -> 1, 3 -> 2.
        value_of_code = SP.array([0, SP.nan, 1, 2], dtype=dtype)

        for SNPsIndex, bimIndex in enumerate(snpset_withbbed):
            # SNP data starts after the first 3 bytes of the file
            # (presumably the .bed header -- confirm against the format spec).
            bed._filepointer.seek(int(nbyte * bimIndex + 3))
            packed = SP.array(bytearray(bed._filepointer.read(nbyte)), dtype="uint8")
            # reshape fails loudly if the file returned fewer bytes than expected
            packed = packed.reshape((nbyte,))
            for slot in range(4):
                # slot k of every byte belongs to rows k, k+4, k+8, ...
                codes = (packed >> (2 * slot)) & 3
                SNPs[slot::4, SNPsIndex] = value_of_code[codes]

        SNPs = SNPs[iid_index_out, :]  # reorder or trim any extra allocation

    ret = {
        'rs': bed.rs[snp_index_out],
        'pos': bed.pos[snp_index_out, :],
        'snps': SNPs,
        'iid': bed.original_iids[iid_index_out, :]
    }
    return ret
def read_with_specification(snpset_withbbed, order="F", dtype=SP.float64, force_python_only=False):
    """Load the SNP matrix described by ``snpset_withbbed`` from its Bed file.

    Parameters:
        snpset_withbbed: object with a ``bed`` attribute; iterating over it
            yields the in-file index of each SNP to load.
        order: ``"F"`` or ``"C"`` memory layout for the returned matrix.
        dtype: ``SP.float64`` or ``SP.float32``.
        force_python_only: if true, always use the pure Python decoder instead
            of the wrapped native parser.

    Returns:
        dict with the keys ``'rs'``, ``'pos'``, ``'snps'`` (the individual x
        SNP matrix) and ``'iid'``.

    Raises:
        Exception: on the native path, for an ``order`` other than
            ``"F"``/``"C"`` or a ``dtype`` other than float64/float32.
    """
    # doesn't need to self.run_once() because it is static
    decide_once_on_plink_reader()
    global WRAPPED_PLINK_PARSER_PRESENT

    bed = snpset_withbbed.bed
    (iid_count_in, iid_count_out, iid_index_out,
     snp_count_in, snp_count_out, snp_index_out) = bed.counts_and_indexes(snpset_withbbed)

    use_native = WRAPPED_PLINK_PARSER_PRESENT and not force_python_only
    if use_native:
        from pysnptools.snpreader import wrap_plink_parser
        SNPs = SP.zeros((iid_count_out, snp_count_out), order=order, dtype=dtype)
        bed_fn = bed.basefilename + ".bed"
        native_args = (bed_fn, iid_count_in, snp_count_in, iid_index_out, snp_index_out, SNPs)

        # Every (dtype, order) combination is checked explicitly; previously
        # the float32 branch had a single trailing ``else`` for both checks,
        # so one bad input could return the zero-filled matrix unnoticed.
        if dtype == SP.float64:
            if order == "F":
                wrap_plink_parser.readPlinkBedFiledoubleFAAA(*native_args)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFiledoubleCAAA(*native_args)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        elif dtype == SP.float32:
            if order == "F":
                wrap_plink_parser.readPlinkBedFilefloatFAAA(*native_args)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFilefloatCAAA(*native_args)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
    else:
        # An earlier version of this code had a way to read consecutive SNPs in
        # one read. May want to add that ability back to the code.
        # Also, note that reading with python will often result in
        # non-contiguous memory, so the python standardizers will automatically
        # be used, too.
        logging.warning("using pure python plink parser (might be much slower!!)")

        # Four 2-bit codes per byte: one SNP takes ceil(iid_count_in / 4) bytes.
        bytes_per_snp = int((iid_count_in + 3) // 4)
        # Rows are rounded up to a multiple of four; the extra rows are
        # dropped by the final row selection.
        SNPs = SP.zeros((bytes_per_snp * 4, snp_count_out), order=order, dtype=dtype)
        # 2-bit code -> stored value: 0 -> 0, 1 -> NaN, 2 -> 1, 3 -> 2.
        decode_table = SP.array([0, SP.nan, 1, 2], dtype=dtype)

        for SNPsIndex, bimIndex in enumerate(snpset_withbbed):
            # The +3 skips the first 3 bytes of the file (presumably the .bed
            # header -- confirm against the format spec).
            offset = int(bytes_per_snp * bimIndex + 3)
            bed._filepointer.seek(offset)
            raw = SP.array(bytearray(bed._filepointer.read(bytes_per_snp)), dtype="uint8")
            # reshape raises if fewer bytes than expected were read
            raw = raw.reshape((bytes_per_snp,))
            for position in range(4):
                # the code at bit pair `position` feeds rows position, position+4, ...
                two_bit_codes = (raw >> (2 * position)) & 3
                SNPs[position::4, SNPsIndex] = decode_table[two_bit_codes]

        SNPs = SNPs[iid_index_out, :]  # reorder or trim any extra allocation

    ret = {
        'rs': bed.rs[snp_index_out],
        'pos': bed.pos[snp_index_out, :],
        'snps': SNPs,
        'iid': bed.original_iids[iid_index_out, :]
    }
    return ret
def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python_only, view_ok):
    """Read the selected individuals and SNPs from the Bed file into an array.

    Parameters:
        iid_index_or_none: sequence of individual (row) indexes to return, or
            ``None`` for all individuals in file order.
        sid_index_or_none: sequence of SNP (column) indexes to return, or
            ``None`` for all SNPs in file order.
        order: ``'F'``, ``'C'`` or ``'A'`` (``'A'`` is treated as ``'F'``).
        dtype: ``np.float64`` or ``np.float32``.
        force_python_only: when true, decode the file in pure Python instead of
            calling the wrapped native parser.
        view_ok: accepted for interface compatibility; not used here.

    Returns:
        A ``len(iids) x len(sids)`` array of values.

    Raises:
        Exception: on the native path, when ``order`` or ``dtype`` is not
            supported.
    """
    self._run_once()

    if order == 'A':
        order = 'F'

    assert not hasattr(self, 'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

    iid_count_in = self.iid_count
    sid_count_in = self.sid_count

    if iid_index_or_none is not None:
        iid_count_out = len(iid_index_or_none)
        iid_index_out = iid_index_or_none
    else:
        iid_count_out = iid_count_in
        iid_index_out = range(iid_count_in)

    if sid_index_or_none is not None:
        sid_count_out = len(sid_index_or_none)
        sid_index_out = sid_index_or_none
    else:
        sid_count_out = sid_count_in
        sid_index_out = range(sid_count_in)

    if not force_python_only:
        from pysnptools.snpreader import wrap_plink_parser
        val = np.zeros((iid_count_out, sid_count_out), order=order, dtype=dtype)
        bed_fn = SnpReader._name_of_other_file(self.filename, "bed", "bed")
        native_args = (bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)

        if dtype == np.float64:
            if order == "F":
                wrap_plink_parser.readPlinkBedFiledoubleFAAA(*native_args)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFiledoubleCAAA(*native_args)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        elif dtype == np.float32:
            if order == "F":
                wrap_plink_parser.readPlinkBedFilefloatFAAA(*native_args)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFilefloatCAAA(*native_args)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
    else:
        # An earlier version of this code had a way to read consecutive SNPs in
        # one read. May want to add that ability back to the code.
        # Also, note that reading with python will often result in
        # non-contiguous memory, so the python standardizers will automatically
        # be used, too.
        logging.warning("using pure python plink parser (might be much slower!!)")

        # Four 2-bit codes per byte: one SNP takes ceil(iid_count_in / 4) bytes.
        bytes_per_sid = int((iid_count_in + 3) // 4)
        # Allocate a little big (row count rounded up to a multiple of four);
        # the surplus rows are dropped by the row selection below.
        val = np.zeros((bytes_per_sid * 4, sid_count_out), order=order, dtype=dtype)
        # 2-bit code -> stored value: 0 -> 0, 1 -> NaN, 2 -> 1, 3 -> 2.
        value_of_code = np.array([0, np.nan, 1, 2], dtype=dtype)

        for SNPsIndex, bimIndex in enumerate(sid_index_out):
            # The +3 skips the first 3 bytes of the file (presumably the .bed
            # header -- confirm against the format spec).
            self._filepointer.seek(int(bytes_per_sid * bimIndex + 3))
            packed = np.array(bytearray(self._filepointer.read(bytes_per_sid)), dtype=np.uint8)
            # reshape raises if fewer bytes than expected were read
            packed = packed.reshape((bytes_per_sid,))
            for slot in range(4):
                # the code at bit pair `slot` feeds rows slot, slot+4, slot+8, ...
                codes = (packed >> (2 * slot)) & 3
                val[slot::4, SNPsIndex] = value_of_code[codes]

        val = val[iid_index_out, :]  # reorder or trim any extra allocation
        #!!LATER this can fail because the trim statement above messes up the order
        #assert(SnpReader._array_properties_are_ok(val, order, dtype)) #!!

    return val