def write(filename, snpdata): """Writes a :class:`SnpData` to dat/fam/map format. :param filename: the name of the file to create :type filename: string :param snpdata: The in-memory data that should be written to disk. :type snpdata: :class:`SnpData` >>> from pysnptools.snpreader import Dat, Bed >>> import pysnptools.util as pstutil >>> snpdata = Bed('../examples/toydata.bed',count_A1=False)[:,:10].read() # Read first 10 snps from Bed format >>> pstutil.create_directory_if_necessary("tempdir/toydata10.dat") >>> Dat.write("tempdir/toydata10.dat",snpdata) # Write data in dat/fam/map format """ if isinstance(filename,SnpData) and isinstance(snpdata,str): #For backwards compatibility, reverse inputs if necessary warnings.warn("write statement should have filename before data to write", DeprecationWarning) filename, snpdata = snpdata, filename SnpReader._write_fam(snpdata, filename, remove_suffix="dat") SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="dat", add_suffix="map") filename = SnpReader._name_of_other_file(filename,remove_suffix="dat", add_suffix="dat") snpsarray = snpdata.val with open(filename,"w") as dat_filepointer: for sid_index, sid in enumerate(snpdata.sid): if sid_index % 1000 == 0: logging.info("Writing snp # {0} to file '{1}'".format(sid_index, filename)) dat_filepointer.write("{0}\tj\tn\t".format(sid)) #use "j" and "n" as the major and minor allele row = snpsarray[:,sid_index] dat_filepointer.write("\t".join((str(i) for i in row)) + "\n") logging.info("Done writing " + filename)
def write(filename, snpdata): """Writes a :class:`SnpData` to Ped format. The values must be 0,1,2. The direction of the encoding to allele pairs is arbitrary. This means that if a SnpData is written in Ped format and then read back, then 0's may become 2's and 2's may become 0's. (1's will stay 1's). :param filename: the name of the file to create :type filename: string :param snpdata: The in-memory data that should be written to disk. :type snpdata: :class:`SnpData` >>> from pysnptools.snpreader import Ped, Bed >>> import pysnptools.util as pstutil >>> snpdata = Bed('../examples/toydata.bed',count_A1=False)[:,:10].read() # Read first 10 snps from Bed format >>> pstutil.create_directory_if_necessary("tempdir/toydata10.ped") >>> Ped.write("tempdir/toydata10.ped",snpdata) # Write data in Ped format """ if isinstance(filename, SnpData) and isinstance( snpdata, str ): #For backwards compatibility, reverse inputs if necessary warnings.warn( "write statement should have filename before data to write", DeprecationWarning) filename, snpdata = snpdata, filename SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="ped", add_suffix="map") # The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory: # Family ID # Case ID # Paternal ID # Maternal ID # Sex (1=male; 2=female; other=unknown) # Phenotype pedfile = SnpReader._name_of_other_file(filename, remove_suffix="ped", add_suffix="ped") with open(pedfile, "w") as ped_filepointer: for iid_index, iid_row in enumerate(snpdata.iid): ped_filepointer.write("{0} {1} 0 0 0 0".format( iid_row[0], iid_row[1])) row = snpdata.val[iid_index, :] for sid_index, val in enumerate(row): if val == 0: s = "A A" elif val == 1: s = "A G" elif val == 2: s = "G G" elif np.isnan(val): s = "0 0" else: raise Exception( "Expect values for ped file to be 0,1,2, or NAN. Instead, saw '{0}'" .format(val)) ped_filepointer.write("\t" + s) ped_filepointer.write("\n")
def write(snpdata, basefilename):
    SnpReader._write_map_or_bim(snpdata, basefilename, remove_suffix="ped", add_suffix="map")

    # The PED file is a white-space (space or tab) delimited file; the first six columns are mandatory:
    #     Family ID
    #     Individual ID
    #     Paternal ID
    #     Maternal ID
    #     Sex (1=male; 2=female; other=unknown)
    #     Phenotype
    pedfile = SnpReader._name_of_other_file(basefilename, remove_suffix="ped", add_suffix="ped")
    with open(pedfile, "w") as ped_filepointer:
        for iid_index, iid_row in enumerate(snpdata.iid):
            ped_filepointer.write("{0} {1} 0 0 0 0".format(iid_row[0], iid_row[1]))
            row = snpdata.val[iid_index, :]
            for sid_index, val in enumerate(row):
                if val == 0:
                    s = "A A"
                elif val == 1:
                    s = "A G"
                elif val == 2:
                    s = "G G"
                elif np.isnan(val):  # note: 'val == np.nan' is always False, so NaN must be tested with np.isnan
                    s = "0 0"
                else:
                    raise Exception("Expect values for ped file to be 0,1,2, or NAN. Instead, saw '{0}'".format(val))
                ped_filepointer.write("\t" + s)
            ped_filepointer.write("\n")
def write(snpdata, basefilename, force_python_only=False):
    SnpReader._write_fam(snpdata, basefilename, remove_suffix="bed")
    SnpReader._write_map_or_bim(snpdata, basefilename, remove_suffix="bed", add_suffix="bim")

    bedfile = SnpReader._name_of_other_file(basefilename, remove_suffix="bed", add_suffix="bed")

    if not force_python_only:
        from pysnptools.snpreader import wrap_plink_parser

        if snpdata.val.flags["C_CONTIGUOUS"]:
            order = "C"
        elif snpdata.val.flags["F_CONTIGUOUS"]:
            order = "F"
        else:
            raise Exception("expect snpdata.val to be either C-contiguous or F-contiguous")

        if snpdata.val.dtype == np.float64:
            if order == "F":
                wrap_plink_parser.writePlinkBedFiledoubleFAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
            else:
                wrap_plink_parser.writePlinkBedFiledoubleCAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
        elif snpdata.val.dtype == np.float32:
            if order == "F":
                wrap_plink_parser.writePlinkBedFilefloatFAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
            else:
                wrap_plink_parser.writePlinkBedFilefloatCAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(snpdata.val.dtype))
    else:
        with open(bedfile, "wb") as bed_filepointer:
            # see http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml
            bed_filepointer.write(chr(0b01101100))  # magic numbers
            bed_filepointer.write(chr(0b00011011))  # magic numbers
            bed_filepointer.write(chr(0b00000001))  # snp major

            for sid_index in xrange(snpdata.sid_count):
                if sid_index % 1 == 0:
                    logging.info("Writing snp # {0} to file '{1}'".format(sid_index, basefilename))
                col = snpdata.val[:, sid_index]
                for iid_by_four in xrange(0, snpdata.iid_count, 4):
                    vals_for_this_byte = col[iid_by_four:iid_by_four + 4]
                    byte = 0b00000000
                    for val_index in xrange(len(vals_for_this_byte)):
                        val = vals_for_this_byte[val_index]
                        if val == 0:
                            code = 0b00
                        elif val == 1:
                            code = 0b10  # backwards on purpose
                        elif val == 2:
                            code = 0b11
                        elif np.isnan(val):
                            code = 0b01  # backwards on purpose
                        else:
                            raise Exception("Can't convert value '{0}' to BED format (only 0,1,2,NAN allowed)".format(val))
                        byte |= (code << (val_index * 2))
                    bed_filepointer.write(chr(byte))
    logging.info("Done writing " + basefilename)
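# A minimal sketch (not part of the library) of the 2-bits-per-genotype packing
# used by the pure-Python BED writer above: four genotypes share one byte, with
# the first individual occupying the two lowest-order bits.
import numpy as np

def pack_four(vals):  # vals: up to four genotype values (0, 1, 2, or NaN)
    code_of = {0: 0b00, 1: 0b10, 2: 0b11}  # 1 and NaN codes are "backwards on purpose"
    byte = 0
    for i, val in enumerate(vals):
        code = 0b01 if np.isnan(val) else code_of[val]
        byte |= code << (i * 2)
    return byte

print(bin(pack_four([2.0, 1.0, 0.0, float("nan")])))  # 0b1001011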
def __init__(self, internal, iid_indexer, sid_indexer):
    '''
    an indexer can be:
        an integer i (same as [i])
        a slice
        a list of integers
        a list of booleans
    '''
    self._internal = internal
    self._iid_indexer = SnpReader._make_sparray_or_slice(iid_indexer)
    self._sid_indexer = SnpReader._make_sparray_or_slice(sid_indexer)
def run_once(self):
    if self._ran_once:
        return
    self._ran_once = True

    self._iid = SnpReader._read_fam(self.dat_filename, remove_suffix="dat")
    self._sid, self._pos = SnpReader._read_map_or_bim(self.dat_filename, remove_suffix="dat", add_suffix="map")
    self._assert_iid_sid_pos()
    return self
def write(snpdata, basefilename):
    SnpReader._write_fam(snpdata, basefilename, remove_suffix="dat")
    SnpReader._write_map_or_bim(snpdata, basefilename, remove_suffix="dat", add_suffix="map")

    snpsarray = snpdata.val
    with open(basefilename, "w") as dat_filepointer:
        for sid_index, sid in enumerate(snpdata.sid):
            if sid_index % 1000 == 0:
                logging.info("Writing snp # {0} to file '{1}'".format(sid_index, basefilename))
            dat_filepointer.write("{0}\tj\tn\t".format(sid))  # use "j" and "n" as the major and minor allele
            row = snpsarray[:, sid_index]
            dat_filepointer.write("\t".join(str(i) for i in row) + "\n")
    logging.info("Done writing " + basefilename)
def compose_indexer_with_indexer(countA, indexerA, countB, indexerB):
    if _Subset._is_all_slice(indexerA):
        return indexerB

    if _Subset._is_all_slice(indexerB):
        return indexerA

    indexA = SnpReader._make_sparray_from_sparray_or_slice(countA, indexerA)
    indexB = SnpReader._make_sparray_from_sparray_or_slice(countB, indexerB)

    indexAB = indexA[indexB]
    return indexAB
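# A minimal sketch (not part of the library) of what composing two indexers
# means: applying indexA and then indexB to a sequence is the same as applying
# the single composed indexer indexA[indexB] once.
import numpy as np

data = np.array([10, 11, 12, 13, 14, 15])
indexA = np.array([5, 3, 1])   # first selection
indexB = np.array([2, 0])      # selection applied to the first result
assert np.array_equal(data[indexA][indexB], data[indexA[indexB]])
print(indexA[indexB])          # [1 5]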
def _read_pstdata(self):
    row = SnpReader._read_fam(self.filename, remove_suffix="dat")
    col, col_property = SnpReader._read_map_or_bim(self.filename, remove_suffix="dat", add_suffix="map")
    if len(row) == 0 or len(col) == 0:
        return SnpData(iid=row, sid=col, pos=col_property, val=np.empty([len(row), len(col)]))

    datfields = pd.read_csv(self.filename, delimiter='\t', header=None, index_col=False)
    if not np.array_equal(np.array(datfields[0], dtype="string"), col):
        raise Exception("Expect snp list in map file to exactly match snp list in dat file")
    del datfields[0]
    del datfields[1]
    del datfields[2]
    assert len(row) == datfields.shape[1], "Expect # iids in fam file to match dat file"
    val = datfields.as_matrix().T
    snpdata = SnpData(iid=row, sid=col, pos=col_property, val=val)
    return snpdata
def copyinputs(self, copier):
    # doesn't need to self.run_once() because only uses original inputs
    copier.input(SnpReader._name_of_other_file(self.filename, remove_suffix="bed", add_suffix="bed"))
    copier.input(SnpReader._name_of_other_file(self.filename, remove_suffix="bed", add_suffix="bim"))
    copier.input(SnpReader._name_of_other_file(self.filename, remove_suffix="bed", add_suffix="fam"))
def write(filename, snpdata): """Writes a :class:`SnpData` to Ped format. The values must be 0,1,2. The direction of the encoding to allele pairs is arbitrary. This means that if a SnpData is written in Ped format and then read back, then 0's may become 2's and 2's may become 0's. (1's will stay 1's). :param filename: the name of the file to create :type filename: string :param snpdata: The in-memory data that should be written to disk. :type snpdata: :class:`SnpData` >>> from pysnptools.snpreader import Ped, Bed >>> import pysnptools.util as pstutil >>> snpdata = Bed('../examples/toydata.bed',count_A1=False)[:,:10].read() # Read first 10 snps from Bed format >>> pstutil.create_directory_if_necessary("tempdir/toydata10.ped") >>> Ped.write("tempdir/toydata10.ped",snpdata) # Write data in Ped format """ if isinstance(filename,SnpData) and isinstance(snpdata,str): #For backwards compatibility, reverse inputs if necessary warnings.warn("write statement should have filename before data to write", DeprecationWarning) filename, snpdata = snpdata, filename SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="ped", add_suffix="map") # The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory: # Family ID # Case ID # Paternal ID # Maternal ID # Sex (1=male; 2=female; other=unknown) # Phenotype pedfile = SnpReader._name_of_other_file(filename, remove_suffix="ped", add_suffix="ped") with open(pedfile,"w") as ped_filepointer: for iid_index, iid_row in enumerate(snpdata.iid): ped_filepointer.write("{0} {1} 0 0 0 0".format(iid_row[0],iid_row[1])) row = snpdata.val[iid_index,:] for sid_index, val in enumerate(row): if val == 0: s = "A A" elif val == 1: s = "A G" elif val == 2: s = "G G" elif np.isnan(val): s = "0 0" else: raise Exception("Expect values for ped file to be 0,1,2, or NAN. Instead, saw '{0}'".format(val)) ped_filepointer.write("\t"+s) ped_filepointer.write("\n")
def _open_pgen(self, iid_index_or_none=None):
    pgenfile = SnpReader._name_of_other_file(self.filename, "pgen", "pgen")
    self._filepointer = PgenReader(pgenfile,
                                   raw_sample_ct=None,
                                   variant_ct=None,
                                   sample_subset=iid_index_or_none)
def _run_once(self):
    if self._ran_once:
        return
    self._ran_once = True

    self._row = SnpReader._read_fam(self.filename, remove_suffix="bed")
    self._col, self._col_property = SnpReader._read_map_or_bim(self.filename, remove_suffix="bed", add_suffix="bim")
    self._assert_iid_sid_pos()

    bedfile = SnpReader._name_of_other_file(self.filename, "bed", "bed")
    self._filepointer = open(bedfile, "rb")
    mode = self._filepointer.read(2)
    if mode != 'l\x1b':
        raise Exception('No valid binary BED file')
    mode = self._filepointer.read(1)  # \x01 = SNP major, \x00 = individual major
    if mode != '\x01':
        raise Exception('only SNP-major is implemented')
    logging.info("bed file is open {0}".format(bedfile))
def _read_kernel(train, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False):
    '''
    The method creates a kernel for the in-memory SNP data. It handles these cases:
        * No standardization is needed & everything is in memory OR uses the FROM-DISK method
    '''
    from pysnptools.pstreader import PstReader

    # Just do a 'python' dot, if no standardization is needed and everything is the right type
    if isinstance(standardizer, Identity) and train.val.dtype == dtype:
        ts = time.time()
        # is_worth_logging = train.val.shape[0] * train.val.shape[1] * test.val.shape[0] > 1e9
        # if is_worth_logging: logging.info("  _read_kernel about to multiply train{0} x test{1}".format(train.val.shape, test.val.shape))
        if order == 'F':  # numpy's 'dot' always returns 'C' order
            K = (train.val.dot(train.val.T)).T
        else:
            K = train.val.dot(train.val.T)
        assert PstReader._array_properties_are_ok(K, order, dtype), "internal error: K is not of the expected order or dtype"
        # if is_worth_logging: logging.info("  _read_kernel took %.2f seconds" % (time.time() - ts))
        if return_trained:
            return K, standardizer
        else:
            return K
    else:  # Do things the more general SnpReader way.
        return SnpReader._read_kernel(train, standardizer, block_size=block_size, order=order, dtype=dtype, force_python_only=force_python_only, view_ok=view_ok, return_trained=return_trained)
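# A minimal sketch (not part of the library) of the shortcut taken above: when
# no standardization is needed, the kernel is just the SNP value matrix times
# its own transpose, giving an iid_count x iid_count similarity matrix.
import numpy as np

val = np.array([[0., 1., 2.],
                [2., 1., 0.]])   # 2 individuals x 3 SNPs
K = val.dot(val.T)               # 2 x 2 kernel
print(K)
# [[5. 1.]
#  [1. 5.]]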
def _run_once(self):
    if self._ran_once:
        return
    self._ran_once = True

    if not hasattr(self, "_row"):
        self._row = SnpReader._read_fam(self.filename, remove_suffix="bed")

    if not hasattr(self, "_col") or not hasattr(self, "_col_property"):
        self._col, self._col_property = SnpReader._read_map_or_bim(self.filename, remove_suffix="bed", add_suffix="bim")
    self._assert_iid_sid_pos()

    if not self.skip_format_check:
        self._open_bed()
        self._close_bed()
def row(self): """*same as* :attr:`iid` """ if not hasattr(self, "_row"): self._row = SnpReader._read_fam(self.filename, remove_suffix="pgen") return self._row
def col_property(self):
    """*same as* :attr:`pos`
    """
    if not hasattr(self, "_col_property"):
        self._col, self._col_property = SnpReader._read_map_or_bim(self.filename, remove_suffix="bed", add_suffix="bim")
    return self._col_property
def __init__(self, filename, missing='0'):
    '''
    filename : string of the filename of the ped file
    missing  : string indicating a missing genotype (default '0')
    '''
    self.filename = SnpReader._name_of_other_file(filename, remove_suffix="ped", add_suffix="ped")
    self.missing = missing
def _open_bed(self):
    bedfile = SnpReader._name_of_other_file(self.filename, "bed", "bed")
    self._filepointer = open(bedfile, "rb")
    mode = self._filepointer.read(2)
    if mode != 'l\x1b':
        raise Exception('No valid binary BED file')
    mode = self._filepointer.read(1)  # \x01 = SNP major, \x00 = individual major
    if mode != '\x01':
        raise Exception('only SNP-major is implemented')
    logging.info("bed file is open {0}".format(bedfile))
def kernel(self, standardizer, blocksize=10000, allowlowrank=False):
    """
    See :meth:`.SnpReader.kernel` for details and examples.
    """
    if type(standardizer) is Identity:
        K = self.val.dot(self.val.T)
        return K
    else:
        K = SnpReader.kernel(self, standardizer, blocksize=blocksize, allowlowrank=allowlowrank)
        return K
def col(self): """*same as* :attr:`sid` """ if not hasattr(self, "_col"): self._col, self._col_property = SnpReader._read_map_or_bim( self.filename, remove_suffix="pgen", add_suffix="bim", max_filesize=1E9) return self._col
def __init__(self, filename, missing='0'):
    '''
    filename : string of the filename of the ped file
    missing  : string indicating a missing genotype (default '0')
    '''
    super(Ped, self).__init__()
    self.filename = SnpReader._name_of_other_file(filename, remove_suffix="ped", add_suffix="ped")
    self.missing = missing
def run_once(self):
    if self._ran_once:
        return
    self._ran_once = True

    self._sid, self._pos = SnpReader._read_map_or_bim(self.basefilename, remove_suffix="ped", add_suffix="map")
    pedfile = SnpReader._name_of_other_file(self.basefilename, remove_suffix="ped", add_suffix="ped")
    ped = np.loadtxt(pedfile, dtype='str', comments=None)
    self._iid = ped[:, 0:2]
    self._assert_iid_sid_pos()
    snpsstr = ped[:, 6::]
    inan = snpsstr == self.missing
    self._snps = np.zeros((snpsstr.shape[0], snpsstr.shape[1] // 2))
    for i in xrange(snpsstr.shape[1] // 2):
        self._snps[inan[:, 2 * i], i] = np.nan
        vals = snpsstr[~inan[:, 2 * i], 2 * i:2 * (i + 1)]
        self._snps[~inan[:, 2 * i], i] += (vals == vals[0, 0]).sum(1)
def _read_pstdata(self):
    col, col_property = SnpReader._read_map_or_bim(self.filename, remove_suffix="ped", add_suffix="map")
    ped = np.loadtxt(self.filename, dtype='str', comments=None)
    row = ped[:, 0:2]
    snpsstr = ped[:, 6::]
    inan = snpsstr == self.missing
    snps = np.zeros((snpsstr.shape[0], snpsstr.shape[1] // 2))
    for i in xrange(snpsstr.shape[1] // 2):
        snps[inan[:, 2 * i], i] = np.nan
        vals = snpsstr[~inan[:, 2 * i], 2 * i:2 * (i + 1)]
        snps[~inan[:, 2 * i], i] += (vals == vals[0, 0]).sum(1)
    snpdata = SnpData(iid=row, sid=col, pos=col_property, val=snps)
    return snpdata
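# A minimal sketch (not part of the library) of the allele-counting trick used
# above: for each SNP, (vals == vals[0, 0]).sum(1) counts how many of an
# individual's two alleles match the first listed individual's first allele,
# which yields the 0/1/2 coding (and is why its direction is arbitrary).
import numpy as np

vals = np.array([["A", "A"],
                 ["A", "G"],
                 ["G", "G"]])       # allele pairs for one SNP, three individuals
print((vals == vals[0, 0]).sum(1))  # [2 1 0]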
def _read_pstdata(self):
    col, col_property = SnpReader._read_map_or_bim(self.filename, remove_suffix="ped", add_suffix="map")
    ped = np.loadtxt(self.filename, dtype='str', comments=None)
    row = ped[:, 0:2]
    snpsstr = ped[:, 6::]
    inan = snpsstr == self.missing
    snps = np.zeros((snpsstr.shape[0], snpsstr.shape[1] // 2))
    for i in xrange(snpsstr.shape[1] // 2):
        snps[inan[:, 2 * i], i] = np.nan
        vals = snpsstr[~inan[:, 2 * i], 2 * i:2 * (i + 1)]
        snps[~inan[:, 2 * i], i] += (vals == vals[0, 0]).sum(1)
    snpdata = SnpData(iid=row, sid=col, pos=col_property, val=snps)
    return snpdata
def __init__(self, dat_filename):
    '''
    filename : string of the name of the Dat file.
    '''
    self.dat_filename = SnpReader._name_of_other_file(dat_filename, remove_suffix="dat", add_suffix="dat")
def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python_only, view_ok):
    self._run_once()

    if order == 'A':
        order = 'F'

    assert not hasattr(self, 'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

    iid_count_in = self.iid_count
    sid_count_in = self.sid_count

    if iid_index_or_none is not None:
        iid_count_out = len(iid_index_or_none)
        iid_index_out = iid_index_or_none
    else:
        iid_count_out = iid_count_in
        iid_index_out = range(iid_count_in)

    if sid_index_or_none is not None:
        sid_count_out = len(sid_index_or_none)
        sid_index_out = sid_index_or_none
    else:
        sid_count_out = sid_count_in
        sid_index_out = range(sid_count_in)

    if not force_python_only:
        from pysnptools.snpreader import wrap_plink_parser
        val = np.zeros((iid_count_out, sid_count_out), order=order, dtype=dtype)
        bed_fn = SnpReader._name_of_other_file(self.filename, "bed", "bed")

        if dtype == np.float64:
            if order == "F":
                wrap_plink_parser.readPlinkBedFile2doubleFAAA(bed_fn, iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFile2doubleCAAA(bed_fn, iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        elif dtype == np.float32:
            if order == "F":
                wrap_plink_parser.readPlinkBedFile2floatFAAA(bed_fn, iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFile2floatCAAA(bed_fn, iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
    else:
        if not self.count_A1:
            byteZero = 0
            byteThree = 2
        else:
            byteZero = 2
            byteThree = 0
        # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
        # to add that ability back to the code.
        # Also, note that reading with python will often result in non-contiguous memory,
        # so the python standardizers will automatically be used, too.
        self._open_bed()
        logging.warn("using pure python plink parser (might be much slower!!)")
        val = np.zeros(((int(np.ceil(0.25 * iid_count_in)) * 4), sid_count_out), order=order, dtype=dtype)  # allocate it a little big

        for SNPsIndex, bimIndex in enumerate(sid_index_out):
            startbit = int(np.ceil(0.25 * iid_count_in) * bimIndex + 3)
            self._filepointer.seek(startbit)
            nbyte = int(np.ceil(0.25 * iid_count_in))
            bytes = np.array(bytearray(self._filepointer.read(nbyte))).reshape((int(np.ceil(0.25 * iid_count_in)), 1), order='F')

            val[3::4, SNPsIndex:SNPsIndex + 1] = byteZero
            val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 64] = np.nan
            val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 128] = 1
            val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 192] = byteThree
            bytes = np.mod(bytes, 64)
            val[2::4, SNPsIndex:SNPsIndex + 1] = byteZero
            val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 16] = np.nan
            val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 32] = 1
            val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 48] = byteThree
            bytes = np.mod(bytes, 16)
            val[1::4, SNPsIndex:SNPsIndex + 1] = byteZero
            val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 4] = np.nan
            val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 8] = 1
            val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 12] = byteThree
            bytes = np.mod(bytes, 4)
            val[0::4, SNPsIndex:SNPsIndex + 1] = byteZero
            val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 1] = np.nan
            val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 2] = 1
            val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree
        val = val[iid_index_out, :]  # reorder or trim any extra allocation

        # !!LATER this can fail because the trim statement above messes up the order
        # assert(SnpReader._array_properties_are_ok(val, order, dtype))  # !!
        self._close_bed()

    return val
def write(filename, snpdata, count_A1=False, force_python_only=False):
    """Writes a :class:`SnpData` to Bed format.

    :param filename: the name of the file to create
    :type filename: string
    :param snpdata: The in-memory data that should be written to disk.
    :type snpdata: :class:`SnpData`
    :param count_A1: Tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    >>> from pysnptools.snpreader import Pheno, Bed
    >>> import pysnptools.util as pstutil
    >>> snpdata = Pheno('../examples/toydata.phe').read()         # Read data from Pheno format
    >>> pstutil.create_directory_if_necessary("tempdir/toydata.bed")
    >>> Bed.write("tempdir/toydata.bed",snpdata,count_A1=False)   # Write data in Bed format
    """
    if isinstance(filename, SnpData) and isinstance(snpdata, str):  # For backwards compatibility, reverse inputs if necessary
        warnings.warn("write statement should have filename before data to write", DeprecationWarning)
        filename, snpdata = snpdata, filename

    if count_A1 is None:
        warnings.warn("'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'", FutureWarning)
        count_A1 = False

    SnpReader._write_fam(snpdata, filename, remove_suffix="bed")
    SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="bed", add_suffix="bim")

    bedfile = SnpReader._name_of_other_file(filename, remove_suffix="bed", add_suffix="bed")

    if not force_python_only:
        from pysnptools.snpreader import wrap_plink_parser

        if snpdata.val.flags["C_CONTIGUOUS"]:
            order = "C"
        elif snpdata.val.flags["F_CONTIGUOUS"]:
            order = "F"
        else:
            raise Exception("expect snpdata.val to be either C-contiguous or F-contiguous")

        if snpdata.val.dtype == np.float64:
            if order == "F":
                wrap_plink_parser.writePlinkBedFile2doubleFAAA(bedfile, snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
            else:
                wrap_plink_parser.writePlinkBedFile2doubleCAAA(bedfile, snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
        elif snpdata.val.dtype == np.float32:
            if order == "F":
                wrap_plink_parser.writePlinkBedFile2floatFAAA(bedfile, snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
            else:
                wrap_plink_parser.writePlinkBedFile2floatCAAA(bedfile, snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(snpdata.val.dtype))
    else:
        if not count_A1:
            zero_code = 0b00
            two_code = 0b11
        else:
            zero_code = 0b11
            two_code = 0b00

        with open(bedfile, "wb") as bed_filepointer:
            # see http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml
            bed_filepointer.write(chr(0b01101100))  # magic numbers
            bed_filepointer.write(chr(0b00011011))  # magic numbers
            bed_filepointer.write(chr(0b00000001))  # snp major

            for sid_index in xrange(snpdata.sid_count):
                if sid_index % 1 == 0:
                    logging.info("Writing snp # {0} to file '{1}'".format(sid_index, filename))
                col = snpdata.val[:, sid_index]
                for iid_by_four in xrange(0, snpdata.iid_count, 4):
                    vals_for_this_byte = col[iid_by_four:iid_by_four + 4]
                    byte = 0b00000000
                    for val_index in xrange(len(vals_for_this_byte)):
                        val = vals_for_this_byte[val_index]
                        if val == 0:
                            code = zero_code
                        elif val == 1:
                            code = 0b10  # backwards on purpose
                        elif val == 2:
                            code = two_code
                        elif np.isnan(val):
                            code = 0b01  # backwards on purpose
                        else:
                            raise Exception("Can't convert value '{0}' to BED format (only 0,1,2,NAN allowed)".format(val))
                        byte |= (code << (val_index * 2))
                    bed_filepointer.write(chr(byte))
    logging.info("Done writing " + filename)
def write(filename, snpdata, force_python_only=False):
    """Writes a :class:`SnpData` to Bed format.

    :param filename: the name of the file to create
    :type filename: string
    :param snpdata: The in-memory data that should be written to disk.
    :type snpdata: :class:`SnpData`

    >>> from pysnptools.snpreader import Pheno, Bed
    >>> import pysnptools.util as pstutil
    >>> snpdata = Pheno('../examples/toydata.phe').read()         # Read data from Pheno format
    >>> pstutil.create_directory_if_necessary("tempdir/toydata.bed")
    >>> Bed.write("tempdir/toydata.bed",snpdata)                  # Write data in Bed format
    """
    if isinstance(filename, SnpData) and isinstance(snpdata, str):  # For backwards compatibility, reverse inputs if necessary
        warnings.warn("write statement should have filename before data to write", DeprecationWarning)
        filename, snpdata = snpdata, filename

    SnpReader._write_fam(snpdata, filename, remove_suffix="bed")
    SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="bed", add_suffix="bim")

    bedfile = SnpReader._name_of_other_file(filename, remove_suffix="bed", add_suffix="bed")

    if not force_python_only:
        from pysnptools.snpreader import wrap_plink_parser

        if snpdata.val.flags["C_CONTIGUOUS"]:
            order = "C"
        elif snpdata.val.flags["F_CONTIGUOUS"]:
            order = "F"
        else:
            raise Exception("expect snpdata.val to be either C-contiguous or F-contiguous")

        if snpdata.val.dtype == np.float64:
            if order == "F":
                wrap_plink_parser.writePlinkBedFiledoubleFAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
            else:
                wrap_plink_parser.writePlinkBedFiledoubleCAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
        elif snpdata.val.dtype == np.float32:
            if order == "F":
                wrap_plink_parser.writePlinkBedFilefloatFAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
            else:
                wrap_plink_parser.writePlinkBedFilefloatCAAA(bedfile, snpdata.iid_count, snpdata.sid_count, snpdata.val)
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(snpdata.val.dtype))
    else:
        with open(bedfile, "wb") as bed_filepointer:
            # see http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml
            bed_filepointer.write(chr(0b01101100))  # magic numbers
            bed_filepointer.write(chr(0b00011011))  # magic numbers
            bed_filepointer.write(chr(0b00000001))  # snp major

            for sid_index in xrange(snpdata.sid_count):
                if sid_index % 1 == 0:
                    logging.info("Writing snp # {0} to file '{1}'".format(sid_index, filename))
                col = snpdata.val[:, sid_index]
                for iid_by_four in xrange(0, snpdata.iid_count, 4):
                    vals_for_this_byte = col[iid_by_four:iid_by_four + 4]
                    byte = 0b00000000
                    for val_index in xrange(len(vals_for_this_byte)):
                        val = vals_for_this_byte[val_index]
                        if val == 0:
                            code = 0b00
                        elif val == 1:
                            code = 0b10  # backwards on purpose
                        elif val == 2:
                            code = 0b11
                        elif np.isnan(val):
                            code = 0b01  # backwards on purpose
                        else:
                            raise Exception("Can't convert value '{0}' to BED format (only 0,1,2,NAN allowed)".format(val))
                        byte |= (code << (val_index * 2))
                    bed_filepointer.write(chr(byte))
    logging.info("Done writing " + filename)
def __init__(self, filename):
    '''
    filename : string of the name of the Dat file.
    '''
    super(Dat, self).__init__()
    self.filename = SnpReader._name_of_other_file(filename, remove_suffix="dat", add_suffix="dat")
def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python_only, view_ok):
    self._run_once()

    if order == 'A':
        order = 'F'

    assert not hasattr(self, 'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

    iid_count_in = self.iid_count
    sid_count_in = self.sid_count

    if iid_index_or_none is not None:
        iid_count_out = len(iid_index_or_none)
        iid_index_out = iid_index_or_none
    else:
        iid_count_out = iid_count_in
        iid_index_out = range(iid_count_in)

    if sid_index_or_none is not None:
        sid_count_out = len(sid_index_or_none)
        sid_index_out = sid_index_or_none
    else:
        sid_count_out = sid_count_in
        sid_index_out = range(sid_count_in)

    if not force_python_only:
        from pysnptools.snpreader import wrap_plink_parser
        val = np.zeros((iid_count_out, sid_count_out), order=order, dtype=dtype)
        bed_fn = SnpReader._name_of_other_file(self.filename, "bed", "bed")

        if dtype == np.float64:
            if order == "F":
                wrap_plink_parser.readPlinkBedFiledoubleFAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFiledoubleCAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        elif dtype == np.float32:
            if order == "F":
                wrap_plink_parser.readPlinkBedFilefloatFAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
            elif order == "C":
                wrap_plink_parser.readPlinkBedFilefloatCAAA(bed_fn, iid_count_in, sid_count_in, iid_index_out, sid_index_out, val)
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
        else:
            raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
    else:
        # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
        # to add that ability back to the code.
        # Also, note that reading with python will often result in non-contiguous memory,
        # so the python standardizers will automatically be used, too.
        logging.warn("using pure python plink parser (might be much slower!!)")
        val = np.zeros(((int(np.ceil(0.25 * iid_count_in)) * 4), sid_count_out), order=order, dtype=dtype)  # allocate it a little big

        for SNPsIndex, bimIndex in enumerate(sid_index_out):
            startbit = int(np.ceil(0.25 * iid_count_in) * bimIndex + 3)
            self._filepointer.seek(startbit)
            nbyte = int(np.ceil(0.25 * iid_count_in))
            bytes = np.array(bytearray(self._filepointer.read(nbyte))).reshape((int(np.ceil(0.25 * iid_count_in)), 1), order='F')

            val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 64] = np.nan
            val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 128] = 1
            val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 192] = 2
            bytes = np.mod(bytes, 64)
            val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 16] = np.nan
            val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 32] = 1
            val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 48] = 2
            bytes = np.mod(bytes, 16)
            val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 4] = np.nan
            val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 8] = 1
            val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 12] = 2
            bytes = np.mod(bytes, 4)
            val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 1] = np.nan
            val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 2] = 1
            val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = 2
        val = val[iid_index_out, :]  # reorder or trim any extra allocation

        # !!LATER this can fail because the trim statement above messes up the order
        # assert(SnpReader._array_properties_are_ok(val, order, dtype))  # !!

    return val
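# A minimal sketch (not part of the library) of the threshold-and-modulo trick
# used by the pure-Python parser above to peel four 2-bit genotype codes out of
# one byte, starting from the highest-order pair and working down.
import numpy as np

def unpack_byte(byte):
    out = np.zeros(4)  # code 0b00 maps to 0, which the zeros already provide
    for k, base in zip((3, 2, 1, 0), (64, 16, 4, 1)):
        if byte >= 1 * base:
            out[k] = np.nan
        if byte >= 2 * base:
            out[k] = 1
        if byte >= 3 * base:
            out[k] = 2
        byte %= base
    return out

print(unpack_byte(0b01001011))  # [ 2.  1.  0. nan]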
def copyinputs(self, copier):
    # doesn't need to self.run_once() because only uses original inputs
    copier.input(SnpReader._name_of_other_file(self.basefilename, remove_suffix="ped", add_suffix="ped"))
    copier.input(SnpReader._name_of_other_file(self.basefilename, remove_suffix="ped", add_suffix="map"))
def copyinputs(self, copier):
    # doesn't need to self.run_once() because creates name of all files itself
    copier.input(SnpReader._name_of_other_file(self.filename, remove_suffix="dat", add_suffix="dat"))
    copier.input(SnpReader._name_of_other_file(self.filename, remove_suffix="dat", add_suffix="fam"))
    copier.input(SnpReader._name_of_other_file(self.filename, remove_suffix="dat", add_suffix="map"))
def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python_only, view_ok, out_buffer):
    if force_python_only:
        raise NotImplementedError("No Python only mode")
    self._run_once()

    if order == 'A':
        order = 'F'

    assert not hasattr(self, 'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

    iid_count_in = self.iid_count
    sid_count_in = self.sid_count

    if iid_index_or_none is not None:
        iid_count_out = len(iid_index_or_none)
        iid_index_out = np.array(iid_index_or_none, dtype=np.uint32)
    else:
        iid_count_out = iid_count_in
        iid_index_out = None

    if sid_index_or_none is not None:
        sid_count_out = len(sid_index_or_none)
        sid_index_out = sid_index_or_none
    else:
        sid_count_out = sid_count_in
        sid_index_out = range(sid_count_in)

    out_compatible = (order == "F") and ((dtype == np.int8) or (dtype == np.int32) or (dtype == np.int64))
    if out_buffer is not None:
        assert out_buffer.dtype == dtype, "Wrong type"
        assert out_buffer.flags["F_CONTIGUOUS"] == (order == 'F'), "wrong order"
        assert out_buffer.shape[0] == iid_count_out, "insufficient first dimension"
        assert out_buffer.shape[1] == sid_count_out, "insufficient second dimension"

    if out_compatible and (out_buffer is not None):
        val = out_buffer.T
        dtype_val = out_buffer.dtype
    elif out_compatible:
        dtype_val = dtype
        val = np.empty((sid_count_out, iid_count_out), order='C', dtype=dtype_val)
        out_buffer = val.T
    else:
        dtype_val = np.int8
        val = np.empty((sid_count_out, iid_count_out), order='C', dtype=dtype_val)

    pgen_fn = SnpReader._name_of_other_file(self.filename, "pgen", "pgen")

    self._open_pgen(iid_index_out)
    self._filepointer.read_list(np.array(sid_index_out, dtype=np.uint32), val)
    self._close_pgen()

    if (out_buffer is None) and out_compatible:
        out_buffer = val.T
    elif (out_buffer is not None) and (not out_compatible):
        out_buffer[:] = val.T
    elif (out_buffer is None) and (not out_compatible):
        out_buffer = np.array(val.T, dtype=dtype, order=order)

    if (out_buffer.dtype == np.float64) or (out_buffer.dtype == np.float32):
        out_buffer[out_buffer == -9.0] = np.nan

    return out_buffer  # [:iid_count_out,:sid_count_out]
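# A minimal sketch (not part of the library) of the transpose trick relied on
# above: a Fortran-ordered (iid x sid) buffer viewed through .T is a C-ordered
# (sid x iid) array, so a variant-major reader can fill it in place without an
# extra copy. The shapes and values below are made up for illustration.
import numpy as np

out_buffer = np.empty((3, 2), dtype=np.int8, order='F')  # iid x sid, 'F' order
val = out_buffer.T                                       # sid x iid view, C-contiguous
assert val.flags["C_CONTIGUOUS"] and not val.flags["OWNDATA"]
val[:] = [[0, 1, 2], [2, 1, 0]]                          # writes land directly in out_buffer
print(out_buffer)
# [[0 2]
#  [1 1]
#  [2 0]]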
def copyinputs(self, copier):
    # doesn't need to self.run_once() because creates name of all files itself
    copier.input(SnpReader._name_of_other_file(self.dat_filename, remove_suffix="dat", add_suffix="dat"))
    copier.input(SnpReader._name_of_other_file(self.dat_filename, remove_suffix="dat", add_suffix="fam"))
    copier.input(SnpReader._name_of_other_file(self.dat_filename, remove_suffix="dat", add_suffix="map"))