Example 1
    def write(filename, snpdata):
        """Writes a :class:`SnpData` to dat/fam/map format.

        :param filename: the name of the file to create
        :type filename: string
        :param snpdata: The in-memory data that should be written to disk.
        :type snpdata: :class:`SnpData`

        >>> from pysnptools.snpreader import Dat, Bed
        >>> import pysnptools.util as pstutil
        >>> snpdata = Bed('../examples/toydata.bed',count_A1=False)[:,:10].read()  # Read first 10 snps from Bed format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata10.dat")
        >>> Dat.write("tempdir/toydata10.dat",snpdata)              # Write data in dat/fam/map format
        """

        if isinstance(filename,SnpData) and isinstance(snpdata,str): #For backwards compatibility, reverse inputs if necessary
            warnings.warn("write statement should have filename before data to write", DeprecationWarning)
            filename, snpdata = snpdata, filename 

        SnpReader._write_fam(snpdata, filename, remove_suffix="dat")
        SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="dat", add_suffix="map")
        filename = SnpReader._name_of_other_file(filename,remove_suffix="dat", add_suffix="dat")

        snpsarray = snpdata.val
        with open(filename,"wb") as dat_filepointer:
            for sid_index, sid in enumerate(snpdata.sid):
                if sid_index % 1000 == 0:
                    logging.info("Writing snp # {0} to file '{1}'".format(sid_index, filename))
                dat_filepointer.write(b"%s\tj\tn\t"%sid) #use "j" and "n" as the major and minor allele
                row = snpsarray[:,sid_index]
                dat_filepointer.write(b"\t".join((str(i).encode('ascii') for i in row)) + b"\n")
        logging.info("Done writing " + filename)
Example 2
 def _read_pstdata(self):
     row = SnpReader._read_fam(self.filename, remove_suffix="dat")
     col, col_property = SnpReader._read_map_or_bim(self.filename,
                                                    remove_suffix="dat",
                                                    add_suffix="map")
     if len(row) == 0 or len(col) == 0:
         return SnpData(iid=row,
                        sid=col,
                        pos=col_property,
                        val=np.empty([len(row), len(col)]))
     datfields = pd.read_csv(self.filename,
                             delimiter='\t',
                             header=None,
                             index_col=False,
                             skiprows=self.skiprows)
     if not np.array_equal(datfields[0], col):
         raise Exception(
             "Expect snp list in map file to exactly match snp list in dat file"
         )
     del datfields[0]
     del datfields[1]
     del datfields[2]
     assert len(row) == datfields.shape[1], "Expect # iids in fam file to match dat file"
     val = datfields.values.T
     snpdata = SnpData(iid=row, sid=col, pos=col_property, val=val)
     return snpdata
Example 3
 def col(self):
     """*same as* :attr:`sid`
     """
     if not hasattr(self, "_col"):
         _bim = SnpReader._name_of_other_file(self.path,
                                              remove_suffix="bed",
                                              add_suffix="bim")
         local_bim = self._storage.open_read(_bim)
         self._col, self._col_property = SnpReader._read_map_or_bim(
             local_bim.__enter__(), remove_suffix="bim", add_suffix="bim")
         self._file_dict["bim"] = local_bim
     return self._col
Example 4
 def row(self):
     """*same as* :attr:`iid`
     """
     if not hasattr(self, "_row"):
         _fam = SnpReader._name_of_other_file(self.path,
                                              remove_suffix="bed",
                                              add_suffix="fam")
         local_fam = self._storage.open_read(_fam)
         self._row = SnpReader._read_fam(local_fam.__enter__(),
                                         remove_suffix="fam")
         self._file_dict["fam"] = local_fam
     return self._row
Example 5
 def copyinputs(self, copier):
     # doesn't need to call self.run_once() because it creates the names of all the files itself
     copier.input(
         SnpReader._name_of_other_file(self.filename,
                                       remove_suffix="dat",
                                       add_suffix="dat"))
     copier.input(
         SnpReader._name_of_other_file(self.filename,
                                       remove_suffix="dat",
                                       add_suffix="fam"))
     copier.input(
         SnpReader._name_of_other_file(self.filename,
                                       remove_suffix="dat",
                                       add_suffix="map"))
Example 6
 def copyinputs(self, copier):
     # doesn't need to call self.run_once() because it only uses the original inputs
     copier.input(
         SnpReader._name_of_other_file(self.filename,
                                       remove_suffix="bed",
                                       add_suffix="bed"))
     copier.input(
         SnpReader._name_of_other_file(self.filename,
                                       remove_suffix="bed",
                                       add_suffix="bim"))
     copier.input(
         SnpReader._name_of_other_file(self.filename,
                                       remove_suffix="bed",
                                       add_suffix="fam"))
Example 7
 def _read_pstdata(self):
     row = SnpReader._read_fam(self.filename,remove_suffix="dat")
     col, col_property = SnpReader._read_map_or_bim(self.filename,remove_suffix="dat", add_suffix="map")
     if len(row)==0 or len(col)==0:
         return SnpData(iid=row,sid=col,pos=col_property,val=np.empty([len(row),len(col)]))
     datfields = pd.read_csv(self.filename,delimiter = '\t',header=None,index_col=False)
     if not np.array_equal(np.array([x.encode('ascii') for x in datfields[0]]), col) : raise Exception("Expect snp list in map file to exactly match snp list in dat file")
     del datfields[0]
     del datfields[1]
     del datfields[2]
     assert len(row) == datfields.shape[1], "Expect # iids in fam file to match dat file"
     val = datfields.values.T
     snpdata = SnpData(iid=row,sid=col,pos=col_property,val=val)
     return snpdata
Example 8
    def write(filename, snpdata):
        """Writes a :class:`SnpData` to dat/fam/map format and returns the :class:`.Dat`.

        :param filename: the name of the file to create
        :type filename: string
        :param snpdata: The in-memory data that should be written to disk.
        :type snpdata: :class:`SnpData`
        :rtype: :class:`.Dat`

        >>> from pysnptools.snpreader import Dat, Bed
        >>> import pysnptools.util as pstutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> bed_file = example_file("pysnptools/examples/toydata.5chrom.*","*.bed")
        >>> snpdata = Bed(bed_file,count_A1=False)[:,:10].read()  # Read first 10 snps from Bed format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata10.dat")
        >>> Dat.write("tempdir/toydata10.dat",snpdata)              # Write data in dat/fam/map format
        Dat('tempdir/toydata10.dat')
        """

        if isinstance(filename, SnpData) and isinstance(
                snpdata, str
        ):  #For backwards compatibility, reverse inputs if necessary
            warnings.warn(
                "write statement should have filename before data to write",
                DeprecationWarning)
            filename, snpdata = snpdata, filename

        SnpReader._write_fam(snpdata, filename, remove_suffix="dat")
        SnpReader._write_map_or_bim(snpdata,
                                    filename,
                                    remove_suffix="dat",
                                    add_suffix="map")
        filename = SnpReader._name_of_other_file(filename,
                                                 remove_suffix="dat",
                                                 add_suffix="dat")

        snpsarray = snpdata.val
        with open(filename, "w") as dat_filepointer:
            for sid_index, sid in enumerate(snpdata.sid):
                if sid_index % 1000 == 0:
                    logging.info("Writing snp # {0} to file '{1}'".format(
                        sid_index, filename))
                dat_filepointer.write(
                    "%s\tj\tn\t" %
                    sid)  #use "j" and "n" as the major and minor allele
                row = snpsarray[:, sid_index]
                dat_filepointer.write("\t".join((str(i) for i in row)) + "\n")
        logging.info("Done writing " + filename)
        return Dat(filename)
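
For orientation, each line the loop above writes is the SNP id, the placeholder alleles "j" and "n", and one tab-separated value per individual. A minimal standalone sketch of one such line (the SNP id and values below are hypothetical, not taken from the example data):

    sid = "snp25"                      # hypothetical SNP id
    row = [1.0, 0.0]                   # hypothetical values for two individuals
    line = "%s\tj\tn\t" % sid + "\t".join(str(v) for v in row) + "\n"
    print(repr(line))                  # 'snp25\tj\tn\t1.0\t0.0\n'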
Example 9
    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = True

        if not hasattr(self,"_row"):
            self._row = SnpReader._read_fam(self.filename,remove_suffix="bed")

        if not hasattr(self,"_col") or not hasattr(self,"_col_property"):
            self._col, self._col_property = SnpReader._read_map_or_bim(self.filename,remove_suffix="bed", add_suffix="bim")
        self._assert_iid_sid_pos()

        if not self.skip_format_check:
            self._open_bed()
            self._close_bed()
Example 10
    def write(filename, snpdata):
        """Writes a :class:`SnpData` to Ped format. The values must be 0,1,2. The direction of the encoding to allele pairs is arbitrary. This means
        that if a SnpData is written in Ped format and then read back, then 0's may become 2's and 2's may become 0's. (1's will stay 1's).

        :param filename: the name of the file to create
        :type filename: string
        :param snpdata: The in-memory data that should be written to disk.
        :type snpdata: :class:`SnpData`

        >>> from pysnptools.snpreader import Ped, Bed
        >>> import pysnptools.util as pstutil
        >>> snpdata = Bed('../examples/toydata.bed',count_A1=False)[:,:10].read()  # Read first 10 snps from Bed format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata10.ped")
        >>> Ped.write("tempdir/toydata10.ped",snpdata)            # Write data in Ped format
        """

        if isinstance(filename,SnpData) and isinstance(snpdata,str): #For backwards compatibility, reverse inputs if necessary
            warnings.warn("write statement should have filename before data to write", DeprecationWarning)
            filename, snpdata = snpdata, filename 

        SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="ped", add_suffix="map")

        # The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:
        # Family ID
        # Individual ID
        # Paternal ID
        # Maternal ID
        # Sex (1=male; 2=female; other=unknown)
        # Phenotype
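        # Hypothetical example of one line the loop below produces for an individual with two
        # SNPs (tabs separate the genotype pairs): "FAM001 IND001 0 0 0 0\tA A\tG G"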

        pedfile = SnpReader._name_of_other_file(filename, remove_suffix="ped", add_suffix="ped")
        with open(pedfile,"wb") as ped_filepointer:
            for iid_index, iid_row in enumerate(snpdata.iid):
                ped_filepointer.write(b"%s %s 0 0 0 0"%(iid_row[0],iid_row[1])) #Must use % formating because Python3 doesn't support .format on bytes
                row = snpdata.val[iid_index,:]
                for sid_index, val in enumerate(row):
                    if val == 0:
                        s = b"A A"
                    elif val == 1:
                        s = b"A G"
                    elif val == 2:
                        s = b"G G"
                    elif np.isnan(val):
                        s = b"0 0"
                    else:
                        raise Exception("Expect values for ped file to be 0,1,2, or NAN. Instead, saw '{0}'".format(val))
                    ped_filepointer.write(b"\t"+s)
                ped_filepointer.write(b"\n")
Example 11
def write_tped(snpdata, basefilename):
    #\\bobd02\Public\PLink\x64\Plink.exe --noweb --tfile test --make-bed --out test2
    
    
    SnpReader._write_fam(snpdata, basefilename, remove_suffix="tped")
    #SnpReader._write_map_or_bim(snpdata, basefilename, remove_suffix="dat", add_suffix="map")

    snpsarray = snpdata.val
    with open(basefilename,"w") as dat_filepointer:
        for sid_index, sid in enumerate(snpdata.sid):
            if sid_index % 1000 == 0:
                logging.info("Writing snp # {0} to file '{1}'".format(sid_index, basefilename))
            dat_filepointer.write("1 {0} {1} {1} ".format(sid ,sid_index)) #use "j" and "n" as the major and minor allele
            row = snpsarray[:,sid_index]
            dat_filepointer.write(" ".join((encode_snp(i) for i in row)) + "\n")
    logging.info("Done writing " + basefilename)
Example 12
    def __init__(
        self,
        filename,
        count_A1=None,
        iid=None,
        sid=None,
        pos=None,
        num_threads=None,
        skip_format_check=False,
        fam_filename=None,
        bim_filename=None,
        chrom_map = plink_chrom_map,
    ):
        super(Bed, self).__init__()

        self._ran_once = False

        self.filename = SnpReader._name_of_other_file(filename,remove_suffix="bed", add_suffix="bed")
        self.fam_filename = fam_filename
        self.bim_filename = bim_filename
        if count_A1 is None:
            warnings.warn(
                "'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'",
                FutureWarning,
            )
            count_A1 = False
        self.count_A1 = count_A1
        self._skip_format_check = skip_format_check
        self._original_iid = iid
        self._original_sid = sid
        self._original_pos = pos
        self._num_threads = num_threads
        self._open_bed = None
        self.chrom_map = chrom_map
Example 13
    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = True

        if not hasattr(self, "_row"):
            self._row = SnpReader._read_fam(self.filename, remove_suffix="bed")

        if not hasattr(self, "_col") or not hasattr(self, "_col_property"):
            self._col, self._col_property = SnpReader._read_map_or_bim(
                self.filename, remove_suffix="bed", add_suffix="bim")
        self._assert_iid_sid_pos(check_val=False)

        if not self.skip_format_check:
            self._open_bed()
            self._close_bed()
Example 14
 def col_property(self):
     """*same as* :attr:`pos`
     """
     if not hasattr(self, "_col_property"):
         self._col, self._col_property = SnpReader._read_map_or_bim(
             self.filename, remove_suffix="bed", add_suffix="bim")
     return self._col_property
Example 15
    def _read_kernel(train, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False):
        '''
        Creates a kernel for the in-memory SNP data. If no standardization is needed and
        everything is already in memory, it computes the kernel directly with a NumPy dot
        product; otherwise it falls back to the general from-disk method of SnpReader.
        '''
        from pysnptools.pstreader import PstReader


        #Just do a 'python' dot, if no standardization is needed and everything is the right type
        if isinstance(standardizer,Identity) and train.val.dtype == dtype:
            ts = time.time()
            #is_worth_logging = train.val.shape[0] * train.val.shape[1] * test.val.shape[0] > 1e9
            #if is_worth_logging: logging.info("  _read_kernel about to multiply train{0} x test{1}".format(train.val.shape,test.val.shape))
            if order == 'F': #numpy's 'dot' always returns 'C' order
                K = (train.val.dot(train.val.T)).T
            else:
                K = train.val.dot(train.val.T)
            assert PstReader._array_properties_are_ok(K,order,dtype), "internal error: K is not of the expected order or dtype"
            #if is_worth_logging: logging.info("  _read_kernel took %.2f seconds" % (time.time()-ts))
            if return_trained:
                return K, standardizer
            else:
                return K
        else: #Do things the more general SnpReader way.
            return SnpReader._read_kernel(train, standardizer, block_size=block_size, order=order, dtype=dtype, force_python_only=force_python_only,view_ok=view_ok, return_trained=return_trained)
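
The fast path above is just a matrix product of the SNP values with their transpose. A minimal sketch with a plain NumPy array standing in for train.val (the values are hypothetical):

    import numpy as np

    X = np.array([[0., 1., 2.],
                  [2., 1., 0.]])   # 2 individuals x 3 SNPs, hypothetical values
    K = X.dot(X.T)                 # iid_count x iid_count kernel, as in the Identity-standardizer branch
    assert K.shape == (2, 2)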
Example 16
 def _open_bed(self):
     bedfile = SnpReader._name_of_other_file(self.filename,"bed","bed")
     self._filepointer = open(bedfile, "rb")
     mode = self._filepointer.read(2)
     if mode != b'l\x1b': raise Exception('No valid binary BED file')
     mode = self._filepointer.read(1) #\x01 = SNP major \x00 = individual major
     if mode != b'\x01': raise Exception('only SNP-major is implemented')
     logging.info("bed file is open {0}".format(bedfile))
Example 17
 def __init__(self, filename):
     '''
     filename    : string of the name of the Dat file.
     '''
     super(Dat, self).__init__()
     self.filename = SnpReader._name_of_other_file(filename,
                                                   remove_suffix="dat",
                                                   add_suffix="dat")
Example 18
 def __init__(self, filename, missing = '0'):
     '''
         filename    : string of the filename of the ped file
         missing         : string indicating a missing genotype (default b'0')
     '''
     super(Ped, self).__init__()
     self.filename = SnpReader._name_of_other_file(filename,remove_suffix="ped", add_suffix="ped")
     self.missing = to_ascii(missing) # This is for Python2/3 compatibility
Example 19
    def _read(self, row_index_or_none, col_index_or_none, order, dtype,
              force_python_only, view_ok, num_threads):
        assert row_index_or_none is None and col_index_or_none is None  #real assert because indexing should already be pushed to the inner snpreader
        dtype = np.dtype(dtype)

        #Do all-at-once (not in blocks) if 1. no block size is given or 2. the # of SNPs is <= block_size or <= iid_count
        if self.block_size is None or (self.sid_count <= self.block_size
                                       or self.sid_count <= self.iid_count):
            snpdata, _ = SnpReader._as_snpdata(
                self.snpreader,
                dtype=dtype,
                order=order,
                force_python_only=force_python_only,
                standardizer=stdizer.Identity(),
                num_threads=num_threads)
            val = self._snpval_to_distval(snpdata.val, order, dtype)

            has_right_order = order = "A" or (
                order == "C" and val.flags["C_CONTIGUOUS"]) or (
                    order == "F" and val.flags["F_CONTIGUOUS"])
            assert has_right_order
            return val
        else:  #Do in blocks
            t0 = time.time()
            if order == 'A':
                order = 'F'
            val = np.zeros([self.iid_count, self.sid_count, 3],
                           dtype=dtype,
                           order=order)  #LATER use empty or fillnan???

            logging.info(
                "reading {0} value data in blocks of {1} SNPs and finding distribution (for {2} individuals)"
                .format(self.sid_count, self.block_size, self.iid_count))
            ct = 0
            ts = time.time()

            for start in range(0, self.sid_count, self.block_size):
                ct += self.block_size
                snpdata = self.snpreader[:, start:start + self.block_size].read(
                    order=order,
                    dtype=dtype,
                    force_python_only=force_python_only,
                    view_ok=True,
                    num_threads=num_threads
                )  # a view is always OK, because we'll allocate memory in the next step
                val[:,
                    start:start + self.block_size] = self._snpval_to_distval(
                        snpdata.val, order, dtype)
                if ct % self.block_size == 0:
                    diff = time.time() - ts
                    if diff > 5:
                        logging.info("read %s SNPs in %.2f seconds" %
                                     (ct, diff))

            t1 = time.time()
            logging.info("%.2f seconds elapsed" % (t1 - t0))

            return val
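
The blockwise branch above walks the SNPs in strides of block_size; a minimal sketch of that iteration pattern (the counts are hypothetical):

    sid_count, block_size = 10, 4                       # hypothetical counts
    blocks = [(start, min(start + block_size, sid_count))
              for start in range(0, sid_count, block_size)]
    print(blocks)                                       # [(0, 4), (4, 8), (8, 10)]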
Example 20
 def _open_bed(self):
     bedfile = SnpReader._name_of_other_file(self.filename, "bed", "bed")
     self._filepointer = open(bedfile, 'rb')
     mode = self._filepointer.read(2)
     if mode != b'l\x1b': raise Exception('No valid binary BED file')
     mode = self._filepointer.read(
         1)  #\x01 = SNP major \x00 = individual major
     if mode != b'\x01': raise Exception('only SNP-major is implemented')
     logging.info("bed file is open {0}".format(bedfile))
Example 21
def write_tped(snpdata, basefilename):
    #\\bobd02\Public\PLink\x64\Plink.exe --noweb --tfile test --make-bed --out test2

    SnpReader._write_fam(snpdata, basefilename, remove_suffix="tped")
    #SnpReader._write_map_or_bim(snpdata, basefilename, remove_suffix="dat", add_suffix="map")

    snpsarray = snpdata.val
    with open(basefilename, "w") as dat_filepointer:
        for sid_index, sid in enumerate(snpdata.sid):
            if sid_index % 1000 == 0:
                logging.info("Writing snp # {0} to file '{1}'".format(
                    sid_index, basefilename))
            dat_filepointer.write("1 {0} {1} {1} ".format(
                sid,
                sid_index))  #use "j" and "n" as the major and minor allele
            row = snpsarray[:, sid_index]
            dat_filepointer.write(" ".join((encode_snp(i)
                                            for i in row)) + "\n")
    logging.info("Done writing " + basefilename)
Example 22
 def __init__(self, filename, missing='0'):
     '''
         filename    : string of the filename of the ped file
         missing         : string indicating a missing genotype (default '0')
     '''
     super(Ped, self).__init__()
     self.filename = SnpReader._name_of_other_file(filename,
                                                   remove_suffix="ped",
                                                   add_suffix="ped")
     self.missing = missing
Example 23
 def _read_pstdata(self):
     col, col_property = SnpReader._read_map_or_bim(self.filename,remove_suffix="ped", add_suffix="map")
     ped = np.loadtxt(self.filename, dtype='S', comments=None)
     row = ped[:,0:2]
     snpsstr = ped[:,6::]
     inan=snpsstr==self.missing
     snps = np.zeros((snpsstr.shape[0],snpsstr.shape[1]//2))
     for i in range(snpsstr.shape[1]//2):
         snps[inan[:,2*i],i]=np.nan
         vals=snpsstr[~inan[:,2*i],2*i:2*(i+1)]
         snps[~inan[:,2*i],i]+=(vals==vals[0,0]).sum(1)
     snpdata = SnpData(iid=row,sid=col,pos=col_property,val=snps)
     return snpdata
Example 24
    def write(path, storage, snpdata, count_A1=True, updater=None):
        file_list = [
            SnpReader._name_of_other_file(path,
                                          remove_suffix="bed",
                                          add_suffix=new_suffix)
            for new_suffix in ["bim", "fam", "bed"]
        ]  #'bed' should be last
        with _multiopen(
                lambda file_name: storage.open_write(file_name,
                                                     updater=updater),
                file_list) as local_file_name_list:
            Bed.write(local_file_name_list[-1], snpdata, count_A1=count_A1)

        return _Distributed1Bed(path, storage)
Example 25
 def _read_pstdata(self):
     col, col_property = SnpReader._read_map_or_bim(self.filename,
                                                    remove_suffix="ped",
                                                    add_suffix="map")
     ped = np.loadtxt(self.filename, dtype='str', comments=None)
     ped = ped.reshape(-1, ped.shape[-1])  #Turns 1-d row into 2-d
     row = ped[:, 0:2]
     snpsstr = ped[:, 6::]
     inan = snpsstr == self.missing
     snps = np.zeros((snpsstr.shape[0], snpsstr.shape[1] // 2))
     for i in range(snpsstr.shape[1] // 2):
         snps[inan[:, 2 * i], i] = np.nan
         vals = snpsstr[~inan[:, 2 * i], 2 * i:2 * (i + 1)]
         if vals.shape[0] > 0:
             snps[~inan[:, 2 * i], i] += (vals == vals[0, 0]).sum(1)
     snpdata = SnpData(iid=row, sid=col, pos=col_property, val=snps)
     return snpdata
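
The counting step above turns each genotype into the number of alleles that match the first allele seen for that SNP, which is why the 0/2 encoding direction is arbitrary on a write/read round trip. A minimal sketch with hypothetical allele strings:

    import numpy as np

    pairs = np.array([["A", "A"], ["A", "G"], ["G", "G"]])  # one SNP, three individuals (hypothetical)
    counts = (pairs == pairs[0, 0]).sum(1)                  # copies of the first allele seen
    print(counts)                                           # [2 1 0]; had "G" come first, it would be [0 1 2]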
Example 26
    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = True
        self.row  # get row info
        self.col  # get col info

        _bed = SnpReader._name_of_other_file(self.path,
                                             remove_suffix="bed",
                                             add_suffix="bed")
        local_bed = self._storage.open_read(_bed)
        self.local = Bed(local_bed.__enter__(),
                         count_A1=True,
                         iid=self.row,
                         sid=self.col,
                         pos=self.col_property,
                         skip_format_check=True)
        self._file_dict["bed"] = local_bed
Example 27
    def _read_kernel(train,
                     standardizer,
                     block_size=None,
                     order='A',
                     dtype=np.float64,
                     force_python_only=False,
                     view_ok=False,
                     return_trained=False,
                     num_threads=None):
        '''
        Creates a kernel for the in-memory SNP data. If no standardization is needed and
        everything is already in memory, it computes the kernel directly with a NumPy dot
        product; otherwise it falls back to the general from-disk method of SnpReader.
        '''
        from pysnptools.pstreader import PstReader
        dtype = np.dtype(dtype)

        #Just do a 'python' dot, if no standardization is needed and everything is the right type
        if isinstance(standardizer, Identity) and train.val.dtype == dtype:
            ts = time.time()
            #is_worth_logging = train.val.shape[0] * train.val.shape[1] * test.val.shape[0] > 1e9
            #if is_worth_logging: logging.info("  _read_kernel about to multiply train{0} x test{1}".format(train.val.shape,test.val.shape))
            if order == 'F':  #numpy's 'dot' always returns 'C' order
                K = (train.val.dot(train.val.T)).T
            else:
                K = train.val.dot(train.val.T)
            assert PstReader._array_properties_are_ok(
                K, order, dtype
            ), "internal error: K is not of the expected order or dtype"
            #if is_worth_logging: logging.info("  _read_kernel took %.2f seconds" % (time.time()-ts))
            if return_trained:
                return K, standardizer
            else:
                return K
        else:  #Do things the more general SnpReader way.
            return SnpReader._read_kernel(train,
                                          standardizer,
                                          block_size=block_size,
                                          order=order,
                                          dtype=dtype,
                                          force_python_only=force_python_only,
                                          view_ok=view_ok,
                                          return_trained=return_trained,
                                          num_threads=num_threads)
Example 28
 def __init__(self, filename):
     '''
     filename    : string of the name of the Dat file.
     '''
     super(Dat, self).__init__()
     self.filename = SnpReader._name_of_other_file(filename,remove_suffix="dat", add_suffix="dat")
Example 29
 def row(self):
     """*same as* :attr:`iid`
     """
     if not hasattr(self, "_row"):
         self._row = SnpReader._read_fam(self.filename, remove_suffix="bed")
     return self._row
Example 30
    def _read(self, iid_index_or_none, sid_index_or_none, order, dtype,
              force_python_only, view_ok):
        self._run_once()

        if order == 'A':
            order = 'F'
        dtype = np.dtype(dtype)

        assert not hasattr(
            self,
            'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

        iid_count_in = self.iid_count
        sid_count_in = self.sid_count

        if iid_index_or_none is not None:
            iid_count_out = len(iid_index_or_none)
            iid_index = iid_index_or_none
        else:
            iid_count_out = iid_count_in
            iid_index = list(range(iid_count_in))

        if sid_index_or_none is not None:
            sid_count_out = len(sid_index_or_none)
            sid_index = sid_index_or_none
        else:
            sid_count_out = sid_count_in
            sid_index = list(range(sid_count_in))

        if not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            val = np.zeros((iid_count_out, sid_count_out),
                           order=order,
                           dtype=dtype)
            bed_fn = SnpReader._name_of_other_file(self.filename, "bed", "bed")

            if iid_count_in > 0 and sid_count_in > 0:
                if dtype == np.float64:
                    if order == "F":
                        wrap_plink_parser.readPlinkBedFile2doubleFAAA(
                            bed_fn.encode('ascii'), iid_count_in, sid_count_in,
                            self.count_A1, iid_index, sid_index, val)
                    elif order == "C":
                        wrap_plink_parser.readPlinkBedFile2doubleCAAA(
                            bed_fn.encode('ascii'), iid_count_in, sid_count_in,
                            self.count_A1, iid_index, sid_index, val)
                    else:
                        raise Exception(
                            "order '{0}' not known, only 'F' and 'C'".format(
                                order))
                elif dtype == np.float32:
                    if order == "F":
                        wrap_plink_parser.readPlinkBedFile2floatFAAA(
                            bed_fn.encode('ascii'), iid_count_in, sid_count_in,
                            self.count_A1, iid_index, sid_index, val)
                    elif order == "C":
                        wrap_plink_parser.readPlinkBedFile2floatCAAA(
                            bed_fn.encode('ascii'), iid_count_in, sid_count_in,
                            self.count_A1, iid_index, sid_index, val)
                    else:
                        raise Exception(
                            "order '{0}' not known, only 'F' and 'C'".format(
                                order))
                else:
                    raise Exception(
                        "dtype '{0}' not known, only float64 and float32".
                        format(dtype))

        else:
            if not self.count_A1:
                byteZero = 0
                byteThree = 2
            else:
                byteZero = 2
                byteThree = 0
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code.
            # Also, note that reading with python will often result in non-contiguous memory, so the python standardizers will automatically be used, too.
            self._open_bed()
            #logging.warn("using pure python plink parser (might be much slower!!)")
            val = np.zeros(
                ((int(np.ceil(0.25 * iid_count_in)) * 4), sid_count_out),
                order=order,
                dtype=dtype)  #allocate it a little big
            for SNPsIndex, bimIndex in enumerate(sid_index):

                startbit = int(np.ceil(0.25 * iid_count_in) * bimIndex + 3)
                self._filepointer.seek(startbit)
                nbyte = int(np.ceil(0.25 * iid_count_in))
                bytes = np.array(bytearray(
                    self._filepointer.read(nbyte))).reshape(
                        (int(np.ceil(0.25 * iid_count_in)), 1), order='F')

                val[3::4, SNPsIndex:SNPsIndex + 1] = byteZero
                val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 64] = np.nan
                val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 128] = 1
                val[3::4, SNPsIndex:SNPsIndex + 1][bytes >= 192] = byteThree
                bytes = np.mod(bytes, 64)
                val[2::4, SNPsIndex:SNPsIndex + 1] = byteZero
                val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 16] = np.nan
                val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 32] = 1
                val[2::4, SNPsIndex:SNPsIndex + 1][bytes >= 48] = byteThree
                bytes = np.mod(bytes, 16)
                val[1::4, SNPsIndex:SNPsIndex + 1] = byteZero
                val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 4] = np.nan
                val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 8] = 1
                val[1::4, SNPsIndex:SNPsIndex + 1][bytes >= 12] = byteThree
                bytes = np.mod(bytes, 4)
                val[0::4, SNPsIndex:SNPsIndex + 1] = byteZero
                val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 1] = np.nan
                val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 2] = 1
                val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree
            val = val[iid_index, :]  #reorder or trim any extra allocation
            if not SnpReader._array_properties_are_ok(val, order, dtype):
                val = val.copy(order=order)
            self._close_bed()

        return val
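
The pure-Python branch above unpacks each byte into four 2-bit genotype codes, starting from the lowest bits. A minimal standalone sketch of that decoding with count_A1=False (so 0b00 -> 0, 0b10 -> 1, 0b11 -> 2, and 0b01 -> missing); the byte value is hypothetical:

    import numpy as np

    code_to_val = {0b00: 0.0, 0b10: 1.0, 0b11: 2.0, 0b01: np.nan}  # count_A1=False mapping

    def decode_byte(byte):
        """Decode one BED byte into four genotype values, lowest two bits first."""
        return [code_to_val[(byte >> (2 * i)) & 0b11] for i in range(4)]

    print(decode_byte(0b11011000))  # [0.0, 1.0, nan, 2.0]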
Example 31
    def write(filename, snpdata, count_A1=False, force_python_only=False):
        """Writes a :class:`SnpData` to Bed format and returns the :class:`.Bed`.

        :param filename: the name of the file to create
        :type filename: string
        :param snpdata: The in-memory data that should be written to disk.
        :type snpdata: :class:`SnpData`
        :param count_A1: Tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool
        :rtype: :class:`.Bed`

        >>> from pysnptools.snpreader import Pheno, Bed
        >>> import pysnptools.util as pstutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> pheno_fn = example_file("pysnptools/examples/toydata.phe")
        >>> snpdata = Pheno(pheno_fn).read()         # Read data from Pheno format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.bed")
        >>> Bed.write("tempdir/toydata.5chrom.bed",snpdata,count_A1=False)   # Write data in Bed format
        Bed('tempdir/toydata.5chrom.bed',count_A1=False)
        """

        if isinstance(filename, SnpData) and isinstance(
                snpdata, str
        ):  #For backwards compatibility, reverse inputs if necessary
            warnings.warn(
                "write statement should have filename before data to write",
                DeprecationWarning)
            filename, snpdata = snpdata, filename

        if count_A1 is None:
            warnings.warn(
                "'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'",
                FutureWarning)
            count_A1 = False

        SnpReader._write_fam(snpdata, filename, remove_suffix="bed")
        SnpReader._write_map_or_bim(snpdata,
                                    filename,
                                    remove_suffix="bed",
                                    add_suffix="bim")

        bedfile = SnpReader._name_of_other_file(filename,
                                                remove_suffix="bed",
                                                add_suffix="bed")

        if not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser

            if snpdata.val.flags["C_CONTIGUOUS"]:
                order = "C"
            elif snpdata.val.flags["F_CONTIGUOUS"]:
                order = "F"
            else:
                raise Exception("order not known (not 'F' or 'C')")

            if snpdata.val.dtype == np.float64:
                if order == "F":
                    wrap_plink_parser.writePlinkBedFile2doubleFAAA(
                        bedfile.encode('ascii'), snpdata.iid_count,
                        snpdata.sid_count, count_A1, snpdata.val)
                else:
                    wrap_plink_parser.writePlinkBedFile2doubleCAAA(
                        bedfile.encode('ascii'), snpdata.iid_count,
                        snpdata.sid_count, count_A1, snpdata.val)
            elif snpdata.val.dtype == np.float32:
                if order == "F":
                    wrap_plink_parser.writePlinkBedFile2floatFAAA(
                        bedfile.encode('ascii'), snpdata.iid_count,
                        snpdata.sid_count, count_A1, snpdata.val)
                else:
                    wrap_plink_parser.writePlinkBedFile2floatCAAA(
                        bedfile.encode('ascii'), snpdata.iid_count,
                        snpdata.sid_count, count_A1, snpdata.val)
            else:
                raise Exception(
                    "dtype '{0}' not known, only float64 and float32".format(
                        snpdata.val.dtype))

        else:
            if not count_A1:
                zero_code = 0b00
                two_code = 0b11
            else:
                zero_code = 0b11
                two_code = 0b00

            with open(bedfile, "wb") as bed_filepointer:
                #see http://zzz.bwh.harvard.edu/plink/binary.shtml
                bed_filepointer.write(bytes(bytearray([0b01101100
                                                       ])))  #magic numbers
                bed_filepointer.write(bytes(bytearray([0b00011011
                                                       ])))  #magic numbers
                bed_filepointer.write(bytes(bytearray([0b00000001
                                                       ])))  #snp major

                for sid_index in range(snpdata.sid_count):
                    if sid_index % 1 == 0:
                        logging.info("Writing snp # {0} to file '{1}'".format(
                            sid_index, filename))

                    col = snpdata.val[:, sid_index]
                    for iid_by_four in range(0, snpdata.iid_count, 4):
                        vals_for_this_byte = col[iid_by_four:iid_by_four + 4]
                        byte = 0b00000000
                        for val_index in range(len(vals_for_this_byte)):
                            val = vals_for_this_byte[val_index]
                            if val == 0:
                                code = zero_code
                            elif val == 1:
                                code = 0b10  #backwards on purpose
                            elif val == 2:
                                code = two_code
                            elif np.isnan(val):
                                code = 0b01  #backwards on purpose
                            else:
                                raise Exception(
                                    "Can't convert value '{0}' to BED format (only 0,1,2,NAN allowed)"
                                    .format(val))
                            byte |= (code << (val_index * 2))
                        bed_filepointer.write(bytes(bytearray([byte])))
        logging.info("Done writing " + filename)
        return Bed(filename, count_A1=count_A1)
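
The inner loop above is the inverse packing: four genotype values become one byte, two bits each, lowest bits first. A minimal standalone sketch of that packing for count_A1=False (the input values are hypothetical):

    import numpy as np

    val_to_code = {0: 0b00, 1: 0b10, 2: 0b11}  # count_A1=False; missing (NaN) becomes 0b01

    def encode_four(vals):
        byte = 0
        for val_index, val in enumerate(vals):
            code = 0b01 if np.isnan(val) else val_to_code[int(val)]
            byte |= code << (val_index * 2)
        return byte

    assert encode_four([0.0, 1.0, np.nan, 2.0]) == 0b11011000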
Example 32
 def row(self):
     """*same as* :attr:`iid`
     """
     if not hasattr(self,"_row"):
         self._row = SnpReader._read_fam(self.filename,remove_suffix="bed")
     return self._row
Example 33
 def copyinputs(self, copier):
     # doesn't need to call self.run_once() because it creates the names of all the files itself
     copier.input(SnpReader._name_of_other_file(self.filename,remove_suffix="dat", add_suffix="dat"))
     copier.input(SnpReader._name_of_other_file(self.filename,remove_suffix="dat", add_suffix="fam"))
     copier.input(SnpReader._name_of_other_file(self.filename,remove_suffix="dat", add_suffix="map"))
Example 34
    def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python_only, view_ok):
        self._run_once()

        if order=='A':
            order='F'

        assert not hasattr(self, 'ind_used'), "A SnpReader should not have an 'ind_used' attribute"

        iid_count_in = self.iid_count
        sid_count_in = self.sid_count

        if iid_index_or_none is not None:
            iid_count_out = len(iid_index_or_none)
            iid_index_out = iid_index_or_none
        else:
            iid_count_out = iid_count_in
            iid_index_out = range(iid_count_in)

        if sid_index_or_none is not None:
            sid_count_out = len(sid_index_or_none)
            sid_index_out = sid_index_or_none
        else:
            sid_count_out = sid_count_in
            sid_index_out = range(sid_count_in)

        if not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            val = np.zeros((iid_count_out, sid_count_out), order=order, dtype=dtype)
            bed_fn = SnpReader._name_of_other_file(self.filename,"bed","bed")

            if iid_count_in > 0 and sid_count_in > 0:
                if dtype == np.float64:
                    if order=="F":
                        wrap_plink_parser.readPlinkBedFile2doubleFAAA(bed_fn.encode('ascii'), iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
                    elif order=="C":
                        wrap_plink_parser.readPlinkBedFile2doubleCAAA(bed_fn.encode('ascii'), iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
                    else:
                        raise Exception("order '{0}' not known, only 'F' and 'C'".format(order));
                elif dtype == np.float32:
                    if order=="F":
                        wrap_plink_parser.readPlinkBedFile2floatFAAA(bed_fn.encode('ascii'), iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
                    elif order=="C":
                        wrap_plink_parser.readPlinkBedFile2floatCAAA(bed_fn.encode('ascii'), iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)
                    else:
                        raise Exception("order '{0}' not known, only 'F' and 'C'".format(order));
                else:
                    raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
            
        else:
            if not self.count_A1:
                byteZero = 0
                byteThree = 2
            else:
                byteZero = 2
                byteThree = 0
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code. 
            # Also, note that reading with python will often result in non-contiguous memory, so the python standardizers will automatically be used, too.       
            self._open_bed()
            logging.warning("using pure python plink parser (might be much slower!!)")
            val = np.zeros(((int(np.ceil(0.25*iid_count_in))*4),sid_count_out),order=order, dtype=dtype) #allocate it a little big
            for SNPsIndex, bimIndex in enumerate(sid_index_out):

                startbit = int(np.ceil(0.25*iid_count_in)*bimIndex+3)
                self._filepointer.seek(startbit)
                nbyte = int(np.ceil(0.25*iid_count_in))
                bytes = np.array(bytearray(self._filepointer.read(nbyte))).reshape((int(np.ceil(0.25*iid_count_in)),1),order='F')

                val[3::4,SNPsIndex:SNPsIndex+1]=byteZero
                val[3::4,SNPsIndex:SNPsIndex+1][bytes>=64]=np.nan
                val[3::4,SNPsIndex:SNPsIndex+1][bytes>=128]=1
                val[3::4,SNPsIndex:SNPsIndex+1][bytes>=192]=byteThree
                bytes=np.mod(bytes,64)
                val[2::4,SNPsIndex:SNPsIndex+1]=byteZero
                val[2::4,SNPsIndex:SNPsIndex+1][bytes>=16]=np.nan
                val[2::4,SNPsIndex:SNPsIndex+1][bytes>=32]=1
                val[2::4,SNPsIndex:SNPsIndex+1][bytes>=48]=byteThree
                bytes=np.mod(bytes,16)
                val[1::4,SNPsIndex:SNPsIndex+1]=byteZero
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=4]=np.nan
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=8]=1
                val[1::4,SNPsIndex:SNPsIndex+1][bytes>=12]=byteThree
                bytes=np.mod(bytes,4)
                val[0::4,SNPsIndex:SNPsIndex+1]=byteZero
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=1]=np.nan
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=2]=1
                val[0::4,SNPsIndex:SNPsIndex+1][bytes>=3]=byteThree
            val = val[iid_index_out,:] #reorder or trim any extra allocation


            #!!LATER this can fail because the trim statement above messes up the order
            #assert(SnpReader._array_properties_are_ok(val, order, dtype)) #!!
            self._close_bed()

        return val
Example 35
    def write(filename, snpdata):
        """Writes a :class:`SnpData` to Ped format. The values must be 0,1,2. The direction of the encoding to allele pairs is arbitrary. This means
        that if a SnpData is written in Ped format and then read back, then 0's may become 2's and 2's may become 0's. (1's will stay 1's).
        Returns the :class:`.Ped`

        :param filename: the name of the file to create
        :type filename: string
        :param snpdata: The in-memory data that should be written to disk.
        :type snpdata: :class:`SnpData`
        :rtype: :class:`.Ped`

        >>> from pysnptools.snpreader import Ped, Bed
        >>> import pysnptools.util as pstutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> bed_file = example_file("pysnptools/examples/toydata.5chrom.*","*.bed")
        >>> snpdata = Bed(bed_file,count_A1=False)[:,:10].read()  # Read first 10 snps from Bed format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata10.ped")
        >>> Ped.write("tempdir/toydata10.ped",snpdata)            # Write data in Ped format
        Ped('tempdir/toydata10.ped')
        """

        if isinstance(filename, SnpData) and isinstance(
                snpdata, str
        ):  #For backwards compatibility, reverse inputs if necessary
            warnings.warn(
                "write statement should have filename before data to write",
                DeprecationWarning)
            filename, snpdata = snpdata, filename

        SnpReader._write_map_or_bim(snpdata,
                                    filename,
                                    remove_suffix="ped",
                                    add_suffix="map")

        # The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:
        # Family ID
        # Individual ID
        # Paternal ID
        # Maternal ID
        # Sex (1=male; 2=female; other=unknown)
        # Phenotype
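        # Hypothetical example of one line the loop below produces for an individual with two
        # SNPs (tabs separate the genotype pairs): "FAM001 IND001 0 0 0 0\tA A\tG G"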

        pedfile = SnpReader._name_of_other_file(filename,
                                                remove_suffix="ped",
                                                add_suffix="ped")
        with open(pedfile, "w") as ped_filepointer:
            for iid_index, iid_row in enumerate(snpdata.iid):
                ped_filepointer.write("{0} {1} 0 0 0 0".format(
                    iid_row[0], iid_row[1]))
                row = snpdata.val[iid_index, :]
                for sid_index, val in enumerate(row):
                    if val == 0:
                        s = "A A"
                    elif val == 1:
                        s = "A G"
                    elif val == 2:
                        s = "G G"
                    elif np.isnan(val):
                        s = "0 0"
                    else:
                        raise Exception(
                            "Expect values for ped file to be 0,1,2, or NAN. Instead, saw '{0}'"
                            .format(val))
                    ped_filepointer.write("\t" + s)
                ped_filepointer.write("\n")
        return Ped(filename)
Example 36
 def copyinputs(self, copier):
     # doesn't need to call self.run_once() because it only uses the original inputs
     copier.input(self.filename)
     copier.input(SnpReader._name_of_other_file(self.filename,remove_suffix="ped", add_suffix="map"))
Example 37
 def col_property(self):
     """*same as* :attr:`pos`
     """
     if not hasattr(self,"_col_property"):
         self._col, self._col_property = SnpReader._read_map_or_bim(self.filename,remove_suffix="bed", add_suffix="bim")
     return self._col_property
Example 38
    def write(filename, snpdata, count_A1=False, force_python_only=False):
        """Writes a :class:`SnpData` to Bed format.

        :param filename: the name of the file to create
        :type filename: string
        :param snpdata: The in-memory data that should be written to disk.
        :type snpdata: :class:`SnpData`
        :param count_A1: Tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        >>> from pysnptools.snpreader import Pheno, Bed
        >>> import pysnptools.util as pstutil
        >>> snpdata = Pheno('../examples/toydata.phe').read()         # Read data from Pheno format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata.bed")
        >>> Bed.write("tempdir/toydata.bed",snpdata,count_A1=False)   # Write data in Bed format
        """

        if isinstance(filename,SnpData) and isinstance(snpdata,str): #For backwards compatibility, reverse inputs if necessary
            warnings.warn("write statement should have filename before data to write", DeprecationWarning)
            filename, snpdata = snpdata, filename 

        if count_A1 is None:
             warnings.warn("'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'", FutureWarning)
             count_A1 = False

        SnpReader._write_fam(snpdata, filename, remove_suffix="bed")
        SnpReader._write_map_or_bim(snpdata, filename, remove_suffix="bed", add_suffix="bim")

        bedfile = SnpReader._name_of_other_file(filename,remove_suffix="bed", add_suffix="bed")

        if not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser

            if snpdata.val.flags["C_CONTIGUOUS"]:
                order = "C"
            elif snpdata.val.flags["F_CONTIGUOUS"]:
                order = "F"
            else:
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))

            if snpdata.val.dtype == np.float64:
                if order=="F":
                    wrap_plink_parser.writePlinkBedFile2doubleFAAA(bedfile.encode('ascii'), snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
                else:
                    wrap_plink_parser.writePlinkBedFile2doubleCAAA(bedfile.encode('ascii'), snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
            elif snpdata.val.dtype == np.float32:
                if order=="F":
                    wrap_plink_parser.writePlinkBedFile2floatFAAA(bedfile.encode('ascii'), snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
                else:
                    wrap_plink_parser.writePlinkBedFile2floatCAAA(bedfile.encode('ascii'), snpdata.iid_count, snpdata.sid_count, count_A1, snpdata.val)
            else:
                raise Exception("dtype '{0}' not known, only float64 and float32".format(snpdata.val.dtype))
            
        else:
            if not count_A1:
                zero_code = 0b00
                two_code = 0b11
            else:
                zero_code = 0b11
                two_code = 0b00

            with open(bedfile,"wb") as bed_filepointer:
                #see http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml
                bed_filepointer.write(bytes(bytearray([0b01101100]))) #magic numbers
                bed_filepointer.write(bytes(bytearray([0b00011011]))) #magic numbers
                bed_filepointer.write(bytes(bytearray([0b00000001]))) #snp major

                for sid_index in range(snpdata.sid_count):
                    if sid_index % 1 == 0:
                        logging.info("Writing snp # {0} to file '{1}'".format(sid_index, filename))

                    col = snpdata.val[:, sid_index]
                    for iid_by_four in range(0,snpdata.iid_count,4):
                        vals_for_this_byte = col[iid_by_four:iid_by_four+4]
                        byte = 0b00000000
                        for val_index in range(len(vals_for_this_byte)):
                            val = vals_for_this_byte[val_index]
                            if val == 0:
                                code = zero_code
                            elif val == 1:
                                code = 0b10 #backwards on purpose
                            elif val == 2:
                                code = two_code
                            elif np.isnan(val):
                                code = 0b01 #backwards on purpose
                            else:
                                raise Exception("Can't convert value '{0}' to BED format (only 0,1,2,NAN allowed)".format(val))
                            byte |= (code << (val_index*2))
                        bed_filepointer.write(bytes(bytearray([byte])))
        logging.info("Done writing " + filename)
Example 39
    def write(
        filename,
        snpdata,
        count_A1=False,
        force_python_only=False,
        _require_float32_64=True,
        num_threads=None,
        reverse_chrom_map = {},
    ):
        """Writes a :class:`SnpData` to Bed format and returns the :class:`.Bed`.

        :param filename: the name of the file to create
        :type filename: string
        :param snpdata: The in-memory data that should be written to disk.
        :type snpdata: :class:`SnpData`
        :param count_A1: Tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool
        :param force_python_only: Defaults to False. Tells to use Python code rather than faster Rust code. (Might be useful for debugging).
        :type force_python_only: bool
        :param _require_float32_64: Defaults to True. Requires that snpdata's dtype is float32 or float64. (False is useful for writing int8 data.)
        :type _require_float32_64: bool
        :param num_threads: Maximum number of threads to use. Currently ignored and number of threads is 1.
        :type num_threads: int
        :param reverse_chrom_map: Dictionary from chromosome number to chromosome string to write in the \*.bim file. Defaults to empty dictionary.
        :type reverse_chrom_map: dictionary
        :rtype: :class:`.Bed`

        Any :attr:`pos` values of NaN will be written as 0, the PLINK standard for missing chromosome and position values. 

        >>> import numpy as np
        >>> from pysnptools.snpreader import Bed, SnpData
        >>> import pysnptools.util as pstutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> bed_fn = example_file("pysnptools/examples/toydata.5chrom.bed")
        >>> snpdata = Bed(bed_fn)[:,::2].read() # Read every-other SNP
        >>> pstutil.create_directory_if_necessary("tempdir/everyother.bed")
        >>> Bed.write("tempdir/everyother.bed",snpdata,count_A1=False)   # Write data in Bed format
        Bed('tempdir/everyother.bed',count_A1=False)
        >>> # Can write from an int8 array, too.
        >>> snpdata_int = SnpData(val=np.int_(snpdata.val).astype('int8'),iid=snpdata.iid,sid=snpdata.sid,pos=snpdata.pos,_require_float32_64=False)
        >>> snpdata_int.val.dtype
        dtype('int8')
        >>> Bed.write("tempdir/everyother.bed",snpdata_int,count_A1=False,_require_float32_64=False)
        Bed('tempdir/everyother.bed',count_A1=False)
        """

        if isinstance(filename, SnpData) and isinstance(
            snpdata, str
        ):  # For backwards compatibility, reverse inputs if necessary
            warnings.warn(
                "write statement should have filename before data to write",
                DeprecationWarning,
            )
            filename, snpdata = snpdata, filename

        if count_A1 is None:
            warnings.warn(
                "'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'",
                FutureWarning,
            )
            count_A1 = False

        filename = SnpReader._name_of_other_file(filename,remove_suffix="bed", add_suffix="bed")

        chromosome = snpdata.pos[:, 0]
        intersection = reverse_chrom_map.keys() & chromosome
        if len(intersection) > 0:
            chromosome = chromosome.astype("object")
            for key in intersection:
                chromosome[chromosome==key]=reverse_chrom_map[key]

        to_bed(
            filename,
            val=snpdata.val,
            properties={
                "fid": snpdata.iid[:, 0],
                "iid": snpdata.iid[:, 1],
                "sid": snpdata.sid,
                "chromosome": chromosome,
                "cm_position": snpdata.pos[:, 1],
                "bp_position": snpdata.pos[:, 2],
            },
            count_A1=count_A1,
            force_python_only=force_python_only,
            num_threads=num_threads,
        )

        return Bed(filename, count_A1=count_A1)
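
A minimal sketch of how the reverse_chrom_map step above behaves (the mapping and chromosome values are hypothetical): numeric chromosome codes that appear as keys are swapped for their string names before to_bed writes the *.bim file.

    import numpy as np

    reverse_chrom_map = {23: "X", 24: "Y"}        # hypothetical mapping
    chromosome = np.array([1.0, 23.0, 24.0])      # as taken from snpdata.pos[:, 0]
    intersection = reverse_chrom_map.keys() & set(chromosome)
    if len(intersection) > 0:
        chromosome = chromosome.astype("object")
        for key in intersection:
            chromosome[chromosome == key] = reverse_chrom_map[key]
    print(chromosome)                             # [1.0 'X' 'Y']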