Beispiel #1
0
    def _read(self, iid_index_or_none, sid_index_or_none, order, dtype,
              force_python_only, view_ok):
        """Read the requested individuals and SNPs from the .bed file.

        :param iid_index_or_none: indexes of the individuals (rows) to return,
            or None for all ``self.iid_count`` individuals in file order.
        :param sid_index_or_none: indexes of the SNPs (columns) to return, or
            None for all ``self.sid_count`` SNPs in file order.
        :param order: memory layout of the result, 'F' or 'C'; 'A' is treated
            as 'F'.
        :param dtype: element type of the result; normalised with
            ``np.dtype``. The native reader accepts only float64 and float32.
        :param force_python_only: when true, decode the file with numpy
            instead of calling ``wrap_plink_parser``.
        :param view_ok: not used by this implementation.
        :returns: an array with one row per requested individual and one
            column per requested SNP. In the pure Python path the 2-bit code 1
            is stored as NaN.
        :raises Exception: from the native path when ``order`` or ``dtype`` is
            not supported.
        """
        self._run_once()

        if order == 'A':
            order = 'F'
        dtype = np.dtype(dtype)

        assert not hasattr(
            self,
            'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

        iid_count_in = self.iid_count
        sid_count_in = self.sid_count

        if iid_index_or_none is not None:
            iid_count_out = len(iid_index_or_none)
            iid_index = iid_index_or_none
        else:
            iid_count_out = iid_count_in
            iid_index = list(range(iid_count_in))

        if sid_index_or_none is not None:
            sid_count_out = len(sid_index_or_none)
            sid_index = sid_index_or_none
        else:
            sid_count_out = sid_count_in
            sid_index = list(range(sid_count_in))

        if not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            val = np.zeros((iid_count_out, sid_count_out),
                           order=order,
                           dtype=dtype)
            bed_fn = SnpReader._name_of_other_file(self.filename, "bed", "bed")

            # The native reader is skipped when the file has no individuals
            # or no SNPs; the freshly allocated array is returned unchanged.
            if iid_count_in > 0 and sid_count_in > 0:
                # Pick the native routine matching (dtype, order), then call
                # it once; it fills ``val`` in place.
                if dtype == np.float64:
                    if order == "F":
                        reader = wrap_plink_parser.readPlinkBedFile2doubleFAAA
                    elif order == "C":
                        reader = wrap_plink_parser.readPlinkBedFile2doubleCAAA
                    else:
                        raise Exception(
                            "order '{0}' not known, only 'F' and 'C'".format(
                                order))
                elif dtype == np.float32:
                    if order == "F":
                        reader = wrap_plink_parser.readPlinkBedFile2floatFAAA
                    elif order == "C":
                        reader = wrap_plink_parser.readPlinkBedFile2floatCAAA
                    else:
                        raise Exception(
                            "order '{0}' not known, only 'F' and 'C'".format(
                                order))
                else:
                    raise Exception(
                        "dtype '{0}' not known, only float64 and float32".
                        format(dtype))
                reader(bed_fn.encode('ascii'), iid_count_in, sid_count_in,
                       self.count_A1, iid_index, sid_index, val)

        else:
            # Values stored for the 2-bit codes 0 and 3; which allele is
            # counted swaps them.
            if not self.count_A1:
                byteZero = 0
                byteThree = 2
            else:
                byteZero = 2
                byteThree = 0
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code.
            # Also, note that reading with python will often result in non-contiguous memory, so the python standardizers will automatically be used, too.

            # Each byte packs four individuals, so one SNP occupies
            # ceil(iid_count / 4) bytes. Loop-invariant, so computed once.
            bytes_per_snp = np.ceil(0.25 * iid_count_in)
            nbyte = int(bytes_per_snp)

            self._open_bed()
            try:
                #logging.warn("using pure python plink parser (might be much slower!!)")
                val = np.zeros((nbyte * 4, sid_count_out),
                               order=order,
                               dtype=dtype)  #allocate it a little big
                for SNPsIndex, bimIndex in enumerate(sid_index):
                    # SNP data starts 3 bytes into the file.
                    startbit = int(bytes_per_snp * bimIndex + 3)
                    self._filepointer.seek(startbit)
                    raw = np.array(bytearray(
                        self._filepointer.read(nbyte))).reshape(
                            (nbyte, 1), order='F')

                    # Peel the four 2-bit codes off each byte, highest pair
                    # first. ``low`` is the weight of the pair's low bit, so
                    # the thresholds low, 2*low, 3*low select codes 1, 2, 3.
                    for slot, low in ((3, 64), (2, 16), (1, 4), (0, 1)):
                        column = val[slot::4, SNPsIndex:SNPsIndex + 1]
                        column[...] = byteZero
                        column[raw >= low] = np.nan
                        column[raw >= 2 * low] = 1
                        column[raw >= 3 * low] = byteThree
                        raw = np.mod(raw, low)
            finally:
                # Release the file handle even if seeking/reading/decoding
                # raised, so a failed read does not leak it.
                self._close_bed()

            val = val[iid_index, :]  #reorder or trim any extra allocation
            # Fancy indexing may not preserve the requested layout.
            if not SnpReader._array_properties_are_ok(val, order, dtype):
                val = val.copy(order=order)

        return val
Beispiel #2
0
    def read_with_specification(snpset_withbbed,
                                order="F",
                                dtype=SP.float64,
                                force_python_only=False):
        """Read the SNPs selected by ``snpset_withbbed`` from its .bed file.

        :param snpset_withbbed: object exposing a ``bed`` attribute; it is
            passed to ``bed.counts_and_indexes`` and, in the pure Python path,
            iterated to obtain the index of each SNP to read.
        :param order: memory layout of the SNP matrix, 'F' or 'C'.
        :param dtype: element type of the SNP matrix; the native reader
            accepts only float64 and float32.
        :param force_python_only: when true, skip the native parser even if
            it is available.
        :returns: dict with keys 'rs', 'pos', 'snps' and 'iid'.
        :raises Exception: from the native path when ``order`` or ``dtype`` is
            not supported.
        """
        # doesn't need to self.run_once() because it is static
        decide_once_on_plink_reader()
        global WRAPPED_PLINK_PARSER_PRESENT

        bed = snpset_withbbed.bed
        iid_count_in, iid_count_out, iid_index_out, snp_count_in, snp_count_out, snp_index_out = bed.counts_and_indexes(
            snpset_withbbed)

        if WRAPPED_PLINK_PARSER_PRESENT and not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            SNPs = SP.zeros((iid_count_out, snp_count_out),
                            order=order,
                            dtype=dtype)
            bed_fn = bed.basefilename + ".bed"
            count_A1 = False

            # Pick the native routine matching (dtype, order), then call it
            # once; it fills ``SNPs`` in place.
            if dtype == SP.float64:
                if order == "F":
                    reader = wrap_plink_parser.readPlinkBedFile2doubleFAAA
                elif order == "C":
                    reader = wrap_plink_parser.readPlinkBedFile2doubleCAAA
                else:
                    raise Exception(
                        "order '{0}' not known, only 'F' and 'C'".format(
                            order))
            elif dtype == SP.float32:
                if order == "F":
                    reader = wrap_plink_parser.readPlinkBedFile2floatFAAA
                elif order == "C":
                    reader = wrap_plink_parser.readPlinkBedFile2floatCAAA
                else:
                    # This branch is a bad *order*; report it as such.
                    raise Exception(
                        "order '{0}' not known, only 'F' and 'C'".format(
                            order))
            else:
                # An unsupported dtype must fail loudly rather than return
                # the untouched all-zero matrix.
                raise Exception(
                    "dtype '{0}' not known, only float64 and float32".
                    format(dtype))
            reader(bed_fn, iid_count_in, snp_count_in, count_A1,
                   iid_index_out, snp_index_out, SNPs)

        else:
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code.
            # Also, note that reading with python will often result in non-contiguous memory, so the python standardizers will automatically be used, too.
            logging.warning(
                "using pure python plink parser (might be much slower!!)")

            # Each byte packs four individuals, so one SNP occupies
            # ceil(iid_count / 4) bytes. Loop-invariant, so computed once.
            bytes_per_snp = SP.ceil(0.25 * iid_count_in)
            nbyte = int(bytes_per_snp)

            SNPs = SP.zeros((nbyte * 4, snp_count_out),
                            order=order,
                            dtype=dtype)  #allocate it a little big
            for SNPsIndex, bimIndex in enumerate(snpset_withbbed):
                # SNP data starts 3 bytes into the file.
                startbit = int(bytes_per_snp * bimIndex + 3)
                bed._filepointer.seek(startbit)
                raw = SP.array(bytearray(
                    bed._filepointer.read(nbyte))).reshape((nbyte, 1),
                                                           order='F')

                # Peel the four 2-bit codes off each byte, highest pair
                # first. ``low`` is the weight of the pair's low bit, so the
                # thresholds low, 2*low, 3*low select codes 1, 2, 3; code 0
                # keeps the zero the matrix was initialised with.
                for slot, low in ((3, 64), (2, 16), (1, 4), (0, 1)):
                    column = SNPs[slot::4, SNPsIndex:SNPsIndex + 1]
                    column[raw >= low] = SP.nan
                    column[raw >= 2 * low] = 1
                    column[raw >= 3 * low] = 2
                    raw = SP.mod(raw, low)
            SNPs = SNPs[
                iid_index_out, :]  #reorder or trim any extra allocation

        ret = {
            'rs': bed.rs[snp_index_out],
            'pos': bed.pos[snp_index_out, :],
            'snps': SNPs,
            'iid': bed.original_iids[iid_index_out, :]
        }
        return ret
Beispiel #3
0
    def read_with_specification(snpset_withbbed, order="F", dtype=SP.float64, force_python_only=False):
        """Load the SNP matrix and its metadata for ``snpset_withbbed``.

        :param snpset_withbbed: object with a ``bed`` attribute. It is handed to
            ``bed.counts_and_indexes`` and, when the pure Python parser is
            used, iterated to get the index of every SNP to read.
        :param order: 'F' or 'C' memory layout for the returned matrix.
        :param dtype: element type for the returned matrix; the native parser
            supports float64 and float32 only.
        :param force_python_only: if true, never use the native parser.
        :returns: dict with keys 'rs', 'pos', 'snps' and 'iid'.
        :raises Exception: in the native path, for an unsupported ``order`` or
            ``dtype``.
        """
        # doesn't need to self.run_once() because it is static
        decide_once_on_plink_reader()
        global WRAPPED_PLINK_PARSER_PRESENT

        bed = snpset_withbbed.bed
        iid_count_in, iid_count_out, iid_index_out, snp_count_in, snp_count_out, snp_index_out = bed.counts_and_indexes(snpset_withbbed)

        if WRAPPED_PLINK_PARSER_PRESENT and not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            SNPs = SP.zeros((iid_count_out, snp_count_out), order=order, dtype=dtype)
            bed_fn = bed.basefilename + ".bed"
            count_A1 = False

            # Validate dtype first, then order, so each failure gets the
            # message that actually describes it.
            if dtype == SP.float64:
                c_type = "double"
            elif dtype == SP.float32:
                c_type = "float"
            else:
                # Previously an unsupported dtype fell through silently and
                # the untouched all-zero matrix was returned.
                raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))

            if order not in ("F", "C"):
                # Previously the float32 branch reported this as a dtype error.
                raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))

            if c_type == "double":
                if order == "F":
                    native_read = wrap_plink_parser.readPlinkBedFile2doubleFAAA
                else:
                    native_read = wrap_plink_parser.readPlinkBedFile2doubleCAAA
            else:
                if order == "F":
                    native_read = wrap_plink_parser.readPlinkBedFile2floatFAAA
                else:
                    native_read = wrap_plink_parser.readPlinkBedFile2floatCAAA
            # The native routine fills SNPs in place.
            native_read(bed_fn, iid_count_in, snp_count_in, count_A1, iid_index_out, snp_index_out, SNPs)

        else:
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code.
            # Also, note that reading with python will often result in non-contiguous memory, so the python standardizers will automatically be used, too.
            logging.warning("using pure python plink parser (might be much slower!!)")

            # One SNP occupies ceil(iid_count / 4) bytes (four individuals per
            # byte); this does not change inside the loop.
            bytes_per_snp = SP.ceil(0.25*iid_count_in)
            nbyte = int(bytes_per_snp)

            SNPs = SP.zeros((nbyte*4, snp_count_out), order=order, dtype=dtype) #allocate it a little big
            for SNPsIndex, bimIndex in enumerate(snpset_withbbed):
                # SNP data starts 3 bytes into the file.
                startbit = int(bytes_per_snp*bimIndex+3)
                bed._filepointer.seek(startbit)
                packed = SP.array(bytearray(bed._filepointer.read(nbyte))).reshape((nbyte, 1), order='F')

                # Decode the four 2-bit codes per byte from the highest pair
                # down. With ``low`` the weight of a pair's low bit, values
                # >= low, 2*low, 3*low correspond to codes 1 (stored as NaN),
                # 2 (stored as 1) and 3 (stored as 2); code 0 stays 0.
                for slot, low in ((3, 64), (2, 16), (1, 4), (0, 1)):
                    target = SNPs[slot::4, SNPsIndex:SNPsIndex+1]
                    target[packed >= low] = SP.nan
                    target[packed >= 2*low] = 1
                    target[packed >= 3*low] = 2
                    packed = SP.mod(packed, low)
            SNPs = SNPs[iid_index_out,:] #reorder or trim any extra allocation

        ret = {
                'rs'     :bed.rs[snp_index_out],
                'pos'    :bed.pos[snp_index_out,:],
                'snps'   :SNPs,
                'iid'    :bed.original_iids[iid_index_out,:]
                }
        return ret
Beispiel #4
0
    def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python_only, view_ok):
        """Read the requested individuals and SNPs from the .bed file.

        :param iid_index_or_none: indexes of the individuals (rows) to return,
            or None for all ``self.iid_count`` individuals in file order.
        :param sid_index_or_none: indexes of the SNPs (columns) to return, or
            None for all ``self.sid_count`` SNPs in file order.
        :param order: memory layout of the result, 'F' or 'C'; 'A' is treated
            as 'F'.
        :param dtype: element type of the result; anything ``np.dtype``
            accepts. The native reader supports only float64 and float32.
        :param force_python_only: when true, decode the file with numpy
            instead of calling ``wrap_plink_parser``.
        :param view_ok: not used by this implementation.
        :returns: an array with one row per requested individual and one
            column per requested SNP. In the pure Python path the 2-bit code 1
            is stored as NaN.
        :raises Exception: from the native path when ``order`` or ``dtype`` is
            not supported.
        """
        self._run_once()

        if order=='A':
            order='F'
        # Normalise so that strings such as 'float64' compare equal to
        # np.float64 in the dispatch below.
        dtype = np.dtype(dtype)

        assert not hasattr(self, 'ind_used'), "A SnpReader should not have a 'ind_used' attribute"

        iid_count_in = self.iid_count
        sid_count_in = self.sid_count

        # Default indexes are materialised as lists (what Python 2's range
        # returned) rather than left as lazy range objects.
        if iid_index_or_none is not None:
            iid_count_out = len(iid_index_or_none)
            iid_index_out = iid_index_or_none
        else:
            iid_count_out = iid_count_in
            iid_index_out = list(range(iid_count_in))

        if sid_index_or_none is not None:
            sid_count_out = len(sid_index_or_none)
            sid_index_out = sid_index_or_none
        else:
            sid_count_out = sid_count_in
            sid_index_out = list(range(sid_count_in))

        if not force_python_only:
            from pysnptools.snpreader import wrap_plink_parser
            val = np.zeros((iid_count_out, sid_count_out), order=order, dtype=dtype)
            bed_fn = SnpReader._name_of_other_file(self.filename,"bed","bed")

            # The native reader is skipped when the file has no individuals
            # or no SNPs; the freshly allocated array is returned unchanged.
            if iid_count_in > 0 and sid_count_in > 0:
                # Pick the native routine matching (dtype, order), then call
                # it once; it fills ``val`` in place.
                if dtype == np.float64:
                    if order=="F":
                        native_read = wrap_plink_parser.readPlinkBedFile2doubleFAAA
                    elif order=="C":
                        native_read = wrap_plink_parser.readPlinkBedFile2doubleCAAA
                    else:
                        raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
                elif dtype == np.float32:
                    if order=="F":
                        native_read = wrap_plink_parser.readPlinkBedFile2floatFAAA
                    elif order=="C":
                        native_read = wrap_plink_parser.readPlinkBedFile2floatCAAA
                    else:
                        raise Exception("order '{0}' not known, only 'F' and 'C'".format(order))
                else:
                    raise Exception("dtype '{0}' not known, only float64 and float32".format(dtype))
                native_read(bed_fn.encode('ascii'), iid_count_in, sid_count_in, self.count_A1, iid_index_out, sid_index_out, val)

        else:
            # Values stored for the 2-bit codes 0 and 3; which allele is
            # counted swaps them.
            if not self.count_A1:
                byteZero = 0
                byteThree = 2
            else:
                byteZero = 2
                byteThree = 0
            # An earlier version of this code had a way to read consecutive SNPs of code in one read. May want
            # to add that ability back to the code.
            # Also, note that reading with python will often result in non-contiguous memory, so the python standardizers will automatically be used, too.

            # One SNP occupies ceil(iid_count / 4) bytes (four individuals per
            # byte); this does not change inside the loop.
            bytes_per_snp = np.ceil(0.25*iid_count_in)
            nbyte = int(bytes_per_snp)

            self._open_bed()
            try:
                logging.warning("using pure python plink parser (might be much slower!!)")
                val = np.zeros((nbyte*4, sid_count_out), order=order, dtype=dtype) #allocate it a little big
                for SNPsIndex, bimIndex in enumerate(sid_index_out):
                    # SNP data starts 3 bytes into the file.
                    startbit = int(bytes_per_snp*bimIndex+3)
                    self._filepointer.seek(startbit)
                    packed = np.array(bytearray(self._filepointer.read(nbyte))).reshape((nbyte, 1), order='F')

                    # Decode the four 2-bit codes per byte from the highest
                    # pair down. With ``low`` the weight of a pair's low bit,
                    # values >= low, 2*low, 3*low correspond to codes 1, 2, 3.
                    for slot, low in ((3, 64), (2, 16), (1, 4), (0, 1)):
                        target = val[slot::4, SNPsIndex:SNPsIndex+1]
                        target[...] = byteZero
                        target[packed >= low] = np.nan
                        target[packed >= 2*low] = 1
                        target[packed >= 3*low] = byteThree
                        packed = np.mod(packed, low)
            finally:
                # Close the file even when seeking/reading/decoding raised,
                # so a failed read does not leak the handle.
                self._close_bed()

            val = val[iid_index_out,:] #reorder or trim any extra allocation
            # The fancy-index trim above can lose the requested memory order;
            # copy into the requested layout when that happens.
            if not SnpReader._array_properties_are_ok(val, order, dtype):
                val = val.copy(order=order)

        return val