def __init__(self, iid, test=None):  #!!! add docs and test for test
    """Store normalized ids for the two axes.

    :param iid: ids normalized with ``PstData._fixup_input`` as a 'str' array.
    :param test: optional second set of ids. When it is ``None`` or the very
        same object as ``iid``, the normalized ``iid`` array is reused for it.
    """
    super(Identity, self).__init__()

    def _as_str_ids(ids):
        # Shared normalization for both inputs.
        return PstData._fixup_input(
            ids,
            empty_creator=lambda ignore: np.empty([0, 2], dtype='str'),
            dtype='str')

    # Decide before 'iid' is rebound whether both axes share one array.
    share_ids = test is None or test is iid
    iid = _as_str_ids(iid)
    test = iid if share_ids else _as_str_ids(test)

    # Zero-length inputs fall back to the instance's '_empty' value.
    self._row0 = iid if len(iid) > 0 else self._empty
    self._row1 = test if len(test) > 0 else self._empty
def __init__(self, filename, count_A1=None, iid=None, sid=None, pos=None, skip_format_check=False):
    """Record the file name and options; optionally pre-set iid, sid and pos.

    :param filename: stored as ``self.filename``.
    :param count_A1: stored as ``self.count_A1``; ``None`` triggers a
        ``FutureWarning`` and is treated as ``False``.
    :param iid: if given, normalized and stored as ``self._row``.
    :param sid: if given, normalized and stored as ``self._col``.
    :param pos: if given, normalized (sized by ``len(self._col)``) and stored
        as ``self._col_property``.
    :param skip_format_check: stored as ``self.skip_format_check``.
    """
    #!!!document these new optionals. they are here
    super(Bed, self).__init__()

    self._ran_once = False
    self._file_pointer = None
    self.filename = filename

    if count_A1 is None:
        warnings.warn(
            "'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'",
            FutureWarning)
        count_A1 = False
    self.count_A1 = count_A1
    self.skip_format_check = skip_format_check

    def _no_iid(ignore):
        return np.empty([0, 2], dtype=str)

    def _no_sid(ignore):
        return np.empty([0], dtype=str)

    def _nan_pos(count):
        return np.array([[np.nan, np.nan, np.nan]] * count)

    if iid is not None:
        self._row = PstData._fixup_input(iid, empty_creator=_no_iid)

    if sid is not None:
        self._col = PstData._fixup_input(sid, empty_creator=_no_sid)

    if pos is not None:
        self._col_property = PstData._fixup_input(
            pos, count=len(self._col), empty_creator=_nan_pos)
def cmktest_repr_test(self):
    """Build a 3x2 PstData with pair-valued col ids, look one up, and call ``str`` on it."""
    np.random.seed(0)
    row_props = np.array([[1.0, 2, 2.5], [3, 4, 4.5], [5, 6, 6.5]])
    col_props = np.array([[1.0, 2, 2.5, 1], [3, 4, 4.5, 3]])
    row_ids = np.array([[1.0, 2], [3, 4], [5, 6]])
    col_ids = np.array([("A", "a"), ("B", "b")])
    values = np.random.normal(.5, 2, size=(3, 2))

    pstdata = PstData(row=row_ids,
                      col=col_ids,
                      val=values,
                      row_property=row_props,
                      col_property=col_props)

    found = pstdata.col_to_index([("B", "b")])
    assert found[0] == 1
    # The string itself is not checked; the call only has to succeed.
    s = str(pstdata)
def _run_once(self):
    """Initialize the PstData part of this object unless ``_ran_once`` is already set.

    The five values come from ``self._run_once_inner()`` and the name is
    derived from ``self._filename``.
    """
    if not self._ran_once:
        rows, cols, values, row_props, col_props = self._run_once_inner()
        display_name = "np.memmap('{0}')".format(self._filename)
        PstData.__init__(self, rows, cols, values, row_props, col_props,
                         name=display_name)
def __init__(self, iid, sid, val, pos=None, name=None, parent_string=None, copyinputs_function=None):
    """Normalize iid/sid/pos/val and store them on the instance.

    :param iid: row ids; an empty default has shape (0, 2).
    :param sid: column ids; an empty default has shape (0,).
    :param val: values, normalized against the row and column counts.
    :param pos: optional column properties; the default is NaN-filled.
    :param name: stored as ``self._name`` (falls back to ``parent_string``, then "").
    :param parent_string: deprecated alias of ``name``; emits a ``DeprecationWarning``.
    :param copyinputs_function: accepted but not used by this constructor.
    """
    if parent_string is not None:
        warnings.warn("'parent_string' is deprecated. Use 'name'", DeprecationWarning)

    def _no_iid(ignore):
        return np.empty([0, 2], dtype=str)

    def _no_sid(ignore):
        return np.empty([0], dtype=str)

    def _no_row_property(count):
        return np.empty([count, 0], dtype=str)

    def _nan_pos(count):
        return np.array([[np.nan, np.nan, np.nan]] * count)

    def _blank_val(row_count, col_count):
        return np.empty([row_count, col_count], dtype=np.float64)

    self._row = PstData._fixup_input(iid, empty_creator=_no_iid)
    self._col = PstData._fixup_input(sid, empty_creator=_no_sid)
    self._row_property = PstData._fixup_input(
        None, count=len(self._row), empty_creator=_no_row_property)
    self._col_property = PstData._fixup_input(
        pos, count=len(self._col), empty_creator=_nan_pos)
    self.val = PstData._fixup_input_val(
        val,
        row_count=len(self._row),
        col_count=len(self._col),
        empty_creator=_blank_val)

    self._assert_iid_sid_pos()
    self._name = name or parent_string or ""
    self._std_string_list = []
def test_repr_test(self):
    """Check col lookup on pair-valued col ids and that ``str(pstdata)`` can be called."""
    np.random.seed(0)
    row_property = np.array([[1.0, 2, 2.5], [3, 4, 4.5], [5, 6, 6.5]])
    col_property = np.array([[1.0, 2, 2.5, 1], [3, 4, 4.5, 3]])

    constructor_args = dict(
        row=np.array([[1.0, 2], [3, 4], [5, 6]]),
        col=np.array([("A", "a"), ("B", "b")]),
        val=np.random.normal(.5, 2, size=(3, 2)),
        row_property=row_property,
        col_property=col_property,
    )
    pstdata = PstData(**constructor_args)

    assert pstdata.col_to_index([("B", "b")])[0] == 1
    # Only exercises the string conversion; the text is not inspected.
    s = str(pstdata)
def __init__(self, iid, sid, val, pos=None, name=None, copyinputs_function=None):
    """Normalize iid/sid/pos/val and store them on the instance.

    :param iid: row ids, normalized as a 'str' array (empty default shape (0, 2)).
    :param sid: column ids, normalized as a 'str' array (empty default shape (0,)).
    :param val: values; the empty default has shape (row_count, col_count, 3).
    :param pos: optional column properties; the default is a (count, 3) NaN array.
    :param name: stored as ``self._name`` ("" when falsy).
    :param copyinputs_function: accepted but not used by this constructor.
    """
    #We don't have a 'super(DistData, self).__init__()' here because DistData takes full responsibility for initializing both its superclasses
    self._val = None

    def _no_iid(ignore):
        return np.empty([0, 2], dtype='str')

    def _no_sid(ignore):
        return np.empty([0], dtype='str')

    def _no_row_property(count):
        return np.empty([count, 0], dtype='str')

    def _nan_pos(count):
        return np.full([count, 3], np.nan)

    def _blank_val(row_count, col_count):
        #!!!Replace empty with my FillNA method?
        return np.empty([row_count, col_count, 3], dtype=np.float64)

    self._row = PstData._fixup_input(iid, empty_creator=_no_iid, dtype='str')
    self._col = PstData._fixup_input(sid, empty_creator=_no_sid, dtype='str')
    self._row_property = PstData._fixup_input(
        None, count=len(self._row), empty_creator=_no_row_property, dtype='str')
    self._col_property = PstData._fixup_input(
        pos, count=len(self._col), empty_creator=_nan_pos)
    self._val = PstData._fixup_input_val(
        val,
        row_count=len(self._row),
        col_count=len(self._col),
        empty_creator=_blank_val)

    self._assert_iid_sid_pos(check_val=True)
    self._name = name or ""
    self._std_string_list = []
def test_inputs3(self):
    """Construct a PstData with three rows and no columns, then check lookup and slicing."""
    from pysnptools.pstreader import PstData

    np.random.seed(0)
    pstdata = PstData(row=[[1.0, 2.0], [3, 4], [6, 7]],
                      col=np.array([]),
                      val=[[], [], []],
                      row_property=None,
                      col_property=None,
                      name="test_read")

    assert pstdata.row_to_index([[3, 4]])[0] == 1

    # Properties of a sliced view must match the sliced properties of the whole.
    sliced_row_property = pstdata[1:, :2].row_property
    assert np.array_equal(sliced_row_property, pstdata.row_property[1:])
    sliced_col_property = pstdata[1:, :2].col_property
    assert np.array_equal(sliced_col_property, pstdata.col_property[:2])

    logging.info("done with test")
def __init__(self, iid, sid, val, pos=None, name=None, parent_string=None, copyinputs_function=None):
    """Normalize iid/sid/pos/val and store them on the instance.

    :param iid: row ids; an empty default has shape (0, 2).
    :param sid: column ids; an empty default has shape (0,).
    :param val: values, normalized against the row and column counts.
    :param pos: optional column properties; the default is NaN-filled.
    :param name: stored as ``self._name`` (falls back to ``parent_string``, then "").
    :param parent_string: deprecated alias of ``name``; emits a ``DeprecationWarning``.
    :param copyinputs_function: accepted but not used by this constructor.
    """
    #We don't have a 'super(SnpData, self).__init__()' here because SnpData takes full responsibility for initializing both its superclasses
    self.val = None

    if parent_string is not None:
        warnings.warn("'parent_string' is deprecated. Use 'name'", DeprecationWarning)

    # Row and column ids come first: their lengths size everything else.
    self._row = PstData._fixup_input(
        iid,
        empty_creator=lambda ignore: np.empty([0, 2], dtype=str))
    self._col = PstData._fixup_input(
        sid,
        empty_creator=lambda ignore: np.empty([0], dtype=str))

    n_rows = len(self._row)
    self._row_property = PstData._fixup_input(
        None,
        count=n_rows,
        empty_creator=lambda count: np.empty([count, 0], dtype=str))

    n_cols = len(self._col)
    self._col_property = PstData._fixup_input(
        pos,
        count=n_cols,
        empty_creator=lambda count: np.array([[np.nan, np.nan, np.nan]] * count))

    self.val = PstData._fixup_input_val(
        val,
        row_count=len(self._row),
        col_count=len(self._col),
        empty_creator=lambda row_count, col_count: np.empty(
            [row_count, col_count], dtype=np.float64))

    self._assert_iid_sid_pos()
    self._name = name or parent_string or ""
    self._std_string_list = []
def assert_approx_equal(distdata0, distdata1, atol):
    """Assert that two data objects match.

    ``row``, ``col``, ``row_property`` and ``col_property`` are compared with
    ``PstData._allclose`` (NaNs treated as equal); ``val`` is compared with
    ``np.testing.assert_allclose`` using the absolute tolerance ``atol``.
    """
    from pysnptools.pstreader import PstData

    for attribute in ("row", "col", "row_property", "col_property"):
        left = getattr(distdata0, attribute)
        right = getattr(distdata1, attribute)
        assert PstData._allclose(left, right, equal_nan=True)

    np.testing.assert_allclose(distdata0.val,
                               distdata1.val,
                               atol=atol,
                               equal_nan=True,
                               verbose=True)
def test_every_read(self):
    """Read a reversed/strided subset for every combination of source and
    target order, source and target dtype, value shape and ``force_python_only``,
    and check the dtype, contiguity and contents of the result."""
    orders = ['F', 'C']
    dtypes = [np.float32, np.float64]
    for order_from in orders:
        for order_to in orders:
            for dtype_from in dtypes:
                for dtype_to in dtypes:
                    for val_shape in [None, 1, 3]:
                        for force_python_only in [True, False]:
                            np.random.seed(0)
                            if val_shape is None:
                                size = (3, 2)
                            else:
                                size = (3, 2, val_shape)
                            val0 = np.random.normal(.5, 2, size=size)
                            val = np.array(val0, order=order_from, dtype=dtype_from)

                            pstdata = PstData(val=val,
                                              row=list(range(3)),
                                              col=list(range(2)))
                            expected = np.array(val[::-2, :][:, ::-1],
                                                order=order_to,
                                                dtype=dtype_to)
                            result = pstdata[::-2, ::-1].read(
                                order=order_to,
                                dtype=dtype_to,
                                force_python_only=force_python_only)

                            assert result.val.dtype == dtype_to
                            # order_to is always 'F' or 'C' in this loop.
                            wanted_flag = 'F_CONTIGUOUS' if order_to == 'F' else 'C_CONTIGUOUS'
                            assert result.val.flags[wanted_flag]
                            assert np.array_equal(result.val, expected)
def cmktest_inputs3(self):
    """Three rows, zero columns: check row lookup and that sliced properties agree."""
    from pysnptools.pstreader import PstData

    np.random.seed(0)
    no_row_property = None
    no_col_property = None
    rows = [[1.0, 2.0], [3, 4], [6, 7]]
    pstdata = PstData(row=rows,
                      col=np.array([]),
                      val=[[], [], []],
                      row_property=no_row_property,
                      col_property=no_col_property,
                      name="test_read")

    index_of_3_4 = pstdata.row_to_index([[3, 4]])[0]
    assert index_of_3_4 == 1
    assert np.array_equal(pstdata[1:, :2].row_property,
                          pstdata.row_property[1:])
    assert np.array_equal(pstdata[1:, :2].col_property,
                          pstdata.col_property[:2])
    logging.info("done with test")
def __init__(self, iid, sid, val, pos=None, name=None, parent_string=None, copyinputs_function=None):
    """Normalize iid/sid/pos/val (ids as 'S' byte-string arrays) and store them.

    :param iid: row ids; an empty default has shape (0, 2).
    :param sid: column ids; an empty default has shape (0,).
    :param val: values, normalized against the row and column counts.
    :param pos: optional column properties; the default is NaN-filled.
    :param name: stored as ``self._name`` (falls back to ``parent_string``, then "").
    :param parent_string: deprecated alias of ``name``; emits a ``DeprecationWarning``.
    :param copyinputs_function: accepted but not used by this constructor.
    """
    #We don't have a 'super(SnpData, self).__init__()' here because SnpData takes full responsibility for initializing both its superclasses
    self.val = None
    if parent_string is not None:
        warnings.warn("'parent_string' is deprecated. Use 'name'", DeprecationWarning)

    def _no_iid(ignore):
        return np.empty([0, 2], dtype='S')

    def _no_sid(ignore):
        return np.empty([0], dtype='S')

    def _no_row_property(count):
        return np.empty([count, 0], dtype='S')

    def _nan_pos(count):
        return np.array([[np.nan, np.nan, np.nan]] * count)

    def _blank_val(row_count, col_count):
        return np.empty([row_count, col_count], dtype=np.float64)

    self._row = PstData._fixup_input(iid, empty_creator=_no_iid, dtype='S')
    self._col = PstData._fixup_input(sid, empty_creator=_no_sid, dtype='S')
    self._row_property = PstData._fixup_input(
        None, count=len(self._row), empty_creator=_no_row_property, dtype='S')
    self._col_property = PstData._fixup_input(
        pos, count=len(self._col), empty_creator=_nan_pos)
    self.val = PstData._fixup_input_val(
        val,
        row_count=len(self._row),
        col_count=len(self._col),
        empty_creator=_blank_val)

    self._assert_iid_sid_pos()
    self._name = name or parent_string or ""
    self._std_string_list = []
def test_writes(self):
    """Round-trip PstData through PstNpz and PstHdf5 files.

    For a grid of row/col counts, id generators and property generators, the
    data is written with each writer class, read back (whole and with a
    ``[::2, ::3]`` subset) and compared field by field with the in-memory data.
    """
    #===================================
    #    Defining sub functions
    #===================================
    def _oned_int(c):
        return list(range(c))

    def _oned_str(c):
        return [str(i) for i in range(c)]

    def _twooned_int(c):
        return [[i] for i in range(c)]

    def _twooned_str(c):
        return [[str(i)] for i in range(c)]

    def _twotwod_int(c):
        return [[i, i] for i in range(c)]

    def _twotwod_str(c):
        return [[str(i), "hello"] for i in range(c)]

    def _none(c):
        return None

    def _zero(c):
        return np.empty([c, 0])

    #===================================
    #    Starting main function
    #===================================
    logging.info("starting 'test_writes'")
    np.random.seed(0)
    output_template = "tempdir/pstreader/writes.{0}.{1}"
    create_directory_if_necessary(output_template.format(0, "npz"))
    i = 0
    for row_count in [5, 2, 1, 0]:
        for col_count in [4, 2, 1, 0]:
            val = np.random.normal(.5, 2, size=(row_count, col_count))
            for row_or_col_gen in [_oned_int, _oned_str, _twooned_int, _twooned_str, _twotwod_int, _twotwod_str]:
                row = row_or_col_gen(row_count)
                col = row_or_col_gen(col_count)
                for prop_gen in [_oned_int, _oned_str, _twooned_int, _twooned_str, _twotwod_int, _twotwod_str, _none, _zero]:
                    row_prop = prop_gen(row_count)
                    col_prop = prop_gen(col_count)
                    pstdata = PstData(row, col, val, row_prop, col_prop, str(i))
                    for the_class, suffix in [(PstNpz, "npz"), (PstHdf5, "hdf5")]:
                        filename = output_template.format(i, suffix)
                        logging.info(filename)
                        i += 1
                        the_class.write(filename, pstdata)
                        # np.s_ builds the same slice tuple and does not depend
                        # on the 'sp' alias re-exporting NumPy names.
                        for subsetter in [None, np.s_[::2, ::3]]:
                            reader = the_class(filename)
                            _fortesting_JustCheckExists().input(reader)
                            subreader = reader if subsetter is None else reader[subsetter[0], subsetter[1]]
                            readdata = subreader.read(order='C')
                            expected = pstdata if subsetter is None else pstdata[subsetter[0], subsetter[1]].read()
                            assert np.array_equal(readdata.val, expected.val)
                            assert np.array_equal(readdata.row, expected.row)
                            assert np.array_equal(readdata.col, expected.col)
                            assert np.array_equal(readdata.row_property, expected.row_property)
                            assert np.array_equal(readdata.col_property, expected.col_property)
                        # Best-effort cleanup: ignore file-system errors only,
                        # so KeyboardInterrupt and real bugs still propagate.
                        try:
                            os.remove(filename)
                        except OSError:
                            pass
    logging.info("done with 'test_writes'")
def __init__(self, iid, sid, val, pos=None, name=None, parent_string=None, copyinputs_function=None, xp=None, _require_float32_64=True):
    """Normalize iid/sid/pos/val and store them, along with the array module.

    :param iid: row ids, normalized as a 'str' array (empty default shape (0, 2)).
    :param sid: column ids, normalized as a 'str' array (empty default shape (0,)).
    :param val: values, normalized against the row and column counts.
    :param pos: optional column properties; the default is a (count, 3) NaN array.
    :param name: stored as ``self._name`` (falls back to ``parent_string``, then "").
    :param parent_string: deprecated alias of ``name``; emits a ``DeprecationWarning``.
    :param copyinputs_function: accepted but not used by this constructor.
    :param xp: resolved through ``pstutil.array_module`` and stored as ``self._xp``.
    :param _require_float32_64: forwarded to ``PstData._fixup_input_val``.
    """
    #We don't have a 'super(SnpData, self).__init__()' here because SnpData takes full responsibility for initializing both its superclasses
    xp = pstutil.array_module(xp)
    self._val = None

    if parent_string is not None:
        warnings.warn("'parent_string' is deprecated. Use 'name'", DeprecationWarning)

    def _no_iid(ignore):
        return np.empty([0, 2], dtype='str')

    def _no_sid(ignore):
        return np.empty([0], dtype='str')

    def _no_row_property(count):
        return np.empty([count, 0], dtype='str')

    def _nan_pos(count):
        return np.full([count, 3], np.nan)

    def _blank_val(row_count, col_count):
        return np.empty([row_count, col_count], dtype=np.float64)

    self._row = PstData._fixup_input(iid, empty_creator=_no_iid, dtype='str')
    self._col = PstData._fixup_input(sid, empty_creator=_no_sid, dtype='str')
    self._row_property = PstData._fixup_input(
        None, count=len(self._row), empty_creator=_no_row_property, dtype='str')
    self._col_property = PstData._fixup_input(
        pos, count=len(self._col), empty_creator=_nan_pos)
    self._val = PstData._fixup_input_val(
        val,
        row_count=len(self._row),
        col_count=len(self._col),
        empty_creator=_blank_val,
        _require_float32_64=_require_float32_64,
        xp=xp)

    self._assert_iid_sid_pos(check_val=True)
    self._name = name or parent_string or ""
    self._std_string_list = []
    self._xp = xp
def val(self, new_value):
    """Normalize ``new_value`` against the current row/col counts, store it
    as ``self._val``, then re-run the iid0/iid1 consistency check.

    NOTE(review): presumably the ``val`` property setter; the decorator is
    not visible here.
    """
    def _blank_val(row_count, col_count):
        return np.empty([row_count, col_count], dtype=np.float64)

    n_rows = len(self._row)
    n_cols = len(self._col)
    self._val = PstData._fixup_input_val(new_value,
                                         row_count=n_rows,
                                         col_count=n_cols,
                                         empty_creator=_blank_val)
    self._assert_iid0_iid1(check_val=True)
def cmktest_inputs2(self):
    """One-dimensional row and col ids: check lookups and that sliced properties agree."""
    from pysnptools.pstreader import PstData

    np.random.seed(0)
    no_row_property = None
    no_col_property = None
    pstdata = PstData(row=np.array([1.0, 3, 6]),
                      col=np.array(["Aa", "Bb"]),
                      val=np.random.normal(.5, 2, size=(3, 2)),
                      row_property=no_row_property,
                      col_property=no_col_property,
                      name="test_read")

    row_hit = pstdata.row_to_index([3])[0]
    assert row_hit == 1
    col_hit = pstdata.col_to_index(["Aa"])[0]
    assert col_hit == 0

    assert np.array_equal(pstdata[1:, :2].row_property,
                          pstdata.row_property[1:])
    assert np.array_equal(pstdata[1:, :2].col_property,
                          pstdata.col_property[:2])
    logging.info("done with test")
def test_inputs2(self):
    """Build a PstData from flat row/col id arrays and verify index lookup and slicing."""
    from pysnptools.pstreader import PstData

    np.random.seed(0)
    pstdata = PstData(
        row=np.array([1.0, 3, 6]),
        col=np.array(["Aa", "Bb"]),
        val=np.random.normal(.5, 2, size=(3, 2)),
        row_property=None,
        col_property=None,
        name="test_read",
    )

    assert pstdata.row_to_index([3])[0] == 1
    assert pstdata.col_to_index(["Aa"])[0] == 0

    # Properties of a sliced view must match the sliced properties of the whole.
    sliced_row_property = pstdata[1:, :2].row_property
    assert np.array_equal(sliced_row_property, pstdata.row_property[1:])
    sliced_col_property = pstdata[1:, :2].col_property
    assert np.array_equal(sliced_col_property, pstdata.col_property[:2])

    logging.info("done with test")
def test_read(self):
    """Exercise ``read`` on an in-memory PstData: lookups, empty-row subsets,
    and several combinations of ``order``, ``dtype`` and ``force_python_only``."""
    np.random.seed(0)
    row_property = np.array([[1.0, 2, 2.5], [3, 4, 4.5], [5, 6, 6.5]])
    col_property = np.array([[1.0, 2, 2.5, 1], [3, 4, 4.5, 3]])
    pstdata = PstData(row=np.array([[1.0, 2], [3, 4], [5, 6]]),
                      col=np.array([["A", "a"], ["B", "b"]]),
                      val=np.random.normal(.5, 2, size=(3, 2)),
                      row_property=row_property,
                      col_property=col_property,
                      name="test_read")

    assert pstdata.row_to_index([np.array([3.0, 4])])[0] == 1
    assert pstdata.col_to_index([np.array(["A", "a"])])[0] == 0
    assert np.array_equal(pstdata[1:, :2].row_property, row_property[1:])
    assert np.array_equal(pstdata[1:, :2].col_property, col_property[:2])

    pstdata2 = pstdata[:2, :2].read()
    from pysnptools.kernelreader.test import _fortesting_JustCheckExists
    _fortesting_JustCheckExists().input(pstdata)
    _fortesting_JustCheckExists().input(pstdata2)
    np.testing.assert_array_almost_equal(pstdata2.val, pstdata.val[:2, :2], decimal=10)

    # Selecting no rows keeps the column dimension.
    pstdata3 = pstdata[[], :].read()
    assert pstdata3.val.shape[0] == 0 and pstdata3.val.shape[1] == 2

    # Switch the stored values to Fortran order, then read the same corner
    # under each combination; the flag says whether the expected values
    # must be cast to float32 before comparing.
    pstdata.val = pstdata.val.copy(order='F')
    read_variants = [
        (dict(), False),
        (dict(order='F'), False),
        (dict(order='A'), False),
        (dict(force_python_only=True, dtype=None, order='C'), False),
        (dict(force_python_only=True, dtype='float32', order='C'), True),
        (dict(force_python_only=True, dtype='float32', order=None), True),
        (dict(force_python_only=True, dtype=None, order='F'), False),
    ]
    for read_kwargs, compare_as_float32 in read_variants:
        pstdata2 = pstdata[:2, :2].read(**read_kwargs)
        actual = pstdata2.val
        wanted = pstdata.val[:2, :2]
        if compare_as_float32:
            wanted = wanted.astype(dtype='float32')
        np.testing.assert_array_almost_equal(actual, wanted, decimal=10)

    pstdata4 = pstdata[::, ::].read(force_python_only=True)
    np.testing.assert_array_almost_equal(pstdata4.val, pstdata.val, decimal=10)

    logging.info("done with test")
def __init__(self, filename, count_A1=None, iid=None, sid=None, pos=None, skip_format_check=False):
    """Record the file name and options; optionally pre-set iid, sid and pos.

    :param filename: stored as ``self.filename``.
    :param count_A1: stored as ``self.count_A1``; ``None`` triggers a
        ``FutureWarning`` and is treated as ``False``.
    :param iid: if given, normalized as an 'S' array and stored as ``self._row``.
    :param sid: if given, normalized as an 'S' array and stored as ``self._col``.
    :param pos: if given, normalized (sized by ``len(self._col)``) and stored
        as ``self._col_property``.
    :param skip_format_check: stored as ``self.skip_format_check``.
    """
    #!!!document these new optionals. they are here
    super(Bed, self).__init__()

    self._ran_once = False
    self._file_pointer = None
    self.filename = filename

    if count_A1 is None:
        warnings.warn("'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'", FutureWarning)
        count_A1 = False
    self.count_A1 = count_A1
    self.skip_format_check = skip_format_check

    if iid is not None:
        self._row = PstData._fixup_input(
            iid,
            empty_creator=lambda ignore: np.empty([0, 2], dtype='S'),
            dtype='S')

    if sid is not None:
        self._col = PstData._fixup_input(
            sid,
            empty_creator=lambda ignore: np.empty([0], dtype='S'),
            dtype='S')

    if pos is not None:
        n_cols = len(self._col)
        self._col_property = PstData._fixup_input(
            pos,
            count=n_cols,
            empty_creator=lambda count: np.array([[np.nan, np.nan, np.nan]] * count))
def val(self, new_value):
    """Normalize ``new_value`` against the current row/col counts, store it
    as ``self._val``, then re-run the iid/sid/pos consistency check.

    The empty default has shape (row_count, col_count, 3).
    NOTE(review): presumably the ``val`` property setter; the decorator is
    not visible here.
    """
    def _blank_val(row_count, col_count):
        #!!!Replace empty with my FillNA method?
        return np.empty([row_count, col_count, 3], dtype=np.float64)

    n_rows = len(self._row)
    n_cols = len(self._col)
    self._val = PstData._fixup_input_val(new_value,
                                         row_count=n_rows,
                                         col_count=n_cols,
                                         empty_creator=_blank_val)
    self._assert_iid_sid_pos(check_val=True)
def test_big_npz(self):
    """Write a 999 x 1001 PstData to an npz file and read it back.

    Checks that ``read(order='A')`` gives a C-contiguous array when the
    written values were C-ordered and an F-contiguous array when they were
    F-ordered.
    """
    logging.info("in test_big_npz")
    n = 1000
    pstdata = PstData(row=range(n-1), col=range(n+1), val=np.zeros([n-1, n+1]))
    output = "tempdir/pstreader/big.npz"
    create_directory_if_necessary(output)
    PstNpz.write(output, pstdata)
    pstnpz = PstNpz(output)
    # Subset read; only has to complete without raising.
    pstdata1 = pstnpz[::2, ::4].read()
    pstdata2 = pstnpz.read(order='A')
    assert pstdata2.val.flags['C_CONTIGUOUS']

    pstdata = PstData(row=range(n-1), col=range(n+1), val=np.zeros([n-1, n+1], order='F'))
    PstNpz.write(output, pstdata)
    pstnpz = PstNpz(output)
    pstdata2 = pstnpz.read(order='A')
    # This line used to be a bare expression, so the check never ran.
    assert pstdata2.val.flags['F_CONTIGUOUS']

    print("done")
def test_inputs4(self):
    """A PstData built from all-``None`` inputs must be empty in every dimension."""
    from pysnptools.pstreader import PstData

    pstdata = PstData(row=None,
                      col=None,
                      val=None,
                      row_property=None,
                      col_property=None,
                      name="test_read")

    # Checked in the same order as a single short-circuiting 'and' chain.
    assert pstdata.row_count == 0
    assert pstdata.col_count == 0
    assert pstdata.val.shape[0] == 0
    assert pstdata.val.shape[1] == 0
    assert len(pstdata.row_property) == 0
    assert len(pstdata.col_property) == 0

    logging.info("done with test")
def _empty_inner(self, row, col, filename, row_property, col_property, order, dtype, val_shape):
    """Create ``filename``, write the header arrays, and memory-map the value area.

    The header is a sequence of ``np.save`` records: magic number, format
    name, format version, row, col, row_property, col_property, dtype, order
    and val_shape. The file position after the header is kept as
    ``self._offset`` and used as the memmap offset. The instance is then
    initialized as a PstData over the memmap.
    """
    self._ran_once = True
    self._dtype = np.dtype(dtype)
    self._order = order

    row = PstData._fixup_input(row)
    col = PstData._fixup_input(col)
    row_property = PstData._fixup_input(row_property, count=len(row))
    col_property = PstData._fixup_input(col_property, count=len(col))

    with open(filename, 'wb') as fp:
        header_records = [
            np.array([_magic_number]),
            np.array(["pstmemmap"]),  #name of file format
            np.array([2]),  #file format version
            row,
            col,
            row_property,
            col_property,
            np.array([self._dtype]),
            np.array([self._order]),
            np.array([val_shape]),
        ]
        for record in header_records:
            np.save(fp, record)
        self._offset = fp.tell()

    logging.info("About to start allocating memmap '{0}'".format(filename))
    if val_shape is None:
        shape = (len(row), len(col))
    else:
        shape = (len(row), len(col), val_shape)
    val = np.memmap(filename, offset=self._offset, dtype=dtype, mode="r+", order=order, shape=shape)
    logging.info("Finished allocating memmap '{0}'. Size is {1:,}".format(filename, os.path.getsize(filename)))

    PstData.__init__(self, row, col, val, row_property, col_property,
                     name="np.memmap('{0}')".format(filename))
def __init__(self, iid=None, iid0=None, iid1=None, val=None, name=None, parent_string=None, xp=None):
    """Normalize the ids and values and store them on the instance.

    :param iid: ids used for both rows and columns (the same array object is shared).
    :param iid0: row ids, used when ``iid`` is not given.
    :param iid1: column ids, used when ``iid`` is not given.
    :param val: values, normalized against the row and column counts.
    :param name: stored as ``self._name`` (falls back to ``parent_string``, then "").
    :param parent_string: deprecated alias of ``name``; emits a ``DeprecationWarning``.
    :param xp: resolved through ``pstutil.array_module`` and stored as ``self._xp``.
    """
    #!!!autodoc doesn't generate good doc for this constructor
    #We don't have a 'super(KernelData, self).__init__()' here because KernelData takes full responsibility for initializing both its superclasses
    xp = pstutil.array_module(xp)
    self._val = None

    #!!why does SnpData __init__ have a copy_inputs, but KernelData doesn't?
    neither_iid0_nor_iid1 = iid0 is None and iid1 is None
    assert (iid is None) != neither_iid0_nor_iid1, "Either 'iid' or both 'iid0' 'iid1' must be provided."
    assert name is None or parent_string is None, "Can't set both 'name' and the deprecated 'parent_string'"
    if parent_string is not None:
        warnings.warn("'parent_string' is deprecated. Use 'name'", DeprecationWarning)

    def _as_iid_array(ids):
        return PstData._fixup_input(
            ids,
            empty_creator=lambda ignore: np.empty([0, 2], dtype='str'),
            dtype='str')

    def _no_property(count):
        return np.empty([count, 0], dtype='str')

    def _blank_val(row_count, col_count):
        return xp.empty([row_count, col_count], dtype=xp.float64)

    if iid is None:
        self._row = _as_iid_array(iid0)
        self._col = _as_iid_array(iid1)
    else:
        self._row = _as_iid_array(iid)
        self._col = self._row

    self._row_property = PstData._fixup_input(
        None, count=len(self._row), empty_creator=_no_property, dtype='str')
    self._col_property = PstData._fixup_input(
        None, count=len(self._col), empty_creator=_no_property, dtype='str')
    self._val = PstData._fixup_input_val(
        val,
        row_count=len(self._row),
        col_count=len(self._col),
        empty_creator=_blank_val,
        xp=xp)

    self._assert_iid0_iid1(check_val=True)
    self._name = name or parent_string or ""
    self._std_string_list = []
    self._xp = xp
def test2(self):
    """In-place arithmetic on a PstMemMap's ``val`` must not move its data buffer.

    The test runs from this file's directory and always restores the previous
    working directory, even when an assertion or I/O call fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        filename = "tempdir/x.pst.memmap"
        pstutil.create_directory_if_necessary(filename)

        a = PstMemMap.empty(row=['a', 'b', 'c'], col=['y', 'z'], filename=filename, row_property=['A', 'B', 'C'], order="F", dtype=np.float64)
        b = PstData(row=['a', 'b', 'c'], col=['y', 'z'], val=[[1, 2], [3, 4], [np.nan, 6]], row_property=['A', 'B', 'C'])

        # __array_interface__['data'] is (address, read_only_flag); only the
        # address matters here.
        pointer1, _ = a.val.__array_interface__['data']
        a.val += 1
        a.val += b.val
        pointer2, _ = a.val.__array_interface__['data']
        assert pointer1 == pointer2
    finally:
        # A failure above must not leave later tests in the wrong directory.
        os.chdir(old_dir)
def allclose(self, value, equal_nan=True):
    '''
    Compare this object with ``value`` by delegating to ``PstData.allclose``.

    :param value: Other object with which to compare.
    :type value: :class:`SnpData`
    :param equal_nan: (Default: True) Tells if NaN in :attr:`SnpData.val` should be treated as regular values when testing equality.
    :type equal_nan: bool

    >>> import numpy as np
    >>> snpdata5 = SnpData(iid=[['fam0','iid0'],['fam0','iid1']], sid=['snp334','snp349','snp921'], val=[[0.,2.,0.],[0.,1.,np.nan]], pos=[[0,0,0],[0,0,0],[0,0,0]])
    >>> snpdata6 = SnpData(iid=[['fam0','iid0'],['fam0','iid1']], sid=['snp334','snp349','snp921'], val=[[0.,2.,0.],[0.,1.,np.nan]], pos=[[0,0,0],[0,0,0],[0,0,0]])
    >>> print(snpdata5.allclose(snpdata6)) #True, if we consider the NaN as regular values, all the arrays have the same values.
    True
    >>> print(snpdata5.allclose(snpdata6,equal_nan=False)) #False, if we consider the NaN as special values, all the arrays are not equal.
    False

    '''
    is_close = PstData.allclose(self, value, equal_nan=equal_nan)
    return is_close
def _read_pstdata(self):
    """Parse ``self.filename`` into a PstData.

    The first line is a header: every whitespace-separated token after the
    first is turned into an iid with ``self.extract_iid_function``. Each
    later line holds a sid string followed by one token whose characters are
    the per-iid values ("?" means missing and becomes NaN).
    ``self.extract_sid_pos_function`` turns the sid string into a 4-item
    record; item 1 is used as the column id and items 0, 2 and 3 as the
    column properties.

    :rtype: PstData with iids as rows and sids as columns.
    :raises AssertionError: if a line's value count differs from the iid count.
    """
    missing_char = "?"
    bim_list = []
    val_list_list = []

    with open(self.filename, "r") as fp:
        header = fp.readline()
        iid_string_list = header.strip().split()[1:]
        iid = np.array(
            [self.extract_iid_function(iid_string) for iid_string in iid_string_list],
            dtype='str')

        for line_index, line in enumerate(fp):
            if line_index % 1000 == 0:
                logging.info(
                    "reading sid and iid info from line {0} of file '{1}'".
                    format(line_index, self.filename))
            sid_string_rest = line.strip().split()
            sid_string = sid_string_rest[0]
            # A line with only a sid has no values at all.
            rest = [] if len(sid_string_rest) == 1 else sid_string_rest[1]
            assert len(rest) == len(iid)
            bim_list.append(self.extract_sid_pos_function(sid_string))
            # np.nan, not np.NaN: the capitalized alias was removed in NumPy 2.0.
            val_list_list.append(np.array(
                [np.nan if char == missing_char else float(char) for char in rest]))

    col = np.array([bim[1] for bim in bim_list], dtype='str')
    col_property = np.array([[bim[0], bim[2], bim[3]] for bim in bim_list],
                            dtype=np.float64)

    # Each parsed line becomes one column of the value matrix.
    val = np.zeros((len(iid), len(col)))
    for col_index, column_values in enumerate(val_list_list):
        val[:, col_index] = column_values

    return PstData(iid, col, val, col_property=col_property, name=self.filename)
def test_subset(self):
    """Check that indexing a PstData with negative slices, scalars, index
    lists, boolean lists and strided slices matches NumPy indexing on ``val``."""
    np.random.seed(0)
    row_property = np.array([[1.0, 2, 2.5], [3, 4, 4.5], [5, 6, 6.5]])
    col_property = np.array([[1.0, 2, 2.5, 1], [3, 4, 4.5, 3]])
    val = np.random.normal(.5, 2, size=(3, 2))
    pstdata = PstData(row=np.array([[1.0, 2], [3, 4], [5, 6]]),
                      col=np.array([["A", "a"], ["B", "b"]]),
                      val=val,
                      row_property=row_property,
                      col_property=col_property,
                      name="test_read")

    # Reversed slice over the rows.
    reversed_rows = pstdata[-1:0:-1, :].read().val
    assert np.array_equal(reversed_rows, val[-1:0:-1, :])

    # Negative scalar indices.
    assert pstdata[-1, -1].read().val[0, 0] == val[-1, -1]

    # Index lists select rows and columns independently.
    picked = pstdata[[-1, 0], [-1, 0]].read().val
    assert np.array_equal(picked, val[[-1, 0], :][:, [-1, 0]])

    # Boolean lists.
    masked = pstdata[[True, False, True], [False, True]].read().val
    expected_masked = val[[True, False, True], [False, True]].reshape(2, 1)
    assert np.array_equal(masked, expected_masked)

    # Non-negative scalars and strided slices.
    assert pstdata[0, 0].read().val[0, 0] == val[0, 0]
    assert np.array_equal(pstdata[1::2, 1::2].read().val, val[1::2, 1::2])

    logging.info("done with test")
def __init__(self, iid=None, iid0=None, iid1=None, val=None, name=None, parent_string=None):
    """Normalize the ids and values and store them on the instance.

    :param iid: ids used for both rows and columns (the same array object is shared).
    :param iid0: row ids, used when ``iid`` is not given.
    :param iid1: column ids, used when ``iid`` is not given.
    :param val: values, normalized against the row and column counts.
    :param name: stored as ``self._name`` (falls back to ``parent_string``, then "").
    :param parent_string: deprecated alias of ``name``; emits a ``DeprecationWarning``.
    """
    #!!!autodoc doesn't generate good doc for this constructor
    #!!why does SnpData __init__ have a copy_inputs, but KernelData doesn't?
    neither_iid0_nor_iid1 = iid0 is None and iid1 is None
    assert (iid is None) != neither_iid0_nor_iid1, "Either 'iid' or both 'iid0' 'iid1' must be provided."
    assert name is None or parent_string is None, "Can't set both 'name' and the deprecated 'parent_string'"
    if parent_string is not None:
        warnings.warn("'parent_string' is deprecated. Use 'name'", DeprecationWarning)

    def _as_iid_array(ids):
        return PstData._fixup_input(
            ids, empty_creator=lambda ignore: np.empty([0, 2], dtype=str))

    def _no_property(count):
        return np.empty([count, 0], dtype=str)

    def _blank_val(row_count, col_count):
        return np.empty([row_count, col_count], dtype=np.float64)

    if iid is None:
        self._row = _as_iid_array(iid0)
        self._col = _as_iid_array(iid1)
    else:
        self._row = _as_iid_array(iid)
        self._col = self._row

    self._row_property = PstData._fixup_input(
        None, count=len(self._row), empty_creator=_no_property)
    self._col_property = PstData._fixup_input(
        None, count=len(self._col), empty_creator=_no_property)
    self.val = PstData._fixup_input_val(
        val,
        row_count=len(self._row),
        col_count=len(self._col),
        empty_creator=_blank_val)

    self._assert_iid0_iid1()
    self._name = name or parent_string or ""
    self._std_string_list = []
# Quick manual look at an example memmap file.
fn = '../examples/tiny.pst.memmap'
os.getcwd()
print((os.path.exists(fn)))
pst_mem_map = PstMemMap(fn)
print((pst_mem_map.val[0,1]))

# Disabled scratch code that prints data-buffer addresses before and after
# in-place arithmetic.
# NOTE(review): the extent of this 'if False:' body is inferred from the
# collapsed source line - confirm against the original file.
if False:
    a = np.ndarray([2, 3])
    pointer, read_only_flag = a.__array_interface__['data']
    print(pointer)
    a *= 2
    pointer, read_only_flag = a.__array_interface__['data']
    print(pointer)

    a = PstMemMap.empty(row=['a', 'b', 'c'], col=['y', 'z'], filename=r'c:\deldir\a.memmap', row_property=['A', 'B', 'C'], order="F", dtype=np.float64)
    b = PstData(row=['a', 'b', 'c'], col=['y', 'z'], val=[[1, 2], [3, 4], [np.nan, 6]], row_property=['A', 'B', 'C'])
    pointer, read_only_flag = a.val.__array_interface__['data']
    print(pointer)
    a.val += 1
    a.val += b.val
    pointer, read_only_flag = a.val.__array_interface__['data']
    print(pointer)

# Run the unit tests, stopping at the first failure.
suites = getTestSuite()
r = unittest.TextTestRunner(failfast=True)
ret = r.run(suites)
assert ret.wasSuccessful()

# Then run this module's doctests.
result = doctest.testmod(optionflags=doctest.ELLIPSIS)
assert result.failed == 0, "failed doc test: " + __file__
def read(self, order='F', dtype=np.float64, force_python_only=False, view_ok=False, num_threads=None):
    """Reads the matrix values and returns a :class:`.PstData` (with :attr:`PstData.val` property containing a new ndarray of the matrix values).

    :param order: {'F' (default), 'C', 'A'}, optional -- Specify the order of the ndarray. If order is 'F' (default),
        then the array will be in F-contiguous order (row-index varies the fastest).
        If order is 'C', then the returned array will be in C-contiguous order (col-index varies the fastest).
        If order is 'A', then the :attr:`PstData.val`
        ndarray may be in any order (either C-, Fortran-contiguous).
    :type order: string or None

    :param dtype: {numpy.float64 (default), numpy.float32}, optional -- The data-type for the :attr:`PstData.val` ndarray.
    :type dtype: data-type

    :param force_python_only: optional -- If False (default), may use outside library code. If True, requests that the read
        be done without outside library code.
    :type force_python_only: bool

    :param view_ok: optional -- If False (default), allocates new memory for the :attr:`PstData.val`'s ndarray. If True,
        if practical and reading from a :class:`PstData`, will return a new
        :class:`PstData` whose ndarray shares memory with the original :class:`PstData`.
        Typically, you'll also wish to use "order='A'" to increase the chance that sharing will be possible.
        Use these parameters with care because any change to either ndarray will affect
        the others. Also keep in mind that :meth:`read` relies on ndarray's mechanisms to decide whether to actually
        share memory and so it may ignore your suggestion and allocate a new ndarray anyway.
    :type view_ok: bool

    :param num_threads: optional -- The number of threads with which to read data. Defaults to all available
        processors. Can also be set with these environment variables (listed in priority order):
        'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'.
    :type num_threads: None or int

    :rtype: :class:`.PstData`

    Calling the method again causes the matrix values to be re-read and creates a new in-memory :class:`.PstData` with a new ndarray of matrix values.

    If you request the values for only a subset of the sids or iids, (to the degree practical) only that subset will be read from disk.

    :Example:

    >>> from pysnptools.pstreader import PstHdf5
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> hdf5_file = example_file('pysnptools/examples/toydata.iidmajor.snp.hdf5')
    >>> on_disk = PstHdf5(hdf5_file) # Specify matrix data on disk
    >>> pstdata1 = on_disk.read() # Read all the matrix data returning a PstData instance
    >>> print(type(pstdata1.val).__name__) # The PstData instance contains a ndarray of the data.
    ndarray
    >>> subset_pstdata = on_disk[:,::2].read() # From the disk, read matrix values for every other sid
    >>> print(subset_pstdata.val[0,0]) # Print the first matrix value in the subset
    1.0
    >>> subsub_pstdata = subset_pstdata[:10,:].read(order='A',view_ok=True) # Create an in-memory subset of the subset with matrix values for the first ten iids. Share memory if practical.
    >>> import numpy as np
    >>> # print(np.may_share_memory(subset_snpdata.val, subsub_snpdata.val)) # Do the two ndarray's share memory? They could. Currently they won't.
    """
    # The two leading None arguments are the row and column selections.
    values = self._read(None, None, order, np.dtype(dtype), force_python_only, view_ok, num_threads)

    from pysnptools.pstreader import PstData
    return PstData(self.row,
                   self.col,
                   values,
                   row_property=self.row_property,
                   col_property=self.col_property,
                   name=str(self))
def test_writes(self):
    """Round-trip PstData through PstMemMap, PstHdf5 and PstNpz files.

    For a grid of row/col counts, value shapes, id generators and property
    generators, the data is written with each class and read back (whole and
    with a ``[::2, ::3]`` subset) under every order / ``force_python_only``
    combination, then compared field by field with the in-memory data.
    """
    #===================================
    #    Defining sub functions
    #===================================
    def _oned_int(c):
        return range(c)

    def _oned_str(c):
        return [str(i).encode('ascii') for i in range(c)]

    def _twooned_int(c):
        return [[i] for i in range(c)]

    def _twooned_str(c):
        return [[str(i).encode('ascii')] for i in range(c)]

    def _twotwod_int(c):
        return [[i, i] for i in range(c)]

    def _twotwod_str(c):
        return [[str(i).encode('ascii'), b"hello"] for i in range(c)]

    #def _twotwod_U(c):
    #    return [[str(i).encode('UTF-8'),u"hello"] for i in range(c)]

    def _none(c):
        return None

    def _zero(c):
        return np.empty([c, 0], dtype='S')

    #===================================
    #    Starting main function
    #===================================
    logging.info("starting 'test_writes'")
    np.random.seed(0)
    temp_dir = tempfile.TemporaryDirectory("pstreader")
    output_template = temp_dir.name + '/writes.{0}.{1}'

    #!!!,_twotwod_U can't roundtrop Unicode in hdf5
    id_generators = [
        _oned_int, _oned_str, _twooned_int, _twooned_str, _twotwod_int,
        _twotwod_str
    ]
    #!!!_twotwod_U can't round trip Unicode because Hdf5 doesn't like it.
    property_generators = [
        _none, _oned_str, _oned_int, _twooned_int, _twooned_str,
        _twotwod_int, _twotwod_str, _zero
    ]
    writers = [(PstMemMap, "memmap"), (PstHdf5, "hdf5"), (PstNpz, "npz")]

    i = 0
    for row_count in [5, 2, 1, 0]:
        for col_count in [4, 2, 1, 0]:
            for val_shape in [3, None, 1]:
                if val_shape is None:
                    size = (row_count, col_count)
                else:
                    size = (row_count, col_count, val_shape)
                val = np.random.normal(.5, 2, size=size)

                for row_or_col_gen in id_generators:
                    row = row_or_col_gen(row_count)
                    col = row_or_col_gen(col_count)
                    for prop_gen in property_generators:
                        row_prop = prop_gen(row_count)
                        col_prop = prop_gen(col_count)
                        pstdata = PstData(row, col, val, row_prop, col_prop, str(i))
                        for the_class, suffix in writers:
                            filename = output_template.format(i, suffix)
                            logging.info(filename)
                            i += 1
                            the_class.write(filename, pstdata)
                            if suffix != 'hdf5':
                                reader = the_class(filename)
                            else:
                                reader = the_class(filename, block_size=3)
                            _fortesting_JustCheckExists().input(reader)

                            for subsetter in [None, np.s_[::2, ::3]]:
                                if subsetter is None:
                                    subreader = reader
                                    expected = pstdata
                                else:
                                    subreader = reader[subsetter[0], subsetter[1]]
                                    expected = pstdata[subsetter[0], subsetter[1]].read()

                                for order in ['C', 'F', 'A']:
                                    for force_python_only in [True, False]:
                                        readdata = subreader.read(
                                            order=order,
                                            force_python_only=force_python_only)
                                        assert np.array_equal(readdata.val, expected.val)
                                        assert np.array_equal(readdata.row, expected.row)
                                        assert np.array_equal(readdata.col, expected.col)
                                        # Zero-width property arrays count as
                                        # equal even if array_equal says no.
                                        assert np.array_equal(
                                            readdata.row_property,
                                            expected.row_property) or (
                                                readdata.row_property.shape[1] == 0
                                                and expected.row_property.shape[1] == 0)
                                        assert np.array_equal(
                                            readdata.col_property,
                                            expected.col_property) or (
                                                readdata.col_property.shape[1] == 0
                                                and expected.col_property.shape[1] == 0)

                            if suffix in {'memmap', 'hdf5'}:
                                reader.flush()
                            os.remove(filename)

    temp_dir.cleanup()
    logging.info("done with 'test_writes'")