def test_gensmall(self):
    # Smoke test: run snp_gen over a grid of small (including zero) sizes and
    # check that it does not raise and that the reported sizes and positions
    # stay within the requested bounds.
    iid_counts = [10, 5, 3, 2, 1, 0]
    sid_counts = [0, 10, 5, 3, 2, 1]
    chr_counts = [30, 10, 5, 3, 2, 1, 0]

    for iid_count in iid_counts:
        for sid_count in sid_counts:
            for chr_count in chr_counts:
                # SNPs requested but no chromosomes: skip just this
                # combination (continue, not break).
                if sid_count > 0 and chr_count == 0:
                    continue

                logging.debug("{0}, {1}, {2}".format(iid_count, sid_count, chr_count))
                generated = snp_gen(
                    fst=.1,
                    dfr=.5,
                    iid_count=iid_count,
                    sid_count=sid_count,
                    maf_low=.05,
                    seed=6,
                    chr_count=chr_count,
                )

                assert generated.iid_count <= iid_count
                assert generated.sid_count == sid_count

                # With no positions there is nothing further to bound; this
                # also keeps the division below away from chr_count == 0.
                if len(generated.pos) == 0:
                    continue

                # Column 0 of pos is compared against the chromosome count.
                assert max(generated.pos[:, 0]) <= chr_count

                # Columns 1 and 2 share the same upper bound: the number of
                # SNPs per chromosome, rounded up, and never less than 1.
                per_chrom_limit = int(max(1, np.ceil(float(sid_count) / chr_count)))
                assert max(generated.pos[:, 1]) <= per_chrom_limit
                assert max(generated.pos[:, 2]) <= per_chrom_limit
def gen_and_compare(self, output_file, **kwargs):
    """Generate SNP data and check it against a stored reference file.

    :param output_file: file name, relative to ``<currentFolder>/expected/``,
        of the Bed file holding the reference data.
    :param kwargs: forwarded unchanged to ``snp_gen``.
    :returns: the freshly generated SNP data (only reached when it matches
        the reference according to ``TestSnpGen.is_same``).
    """
    generated = snp_gen(**kwargs)

    # Debugging aid, normally left commented out: write the generated data
    # under tempdir/ so it can be inspected.
    # pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file,isfile=True)
    # Bed.write(generated, self.currentFolder + "/tempdir/" + output_file)

    expected_path = self.currentFolder + "/expected/" + output_file
    expected = Bed(expected_path).read()

    assert TestSnpGen.is_same(generated, expected), "Failure on " + output_file
    return generated
def test_gensmall(self):
    # Smoke test: exercise snp_gen on every combination of a few small
    # individual, SNP and chromosome counts (zero included) and verify the
    # result respects the requested sizes.
    for iid_count in (10, 5, 3, 2, 1, 0):
        for sid_count in (0, 10, 5, 3, 2, 1):
            for chr_count in (30, 10, 5, 3, 2, 1, 0):
                needs_chromosomes = sid_count > 0
                if needs_chromosomes and chr_count == 0:
                    # Skip only this combination (continue, not break).
                    continue

                logging.debug("{0}, {1}, {2}".format(iid_count, sid_count, chr_count))
                snpdata = snp_gen(fst=.1, dfr=.5,
                                  iid_count=iid_count, sid_count=sid_count,
                                  maf_low=.05, seed=6, chr_count=chr_count)

                assert snpdata.iid_count <= iid_count
                assert snpdata.sid_count == sid_count

                if len(snpdata.pos) == 0:
                    # Nothing to bound; also avoids dividing by chr_count == 0.
                    continue

                assert max(snpdata.pos[:, 0]) <= chr_count

                # Upper bound shared by pos columns 1 and 2: SNPs per
                # chromosome, rounded up, with a floor of 1.
                upper = int(max(1, np.ceil(float(sid_count) / chr_count)))
                for column in (1, 2):
                    assert max(snpdata.pos[:, column]) <= upper
def gen_and_compare(self, output_file, **kwargs):
    """Run ``snp_gen(**kwargs)`` and assert the result equals the reference data.

    The reference is read with ``Bed`` from ``<currentFolder>/expected/<output_file>``
    and compared using ``TestSnpGen.is_same``.

    :param output_file: name of the reference Bed file under ``expected/``.
    :param kwargs: keyword arguments passed straight through to ``snp_gen``.
    :returns: the generated SNP data.
    """
    actual = snp_gen(**kwargs)

    # Left disabled on purpose; enable to dump the generated data under tempdir/.
    # pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file,isfile=True)
    # Bed.write(actual, self.currentFolder + "/tempdir/" + output_file)

    reference_reader = Bed(self.currentFolder + "/expected/" + output_file)
    reference = reference_reader.read()

    matches = TestSnpGen.is_same(actual, reference)
    assert matches, "Failure on " + output_file
    return actual
def generate_discrete_ascertained(prevalence, iid_count, snp_args, phenotype_args, seed=0):
    """
    Generate discrete ascertained data. Internally, cases are generated at the requested
    prevalence. Before returning, however, the controls are randomly sampled so that in the
    returned data, case and control have the same number of examples.

    :param prevalence: Prior probability of a case, e.g. .1
    :type prevalence: a float greater than 0.0 and at most 0.5

    :param iid_count: The number of examples desired in the returned data. Because of rounding
        during data generation the actual number may be lower. If this happens, a warning is logged.
    :type iid_count: int

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`.
        Do not include 'iid_count' or 'seed'
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`.
        Do not include 'snp_data' or 'seed'
    :type phenotype_args: dictionary

    :param seed: a random seed to control random number generation
    :type seed: int

    :rtype: a :class:`pysnptools.snpreader.SnpData` of genotype data and a nparray of 0,1 phenotype values.

    :Example:

    >>> snp_args = {"fst":.1,"dfr":.5,"sid_count":200,"maf_low":.05}
    >>> phenotype_args = {"causals":10,"genetic_var":0.5, "noise_var":0.5}
    >>> snps,pheno = generate_discrete_ascertained(prevalence=.1,iid_count=100,seed=5,snp_args=snp_args,phenotype_args=phenotype_args)
    >>> (int(snps.val.shape[0]), int(snps.val.shape[1]), int(len(pheno)))
    (98, 200, 98)
    """
    assert 0 < prevalence and prevalence <= .5, "Expect prevalence to be between 0.0 (exclusive) and .5 (inclusive)"
    assert int(iid_count) == iid_count and iid_count >= 0, "Expect iid_count to be a non-negative integer"

    # Generate more examples than we ultimately want: enough that the
    # `prevalence` fraction of them is about half of iid_count.
    iid_count2 = int(float(iid_count) / 2.0 / prevalence)
    from GWAS_benchmark import snp_gen
    snp2 = snp_gen(iid_count=iid_count2, seed=seed, **snp_args)
    pheno2 = generate_phenotype(snp_data=snp2, seed=seed, **phenotype_args)

    # Order the individuals by ascending pheno2 value.
    snps2_sorted = snp2[pheno2.argsort(), :]

    # Cases are the top iid_count*prevalence individuals (the end of the sorted
    # order); controls are a random sample, of the same size, from the rest.
    case_count = int(snps2_sorted.iid_count * prevalence)
    # e.g. if case_count is 3, then -1,-2,-3. np.arange keeps this an integer
    # array on both Python 2 and 3, even when case_count is 0.
    case_index = np.arange(-1, -(case_count + 1), -1)
    control_count = case_count

    if control_count + case_count != iid_count:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("iid_count is {0} instead of {1} because of rounding".format(control_count + case_count, iid_count))

    if seed is not None:
        # sys.maxint exists only on Python 2; fall back to sys.maxsize elsewhere
        # so the seed handling is unchanged on Python 2 and works on Python 3.
        # NOTE(review): np.random.seed is documented to accept values below
        # 2**32 only; this modulus does not guarantee that on 64-bit builds.
        max_int = getattr(sys, "maxint", sys.maxsize)
        np.random.seed(int(seed % max_int))

    # The "if..else" is a workaround because the Linux version of
    # np.random.choice doesn't like to select zero items from an empty list.
    # We need to call random in either case so that the random seed ends up in
    # the expected state.
    control_pool_size = snps2_sorted.iid_count - case_count if control_count > 0 else 1
    control_index = np.random.choice(np.arange(control_pool_size), control_count, replace=False)

    # Controls come first, cases last; the phenotype vector mirrors that layout.
    snp_final = snps2_sorted[np.concatenate((control_index, case_index)), :].read()
    pheno_final = np.zeros(control_count + case_count)
    pheno_final[control_count:] = 1

    return snp_final, pheno_final
def generate_discrete_ascertained(prevalence, iid_count, snp_args, phenotype_args, seed=0):
    """
    Generate discrete ascertained data. Internally, cases are generated at the requested
    prevalence. Before returning, however, the controls are randomly sampled so that in the
    returned data, case and control have the same number of examples.

    :param prevalence: Prior probability of a case, e.g. .1
    :type prevalence: a float greater than 0.0 and at most 0.5

    :param iid_count: The number of examples desired in the returned data. Because of rounding
        during data generation the actual number may be lower. If this happens, a warning is logged.
    :type iid_count: int

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`.
        Do not include 'iid_count' or 'seed'
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`.
        Do not include 'snp_data' or 'seed'
    :type phenotype_args: dictionary

    :param seed: a random seed to control random number generation
    :type seed: int

    :rtype: a :class:`pysnptools.snpreader.SnpData` of genotype data and a nparray of 0,1 phenotype values.

    :Example:

    >>> snp_args = {"fst":.1,"dfr":.5,"sid_count":200,"maf_low":.05}
    >>> phenotype_args = {"causals":10,"genetic_var":0.5, "noise_var":0.5}
    >>> snps,pheno = generate_discrete_ascertained(prevalence=.1,iid_count=100,seed=5,snp_args=snp_args,phenotype_args=phenotype_args)
    >>> (int(snps.val.shape[0]), int(snps.val.shape[1]), int(len(pheno)))
    (98, 200, 98)
    """
    assert 0 < prevalence and prevalence <= .5, "Expect prevalence to be between 0.0 (exclusive) and .5 (inclusive)"
    assert int(
        iid_count
    ) == iid_count and iid_count >= 0, "Expect iid_count to be a non-negative integer"

    # Generate more examples than we ultimately want, so that the requested
    # prevalence of them amounts to roughly half of iid_count.
    iid_count2 = int(float(iid_count) / 2.0 / prevalence)
    from GWAS_benchmark import snp_gen
    snp2 = snp_gen(iid_count=iid_count2, seed=seed, **snp_args)
    pheno2 = generate_phenotype(snp_data=snp2, seed=seed, **phenotype_args)

    # Sort the individuals by ascending pheno2 value.
    snps2_sorted = snp2[pheno2.argsort(), :]

    # We want the top iid_count*prevalence individuals for cases
    # and a random sample, of the same size, from the rest for controls.
    case_count = int(snps2_sorted.iid_count * prevalence)
    # e.g. if case_count is 3, then -1,-2,-3; always an integer array,
    # including the empty case, on both Python 2 and Python 3.
    case_index = np.arange(-1, -(case_count + 1), -1)
    control_count = case_count

    if control_count + case_count != iid_count:
        # logging.warning replaces the deprecated logging.warn alias.
        logging.warning(
            "iid_count is {0} instead of {1} because of rounding".format(
                control_count + case_count, iid_count))

    if seed is not None:
        # sys.maxint is Python 2 only; use sys.maxsize where it is missing so
        # Python 2 results are unchanged and Python 3 does not fail here.
        # NOTE(review): np.random.seed is documented to accept values below
        # 2**32 only; this modulus does not guarantee that on 64-bit builds.
        seed_modulus = getattr(sys, "maxint", sys.maxsize)
        np.random.seed(int(seed % seed_modulus))

    # The "if..else" is a workaround because the Linux version of
    # np.random.choice doesn't like to select zero items from an empty list.
    # We need to call random in either case so that the random seed ends up
    # in the expected state.
    control_index = np.random.choice(
        np.arange(snps2_sorted.iid_count - case_count if control_count > 0 else 1),
        control_count,
        replace=False)

    # Controls first, then cases; pheno_final uses the same ordering.
    snp_final = snps2_sorted[np.concatenate(
        (control_index, case_index)), :].read()
    pheno_final = np.zeros(control_count + case_count)
    pheno_final[control_count:] = 1

    return snp_final, pheno_final