def f(mixing,K0_val=K0_val,K1_val=K1_val,covar=covar,y=y,**kwargs):
    """Fit an LMM for one mixing weight and return the 'nLL' entry of the fit.

    ``mixing`` may be a plain Python number or a 1-D array holding exactly one
    element; an array is unwrapped to that element before use.

    ``K`` and ``resmin`` come from the enclosing scope. ``K`` is passed to
    ``_mix_from_Ks`` together with ``K0_val``, ``K1_val`` and the weight
    (presumably it is filled in place -- TODO confirm), and is then used as
    the kernel of ``lmm_cov``. ``resmin[0]`` is updated to hold the
    ``findH2`` result with the smallest 'nLL' seen so far.

    Extra keyword arguments are accepted and ignored.

    :return: ``result['nLL']`` of the ``findH2`` search for this weight.
    :raises AssertionError: if a non-scalar ``mixing`` is not a length-1,
        1-D array, or if the resulting 'nLL' is NaN.
    """
    is_plain_number = isinstance(mixing, (int, long, float, complex))
    if not is_plain_number:
        # Callers may hand over a one-element vector; unwrap it to its single value.
        assert mixing.ndim == 1 and mixing.shape[0] == 1
        mixing = mixing[0]

    _mix_from_Ks(K, K0_val, K1_val, mixing)
    model = lmm_cov(X=covar, Y=y, G=None, K=K, inplace=True)
    fit = model.findH2()

    # Remember the best (lowest 'nLL') fit seen so far in the shared one-slot holder.
    best_so_far = resmin[0]
    if best_so_far is None or fit['nLL'] < best_so_far['nLL']:
        resmin[0] = fit

    logging.debug("mixing_from_Ks\t{0}\th2\t{1}\tnLL\t{2}".format(mixing, fit['h2'], fit['nLL']))
    assert not np.isnan(fit['nLL']), "nLL should be a number (not a NaN)"
    return fit['nLL']
def _internal_single(K0, test_snps, pheno, covar, K1, mixing, h2, log_delta, cache_file, force_full_rank, force_low_rank, output_file_name, block_size, interact_with_snp, runner): assert K0 is not None, "real assert" assert K1 is not None, "real assert" assert block_size is not None, "real assert" assert mixing is None or 0.0 <= mixing <= 1.0 if force_full_rank and force_low_rank: raise Exception("Can't force both full rank and low rank") assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified" if log_delta is not None: h2 = 1.0/(np.exp(log_delta)+1) covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))] #view_ok because np.c_ will allocation new memory y = pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values if cache_file is not None and os.path.exists(cache_file): lmm = lmm_cov(X=covar, Y=y, G=None, K=None) with np.load(cache_file) as data: #!! similar code in epistasis lmm.U = data['arr_0'] lmm.S = data['arr_1'] h2 = data['arr_2'][0] mixing = data['arr_2'][1] else: K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=force_full_rank, force_low_rank=force_low_rank,kernel_standardizer=DiagKtoN()) mixing = mixer.mixing if mixer.do_g: lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True) else: lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True) if h2 is None: result = lmm.findH2() h2 = result['h2'] logging.info("h2={0}".format(h2)) if cache_file is not None and not os.path.exists(cache_file): pstutil.create_directory_if_necessary(cache_file) lmm.getSU() np.savez(cache_file, lmm.U,lmm.S,np.array([h2,mixing])) #using np.savez instead of pickle because it seems to be faster to read and write if interact_with_snp is not None: logging.info("interaction with %i" % interact_with_snp) assert 0 <= interact_with_snp and interact_with_snp < covar.shape[1]-1, "interact_with_snp is out 
of range" interact = covar[:,interact_with_snp].copy() interact -=interact.mean() interact /= interact.std() else: interact = None work_count = -(test_snps.sid_count // -block_size) #Find the work count based on batch size (rounding up) # We define three closures, that is, functions define inside function so that the inner function has access to the local variables of the outer function. def debatch_closure(work_index): return test_snps.sid_count * work_index // work_count def mapper_closure(work_index): if work_count > 1: logging.info("single_snp: Working on part {0} of {1}".format(work_index,work_count)) do_work_time = time.time() start = debatch_closure(work_index) end = debatch_closure(work_index+1) snps_read = test_snps[:,start:end].read().standardize() if interact_with_snp is not None: variables_to_test = snps_read.val * interact[:,np.newaxis] else: variables_to_test = snps_read.val res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test) beta = res['beta'] chi2stats = beta*beta/res['variance_beta'] #p_values = stats.chi2.sf(chi2stats,1)[:,0] assert test_snps.iid_count == lmm.U.shape[0] p_values = stats.f.sf(chi2stats,1,lmm.U.shape[0]-3)[:,0]#note that G.shape is the number of individuals and 3 is the number of fixed effects (covariates+SNP) dataframe = _create_dataframe(snps_read.sid_count) dataframe['sid_index'] = np.arange(start,end) dataframe['SNP'] = snps_read.sid dataframe['Chr'] = snps_read.pos[:,0] dataframe['GenDist'] = snps_read.pos[:,1] dataframe['ChrPos'] = snps_read.pos[:,2] dataframe['PValue'] = p_values dataframe['SnpWeight'] = beta[:,0] dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0]) dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0]) dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2 logging.info("time={0}".format(time.time()-do_work_time)) #logging.info(dataframe) return dataframe def 
reducer_closure(result_sequence): if output_file_name is not None: create_directory_if_necessary(output_file_name) frame = pd.concat(result_sequence) frame.sort_values(by="PValue", inplace=True) frame.index = np.arange(len(frame)) if output_file_name is not None: frame.to_csv(output_file_name, sep="\t", index=False) return frame frame = map_reduce(xrange(work_count), mapper=mapper_closure,reducer=reducer_closure, input_files=[test_snps],output_files=[output_file_name], name="single_snp(output_file={0})".format(output_file_name), runner=runner) return frame
def _internal_single(K0, test_snps, pheno, covar, K1, mixing, h2, log_delta, cache_file, force_full_rank, force_low_rank, output_file_name, block_size, interact_with_snp, runner): assert K0 is not None, "real assert" assert K1 is not None, "real assert" assert block_size is not None, "real assert" assert mixing is None or 0.0 <= mixing <= 1.0 if force_full_rank and force_low_rank: raise Exception("Can't force both full rank and low rank") assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified" if log_delta is not None: h2 = 1.0/(np.exp(log_delta)+1) covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))] #view_ok because np.c_ will allocation new memory y = pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values if cache_file is not None and os.path.exists(cache_file): lmm = lmm_cov(X=covar, Y=y, G=None, K=None) with np.load(cache_file) as data: #!! similar code in epistasis lmm.U = data['arr_0'] lmm.S = data['arr_1'] h2 = data['arr_2'][0] mixing = data['arr_2'][1] else: K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=force_full_rank, force_low_rank=force_low_rank,kernel_standardizer=DiagKtoN()) mixing = mixer.mixing if mixer.do_g: lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True) else: #print(covar.sum(),y.sum(),K.val.sum(),covar[0],y[0],K.val[0,0]) lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True) if h2 is None: result = lmm.findH2() h2 = result['h2'] logging.info("h2={0}".format(h2)) if cache_file is not None and not os.path.exists(cache_file): pstutil.create_directory_if_necessary(cache_file) lmm.getSU() np.savez(cache_file, lmm.U,lmm.S,np.array([h2,mixing])) #using np.savez instead of pickle because it seems to be faster to read and write if interact_with_snp is not None: logging.info("interaction with %i" % interact_with_snp) assert 0 <= interact_with_snp and 
interact_with_snp < covar.shape[1]-1, "interact_with_snp is out of range" interact = covar[:,interact_with_snp].copy() interact -=interact.mean() interact /= interact.std() else: interact = None work_count = -(test_snps.sid_count // -block_size) #Find the work count based on batch size (rounding up) # We define three closures, that is, functions define inside function so that the inner function has access to the local variables of the outer function. def debatch_closure(work_index): return test_snps.sid_count * work_index // work_count def mapper_closure(work_index): if work_count > 1: logging.info("single_snp: Working on snp block {0} of {1}".format(work_index,work_count)) do_work_time = time.time() start = debatch_closure(work_index) end = debatch_closure(work_index+1) snps_read = test_snps[:,start:end].read().standardize() if interact_with_snp is not None: variables_to_test = snps_read.val * interact[:,np.newaxis] else: variables_to_test = snps_read.val res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test) beta = res['beta'] chi2stats = beta*beta/res['variance_beta'] #p_values = stats.chi2.sf(chi2stats,1)[:,0] assert test_snps.iid_count == lmm.U.shape[0] p_values = stats.f.sf(chi2stats,1,lmm.U.shape[0]-(lmm.linreg.D+1))[:,0]#note that G.shape is the number of individuals# dataframe = _create_dataframe(snps_read.sid_count) dataframe['sid_index'] = np.arange(start,end) dataframe['SNP'] = snps_read.sid dataframe['Chr'] = snps_read.pos[:,0] dataframe['GenDist'] = snps_read.pos[:,1] dataframe['ChrPos'] = snps_read.pos[:,2] dataframe['PValue'] = p_values dataframe['SnpWeight'] = beta[:,0] dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0]) dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0]) dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2 logging.info("time={0}".format(time.time()-do_work_time)) #logging.info(dataframe) 
return dataframe def reducer_closure(result_sequence): if output_file_name is not None: create_directory_if_necessary(output_file_name) frame = pd.concat(result_sequence) frame.sort_values(by="PValue", inplace=True) frame.index = np.arange(len(frame)) if output_file_name is not None: frame.to_csv(output_file_name, sep="\t", index=False) return frame frame = map_reduce(xrange(work_count), mapper=mapper_closure,reducer=reducer_closure, input_files=[test_snps],output_files=[output_file_name], name="single_snp(output_file={0})".format(output_file_name), runner=runner) return frame