def randomsample_qtl(null, n_jobs, rep=20000):
    """Run ``ransample_qtl`` ``rep`` times in a process pool and sort the results.

    Args:
        null: Object passed to every worker through the pool initializer
            ``init_child``; its structure is defined by the caller.
        n_jobs: Number of worker processes. It is also used to size the
            chunks of work handed to each worker.
        rep: Number of replicates; ``ransample_qtl`` is called once for each
            value in ``range(rep)``.

    Returns:
        list: The values returned by ``ransample_qtl``, in ascending order.
    """
    screen_output.run_out('random sampling...')
    # Size the chunks from the ``n_jobs`` argument rather than a module-level
    # ``args`` object, and never go below 1: ``rep // n_jobs`` is 0 whenever
    # rep < n_jobs, which is not a valid chunksize for ``imap``.
    chunk_size = max(1, rep // n_jobs)
    ##multiple process####
    p = Pool(processes=n_jobs, initializer=init_child, initargs=(null,))
    result = p.imap(ransample_qtl, range(rep), chunksize=chunk_size)
    # The imap iterator buffers results, so it can be drained after the
    # workers have been joined.
    p.close()
    p.join()
    return sorted(result)
def null_perfect(self, n_jobs, perfect_max, update_null=True, verbose=1):
    """Build the null IBD distribution using the perfect-data approximation.

    Every inheritance-vector index in ``range(2 ** (2 * len(self.family.invnf)))``
    is evaluated with ``null_inv`` in a process pool; when there are more than
    ``perfect_max`` indices, a random sample of ``perfect_max`` of them is
    evaluated instead.

    Args:
        n_jobs: Number of worker processes.
        perfect_max: Maximum number of inheritance-vector indices to evaluate.
        update_null: When true, ``self.null_ibd`` and the summary statistics
            (``null_mean``, ``null_std``, ``sall_null_mean``,
            ``sall_null_std``) are overwritten.
        verbose: A progress message is printed when this equals 1.

    Side effects:
        ``self.pinv_sall`` is always rebuilt: entry ``inv`` holds the second
        value returned by ``null_inv`` for that index (0 if not evaluated).

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; the summary statistics are assumed to sit inside
    ``if update_null`` -- confirm against the original layout.
    """
    if verbose == 1:
        screen_output.run_out(
            "calculating null distribution using perfect data approximation..."
        )
    # The pool is created before sampling, exactly as in the original order.
    pool = Pool(processes=n_jobs)
    total_inv = pow(2, 2 * len(self.family.invnf))
    if total_inv > perfect_max:
        chosen_inv = random.sample(xrange(total_inv), perfect_max)
    else:
        chosen_inv = range(total_inv)
    self.pinv_sall = []

    # Spread the work evenly, with a minimum chunk of one task.
    n_tasks = len(chosen_inv)
    if n_tasks >= n_jobs:
        per_chunk = n_tasks // n_jobs
    else:
        per_chunk = 1
    tasks = product(chosen_inv, [(self.family, self)])
    outcome = pool.imap(null_inv, tasks, chunksize=per_chunk)
    pool.close()
    pool.join()

    # Round both IBD statistics to 9 decimals and remember which index
    # produced each record.
    null_ibd = []
    inv_ids = []
    for record in outcome:
        pair_value = float('%.9f' % record[0])
        all_value = float('%.9f' % record[1])
        null_ibd.append((pair_value, all_value))
        inv_ids.append(int(record[2]))

    if self.pinv_sall == []:
        self.pinv_sall = [0] * total_inv
    for inv, entry in zip(inv_ids, null_ibd):
        self.pinv_sall[inv] = entry[1]

    if update_null:
        self.null_ibd = null_ibd
        #expectation and standard deviation
        pair_values = [entry[0] for entry in null_ibd]
        all_values = [entry[1] for entry in null_ibd]
        self.null_mean = float('%.9f' % (sum(pair_values) / len(pair_values)))
        self.null_std = self.std(pair_values, self.null_mean, False)
        self.sall_null_mean = float('%.9f' % (sum(all_values) / len(all_values)))
        self.sall_null_std = self.std(all_values, self.sall_null_mean, False)
def null_permute(self, n_jobs, perfect_max, conditional_prob=None):
    """Compute expected pairwise IBD and its second moments under the null.

    For every founder genotype configuration (weighted by its probability),
    inheritance-vector indices are enumerated -- or randomly sampled when
    there are too many -- and evaluated with ``null_generator_pairibd`` in a
    process pool. The results are accumulated into:

    * ``self.family.expect_pair_ibd[i]``: weighted mean of the value for
      pair ``i``;
    * ``self.family.prior[i][j]``: weighted mean of the product of the values
      for pairs ``i`` and ``j`` (kept symmetric).

    Args:
        n_jobs: Number of worker processes.
        perfect_max: Total budget of inheritance vectors to evaluate; it is
            divided evenly among the founder genotype configurations.
        conditional_prob: Optional conditional genotype probabilities. When
            omitted (``None`` or ``[]``), ``self.family.conditional_prob`` is
            used. ``conditional_prob['~combined']`` is read as a pair
            ``(genotype -> probability mapping, list of individual ids)``.
    """
    ##get pairwise IBD under each prior inheritance vector while maintain founder genotype
    screen_output.run_out(
        "calculating theoretical null distribution for rvibd...")
    paircount = len(self.family.pairs)
    self.family.expect_pair_ibd = [0 for x in range(paircount)]
    self.family.prior = [[0 for y in range(paircount)]
                         for x in range(paircount)]
    rep = pow(2, 2 * len(self.family.nonfounder))

    #Get all possible parental genotype configurations
    foundergt_list = []
    foundergt_dic = {}
    if conditional_prob is None or conditional_prob == []:
        conditional_prob = self.family.conditional_prob
    for f in self.family.founder:
        foundergt_list += self.family.fam_dict[f]['gt'][
            2 * self.family.mid:2 * self.family.mid + 2]
    try:
        #if there are missing
        gt_id = conditional_prob['~combined'][1]
        missing_founders = set(self.family.missing_all) & set(
            self.family.founder)
        for gt, prob in conditional_prob['~combined'][0].iteritems():
            # Overwrite the alleles of each missing founder with the ones
            # encoded in this configuration ('a' -> 1, 'b' -> 2, ...).
            for f in missing_founders:
                f_idx = self.family.founder.index(f)
                idx = gt_id.index(f)
                foundergt_list[2 * f_idx:2 * f_idx + 2] = [
                    ord(x) - 96 for x in gt[2 * idx:2 * idx + 2]
                ]
            # Allele order within a founder does not matter, so sort each
            # founder's pair to merge equivalent configurations.
            key_fgt = []
            for i in range(len(self.family.founder)):
                key_fgt.extend(sorted(foundergt_list[2 * i:2 * i + 2]))
            key_fgt = tuple(key_fgt)
            if key_fgt not in foundergt_dic:
                foundergt_dic[key_fgt] = prob
            else:
                foundergt_dic[key_fgt] += prob
    except Exception:
        # Best-effort fallback kept from the original code (e.g. there is no
        # '~combined' entry): use the observed founder genotypes with
        # weight 1. Unlike a bare ``except:``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        foundergt_dic[tuple(foundergt_list)] = 1

    # Per-configuration budget. Clamp to 1 so that a small ``perfect_max``
    # cannot produce an empty sample (which previously ended in an
    # IndexError / ZeroDivisionError further down).
    local_perfect_max = max(1, int(perfect_max / len(foundergt_dic)))

    p = Pool(processes=n_jobs)
    try:
        for fgt, prob in foundergt_dic.iteritems():
            #For each possible parental genotypes
            #Calculate corresponding IBD given inheritance vector
            founder_gt = list(fgt)
            if len(set(fgt)) == 1:
                # All founder alleles are identical: one evaluation suffices.
                full_permutation_inv = [(0, founder_gt)]
            else:
                if rep > local_perfect_max:
                    inv_ids = random.sample(xrange(rep), local_perfect_max)
                else:
                    inv_ids = xrange(rep)
                full_permutation_inv = [(inv, founder_gt) for inv in inv_ids]
            len_total = len(full_permutation_inv)
            csize = max(1, len_total // n_jobs)
            result = p.imap(null_generator_pairibd,
                            [(i, self) for i in full_permutation_inv],
                            chunksize=csize)
            pair_ibd = [r for r in result]
            for tmp_pair_ibd in set(pair_ibd):
                count = pair_ibd.count(tmp_pair_ibd)
                # ``1.0`` forces true division: under Python 2 without
                # ``from __future__ import division`` the original
                # ``1 / len_total`` evaluates to 0 and zeroes every weight.
                tmp_prob = 1.0 / len_total * count * prob
                for pair_idx in range(paircount):
                    t_ibd = tmp_pair_ibd[pair_idx] * tmp_prob
                    if t_ibd != 0:
                        self.family.expect_pair_ibd[pair_idx] += t_ibd
                        for pair_jdx in range(pair_idx, paircount):
                            self.family.prior[pair_idx][
                                pair_jdx] += t_ibd * tmp_pair_ibd[pair_jdx]
                            # Mirror into the lower triangle.
                            self.family.prior[pair_jdx][
                                pair_idx] = self.family.prior[pair_idx][
                                    pair_jdx]
    finally:
        # Always release the worker processes, even if a task raised.
        p.close()
        p.join()
def null_perfect_rvibd(self, n_jobs, perfect_max, sall_flag=False, infer_flag=2, verbose=1):
    """Null distribution of IBDs under perfect data, mixed over founder genotypes.

    Steps performed by this method:

    1. Build ``foundergt_dic``, mapping each founder genotype configuration
       to its probability. Configurations come from
       ``self.family.conditional_prob['~combined']``; if reading it fails
       for any reason, the observed founder genotypes are used with
       weight 1.
    2. For each configuration, queue ``(inheritance vector index, founder
       genotypes)`` tasks (all of them, or a random sample of
       ``local_perfect_max``) and start ``n_jobs`` ``myProcess`` workers that
       share ``null_ibd`` and the caches.
    3. Combine the per-configuration means, variances and distributions,
       weighting each by the configuration probability.

    Args:
        n_jobs: Number of ``myProcess`` workers started per configuration.
        perfect_max: Total task budget, divided by the number of
            configurations (and capped at 5 per configuration when
            ``self.family.simple`` is false).
        sall_flag: When true, statistics for the second element of each
            ``null_ibd`` record are computed as well.
        infer_flag: Passed through to ``myProcess`` unchanged.
        verbose: A progress message is printed when this equals 1.

    Sets ``self.dist_s``, ``self.all_ibd``, ``self.null_mean``,
    ``self.sall_null_mean``, ``self.null_std`` and ``self.sall_null_std``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; confirm against the original layout.
    """
    if verbose == 1:
        screen_output.run_out(
            "calculating theoretical null distribution for rvibd...")
    # Number of possible inheritance-vector indices.
    rep = pow(2, 2 * len(self.family.nonfounder))
    #Get all possible parental genotype configurations
    foundergt_list = []
    foundergt_dic = {}
    for f in self.family.founder:
        foundergt_list += self.family.fam_dict[f]['gt'][2 * self.family.mid:2 * self.family.mid + 2]
    try:
        #if there are missing
        gt_id = self.family.conditional_prob['~combined'][1]
        for gt, prob in self.family.conditional_prob['~combined'][
                0].iteritems():
            # Replace the alleles of founders listed in missing_all with the
            # ones encoded in this configuration ('a' -> 1, 'b' -> 2, ...).
            for f in set(self.family.missing_all) & set(
                    self.family.founder):
                f_idx = self.family.founder.index(f)
                idx = gt_id.index(f)
                foundergt_list[2 * f_idx:2 * f_idx + 2] = [
                    ord(x) - 96 for x in gt[2 * idx:2 * idx + 2]
                ]
            # Sort each founder's allele pair so equivalent configurations
            # share one key and their probabilities add up.
            key_fgt = []
            for i in range(len(self.family.founder)):
                key_fgt.extend(sorted(foundergt_list[2 * i:2 * i + 2]))
            key_fgt = tuple(key_fgt)
            if key_fgt not in foundergt_dic:
                foundergt_dic[key_fgt] = prob
            else:
                foundergt_dic[key_fgt] += prob
    except:
        # NOTE(review): bare except -- any failure above (including a missing
        # '~combined' key) falls back to the observed founder genotypes.
        foundergt_dic[tuple(foundergt_list)] = 1
    mean_pair, mean_all, var_pair, var_all = 0, 0, 0, 0
    # State handed to the worker processes (manager proxies / ctypes arrays).
    all_ibd = manager.dict(self.all_ibd)
    if self.pinv_sall == []:
        pinv_sall = Array(
            c_double,
            [0 for x in xrange(pow(2, 2 * len(self.family.invnf)))])
    else:
        pinv_sall = (c_double * len(self.pinv_sall))(*self.pinv_sall)
    pinv_key_dict = manager.dict(self.pinv_key_dict)
    pinv_pair_dict = manager.dict(self.pinv_pair_dict)
    combined_dist = {}
    prob_and_mean = []
    # Task budget per founder genotype configuration.
    if self.family.simple:
        local_perfect_max = int(perfect_max / len(foundergt_dic.keys()))
    else:
        local_perfect_max = min(
            int(perfect_max / len(foundergt_dic.keys())), 5)
    for fgt, prob in foundergt_dic.iteritems():
        #For each possible parental genotypes
        #Calculate corresponding IBD given inheritance vector
        sample_flag = False
        full_permutation = [list(fgt)]
        if len(set(fgt)) == 1:
            # All founder alleles identical: a single task is queued.
            full_permutation_inv_raw = [(0, list(fgt))]
        else:
            full_permutation_inv_raw = list(
                product(xrange(rep), full_permutation))
        ###parallel processing###
        inqueue = multiprocessing.Queue()
        null_ibd = manager.list([])
        if len(full_permutation_inv_raw) > local_perfect_max:
            full_permutation_inv = random.sample(full_permutation_inv_raw,
                                                 local_perfect_max)
        else:
            full_permutation_inv = full_permutation_inv_raw
        len_total = len(full_permutation_inv)
        for i in full_permutation_inv:
            inqueue.put(i)
        #start n separate processes
        procs = []
        for proc in range(n_jobs):
            p = myProcess(proc,self.family,self,inqueue,all_ibd,\
                null_ibd,pinv_sall,pinv_key_dict,pinv_pair_dict,sall_flag,infer_flag)
            p.start()
            procs.append(p)
            # One None per worker; presumably the stop sentinel read by
            # myProcess -- confirm in its run() implementation.
            inqueue.put(None)
        TIMEOUT = 3600
        start = time.time()
        # NOTE(review): because of the ``or``, the timeout only takes effect
        # once null_ibd holds at least two records; until then the loop ends
        # only when no worker is alive.
        while time.time() - start <= TIMEOUT or len(null_ibd) < 2:
            if any(p.is_alive() for p in procs):
                time.sleep(.1)  # Just to avoid hogging the CPU
            else:
                break
        else:
            # Reached only when the loop condition became false (timeout).
            try:
                #print("timed out, killing all processes")
                for p in procs:
                    p.terminate()
                    p.join()
            except:
                pass
        # Drain whatever is left in the task queue.
        while not inqueue.empty():
            inqueue.get()
        tmp_ibd_pair = [x[0] for x in null_ibd]
        # NOTE(review): the mean divides by the number of queued tasks
        # (len_total), not by len(null_ibd); the two differ if workers were
        # terminated early.
        tmp_mean_pair = float('%.9f' % (sum(tmp_ibd_pair) / len_total))
        tmp_std_pair = self.std(tmp_ibd_pair, tmp_mean_pair, sample_flag)
        mean_pair += prob * tmp_mean_pair
        # Within-configuration part of the variance.
        var_pair += prob * tmp_std_pair**2
        tmp_mean_all, tmp_std_all = 0, 0
        if sall_flag:
            tmp_ibd_all = [x[1] for x in null_ibd]
            tmp_mean_all = float('%.9f' % (sum(tmp_ibd_all) / len_total))
            tmp_std_all = self.std(tmp_ibd_all, tmp_mean_all, sample_flag)
            mean_all += prob * tmp_mean_all
            var_all += prob * tmp_std_all**2
        prob_and_mean.append((prob, tmp_mean_pair, tmp_mean_all))
        dist_s = self.distribution(pall_flag=sall_flag, null_ibd=null_ibd)
        # Merge this configuration's distribution into the mixture:
        # dist_s[1] holds the values, dist_s[0] the matching weights.
        for v_idx, ibd_v in enumerate(dist_s[1]):
            if ibd_v in combined_dist:
                combined_dist[ibd_v] += dist_s[0][v_idx] * prob
            else:
                combined_dist[ibd_v] = dist_s[0][v_idx] * prob
    ibd_keys = combined_dist.keys()
    self.dist_s = ([combined_dist[k] for k in ibd_keys], ibd_keys)
    self.all_ibd = all_ibd
    # Add the between-configuration part: weighted squared deviation of each
    # configuration mean from the overall mean.
    for tmp_ele in prob_and_mean:
        var_pair += tmp_ele[0] * (tmp_ele[1] - mean_pair)**2
        if sall_flag:
            var_all += tmp_ele[0] * (tmp_ele[2] - mean_all)**2
    self.null_mean = mean_pair
    self.sall_null_mean = mean_all
    self.null_std = math.sqrt(var_pair)
    self.sall_null_std = math.sqrt(var_all)
def nullibd(self, rep, n_jobs, sall_flag=False, infer_flag=2, simple=False, verbose=1):
    """Calculate the expected mean and std of IBD under H0 from ``rep`` replicates.

    Replicate indices ``0 .. rep - 1`` are queued and processed by ``n_jobs``
    ``myProcess`` workers, which share ``null_ibd`` and the caches through
    manager proxies / ctypes arrays. After the workers finish (or are
    terminated), the shared state is stored back on ``self``.

    Args:
        rep: Number of replicate indices put on the task queue.
        n_jobs: Number of worker processes to start.
        sall_flag: When true, ``self.pinv_sall`` / ``self.pinv_key_dict`` are
            updated and statistics for the second element of each
            ``null_ibd`` record are computed.
        infer_flag: Passed through to ``myProcess`` unchanged.
        simple: When true, the mean/std summary is skipped.
        verbose: A progress message is printed when this equals 1.

    Sets ``self.null_ibd``, ``self.pinv_pair_dict``, ``self.all_ibd`` and,
    depending on the flags, ``self.pinv_sall``, ``self.pinv_key_dict``,
    ``self.null_mean``, ``self.null_std``, ``self.sall_null_mean`` and
    ``self.sall_null_std``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source (in particular what sits under each ``if sall_flag``); confirm
    against the original layout.
    """
    if verbose == 1:
        screen_output.run_out("calculating null distribution...")
    #the number of nonfounders that should be included in inheritance vector
    founderid = self.family.founder  # not used again in this method
    ###parallel processing###
    inqueue = multiprocessing.Queue()
    all_ibd = manager.dict(self.all_ibd)
    # Seeded with the records already stored on self, so earlier results
    # count toward the totals below.
    null_ibd = manager.list(self.null_ibd)
    if self.pinv_sall == []:
        pinv_sall = Array(
            c_double,
            [0 for x in xrange(pow(2, 2 * len(self.family.invnf)))])
    else:
        pinv_sall = (c_double * len(self.pinv_sall))(*self.pinv_sall)
    pinv_key_dict = manager.dict(self.pinv_key_dict)
    pinv_pair_dict = manager.dict(self.pinv_pair_dict)
    for i in xrange(rep):
        inqueue.put(i)
    #start n separate processes
    procs = []
    for proc in range(n_jobs):
        p = myProcess(proc,self.family,self,inqueue,all_ibd,\
            null_ibd,pinv_sall,pinv_key_dict,pinv_pair_dict,sall_flag,infer_flag)
        p.start()
        procs.append(p)
        # One None per worker; presumably the stop sentinel read by
        # myProcess -- confirm in its run() implementation.
        inqueue.put(None)
    TIMEOUT = 3600  #10000 if sall_flag else 3600
    start = time.time()
    last_flag = 0
    while time.time() - start <= TIMEOUT:
        if any(p.is_alive() for p in procs):
            if last_flag > 10:
                # The record count has been observed at rep - 1 more than
                # ten times while workers were still alive: stop them.
                # NOTE(review): looks like a guard against a worker hanging
                # on the last replicate -- confirm the intent.
                for p in procs:
                    p.terminate()
                    p.join()
                break
            elif len(null_ibd) == rep - 1:
                # Counted without sleeping, so these checks run back to back.
                last_flag += 1
            else:
                time.sleep(.1)  # Just to avoid hogging the CPU
        else:
            break
    else:
        # Reached only when the loop condition became false (timeout).
        try:
            #print("timed out, killing all processes")
            for p in procs:
                p.terminate()
                p.join()
        except:
            pass
    # Drain whatever is left in the task queue.
    while not inqueue.empty():
        inqueue.get()
    # Store the shared state back on the instance.
    self.null_ibd = null_ibd
    self.pinv_pair_dict = pinv_pair_dict
    self.all_ibd = all_ibd
    if sall_flag:
        self.pinv_sall = [x for x in pinv_sall]
        self.pinv_key_dict = pinv_key_dict
    if not simple:
        #expectation and standard deviation
        ibd_pair = [x[0] for x in self.null_ibd]
        self.null_mean = float('%.9f' % (sum(ibd_pair) / len(ibd_pair)))
        self.null_std = self.std(ibd_pair, self.null_mean)
        if sall_flag:
            # Records whose second element is 0 are excluded.
            ibd_all = [x[1] for x in self.null_ibd if x[1] != 0]
            self.sall_null_mean = float('%.9f' %
                                        (sum(ibd_all) / len(ibd_all)))
            self.sall_null_std = self.std(ibd_all, self.sall_null_mean)