def deploy_countDB(elements, command_base, npacks= 1, sim_dir= './', mem= '3GB', t= '00:30:00', nodes= 2,
                    sample_sim= 0, out_db= 'out_db',command_dir= './',debug= False, deployment= 'local', log_dir= './log'):
    """
    Launch one counting job per pack of simulations.

    The simulations listed by process_dir(sim_dir) are sub-sampled, split into
    packs by get_bins(), and for every pack a command line is assembled from
    `command_base` plus the options in `elements`, then launched.

    - elements: dict of script arguments to pass to the daughter script.
        One-character keys are emitted as `-k`, longer keys as `--key`.
        Boolean values become bare flags when True and are omitted when False.
        The dict is copied; the caller's object is not modified.
    - command_base: start of the command line; options are appended to str(command_base).
    - npacks: forwarded to get_bins().
    - sim_dir: directory handed to process_dir() to list the simulations.
    - sample_sim: number of simulations to draw without replacement; 0 means all.
    - out_db, command_dir: added to the options as `db` and `cwd`.
        The comma-joined simulation names of the pack are added as `sims`.
    - deployment: 'local' runs the command through bash_launch(); any other value
        writes a batch file with sbatch_launch() and submits it with `sbatch`.
    - mem, t, nodes, debug: forwarded to sbatch_launch() (non-local deployment only).
    - log_dir: forwarded to bash_launch() (local deployment only).

    Returns None.

    Raises FileNotFoundError on non-local deployment if the `sbatch` executable
    cannot be found.
    """
    # Imported locally so this function does not depend on a file-level import.
    import subprocess

    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    bins= get_bins(array_len= len(sim_sub),npacks=npacks)

    for cycle in bins:
        # cycle is used as a [start, end] pair of slice bounds; when both
        # bounds are equal the slice would be empty, so widen it by one.
        if len(set(cycle)) == 1:
            cycle[1] += 1

        pack= sim_sub[cycle[0]:cycle[1]]
        bash_name= '_'.join(pack)

        arg_dict= dict(elements)
        arg_dict['sims']= ','.join(pack)
        arg_dict['db']= out_db
        arg_dict['cwd']= command_dir

        # Every emitted option carries a trailing space, so the pieces can be
        # concatenated directly onto the base command.
        pieces= [str(command_base)]
        for arg,val in arg_dict.items():
            prefix= '-' if len(arg) == 1 else '--'

            if isinstance(val,bool):
                if val:
                    pieces.append(prefix + arg + ' ')
            else:
                pieces.append(prefix + arg + ' {} '.format(str(val)))

        command_here= ''.join(pieces)

        if deployment == 'local':
            bash_launch(command_here,bash_name,log_dir= log_dir)
        else:
            sbatch_file= sbatch_launch(command_here,bash_name,batch_dir= '', mem= mem,t= t,nodes= nodes,debug= debug)
            # An argument list avoids shell interpretation of the file name
            # (spaces or metacharacters in the path no longer break the call).
            subprocess.run(['sbatch', sbatch_file])
def MC_sample_matrix_simple(min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False,
                    outlog= 'indy.log', row= 24,col= 4, single= False, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False,
                    distances= 'PCA',prop_gen_used= 1, scale_genSize= False, return_private= True,haps_extract= True):
    '''
    Run the mutation counter on a sample of simulations, one count per simulation.

    For every sampled simulation the VCF is read with VCF_read_filter(), the
    population assignment is read with ind_assignment_scatter_v1(), and
    count_popKmers() is applied to the resulting matrices.

    - sim_dir: directory handed to process_dir() to list the simulations.
    - sample_sim: number of simulations drawn without replacement; 0 means all.
    - min_size, samp, stepup, outemp, indfile, haps_extract: forwarded to both
        VCF_read_filter() and ind_assignment_scatter_v1().
    - scale_genSize, collapsed, diffs, bases, ksize, ploidy: forwarded to VCF_read_filter().
    - single, prop_gen_used, frequency_range, row, col, segregating: forwarded to count_popKmers().
    - return_private: forwarded to the first count_popKmers() call; when true, a
        second call is made with the PA object returned by the first, and its
        summary replaces the first one.
    - freq_extract: when true, store pop_dict_SFS(Window, pop_dict) per simulation.
    - distances: when truthy, store pop_distances_PCA(Window, pop_dict) under the
        'pairDist' key of the simulation's summary.
    - exclude: when true, read_exclude() is called once per simulation.
    - count_dir, dir_launch, main_dir, muted_dir, outlog, print_summ: accepted but
        not used by this function.

    The list defaults (samp, frequency_range) are not modified here; they are
    only passed on to the helpers.

    Returns (data_kmer, data_freqs): two dicts keyed by simulation name.
    Simulations whose genotype matrix is empty, or has fewer rows than the
    number of assigned individuals, are skipped.
    '''
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer= {}
    data_freqs= {}

    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    # Keyword arguments shared by both count_popKmers() calls.
    count_kwargs= dict(single= single, prop_gen_used= prop_gen_used, frequency_range= frequency_range,
                        row= row, col= col, segregating= segregating)

    for sim in sim_sub:
        ## chromosome: text after the last 'C' of the first dot-separated field.
        ## note: strip('chr') removes the characters c, h, r from both ends.
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')

        if exclude:
            # NOTE(review): the returned value was never used in this function;
            # the call is kept in case read_exclude() has side effects.
            read_exclude()

        ### read vcf
        Window, mut_matrix, scale= VCF_read_filter(sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
            collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
            indfile= indfile,diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                          min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum(len(x) for x in pop_dict.values())

        # Skip simulations with no data or with fewer rows than assigned individuals.
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        pop_summary, PA_dict= count_popKmers(Window, mut_matrix, pop_dict, scale= scale,
                                    return_private= return_private, **count_kwargs)
        data_kmer[sim]= pop_summary

        if return_private:
            # Second pass re-counts using the PA object produced by the first pass.
            pop_summary, _= count_popKmers(Window, mut_matrix, pop_dict, scale= scale,
                                    PA= PA_dict, **count_kwargs)
            data_kmer[sim]= pop_summary

        if freq_extract:
            data_freqs[sim]= pop_dict_SFS(Window,pop_dict)

        if distances:
            data_kmer[sim]['pairDist']= pop_distances_PCA(Window,pop_dict)

    return data_kmer, data_freqs
def MC_sample_matrix_dict(pop_names, pop_lengths,min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',chrom_idx= 0,
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, genome_size= 1,
                    outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False, sim_del= 'C',
                    distances= 'PCA', Lsteps= 1,scale_genSize= False,prop_gen_used= 1,return_private= True):
    '''
    Run the mutation counter on manipulated population assignments, once per
    (assignment tag, window length) combination.

    For every sampled simulation the VCF is read with VCF_read_filter() and the
    tagged assignments with ind_assignment_scatter_v1(). For every tag and every
    window length, count_popKmers() is applied to the first columns of the
    genotype and mutation matrices, and the summary is stored under the key
    sim + tag + '.' + window label.

    - sim_dir: directory handed to process_dir() to list the simulations.
    - sample_sim: number of simulations drawn without replacement; 0 means all.
    - chrom_idx, sim_del: the chromosome label is the text after the last
        `sim_del` in dot-separated field `chrom_idx` of the simulation name.
    - Lsteps: number of window lengths generated with np.linspace.
    - return_private, freq_extract, distances: control the second counting pass,
        the pop_dict_SFS() output and the 'pairDist' entry respectively.
    - pop_names, pop_lengths, genome_size, count_dir, dir_launch, main_dir,
        muted_dir, outlog: accepted but not referenced in the body.

    Returns (data_kmer, data_freqs): two dicts keyed by sim + tag + '.' + window label.

    NOTE(review): the body reads `haps_extract`, `refseq`, `subset_summary`,
    `time_mut` and `read_time`, none of which is a parameter or assigned in this
    function. Unless they exist as module-level names, the function raises
    NameError when reached - confirm against the rest of the file.
    '''
    ti= time.time()
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    # tags, sim_extend and chroms are filled below but never returned.
    tags= []
    sim_extend= []
    chroms= []

    data_kmer= {}
    data_freqs= {}

    # sample_sim == 0 means "use every available simulation".
    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome label; note that strip('chr') removes the characters
        ## c, h and r from both ends rather than the literal prefix.
        chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        # `files` is not read anywhere else in this function.
        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        t0= time.time()
        # NOTE(review): haps_extract is not a parameter of this function.
        Window, mut_matrix, scale= VCF_read_filter(
            sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
            collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
            indfile= indfile, diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy
            )

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                          min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum([len(x) for x in pop_dict.values()])

        # Skip simulations with no data or with fewer rows than assigned individuals.
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        s0= time.time()
        # count_time is taken before any counting happens, so it spans the
        # read step (t0 -> t1) rather than the counting below.
        t1= time.time()
        count_time= t1- t0

        if len(tag_list):
            ###
            sim_extend.extend([sim]*len(tag_list))
            chroms.extend([chrom]*len(tag_list))
            ###
            # NOTE(review): refseq is not defined in this function.
            Window_lengths= np.linspace(1,len(refseq),Lsteps,dtype= int)
            ###

            for idx in range(len(tag_list)):

                # seq_idx / present_state persist across window lengths of one tag.
                seq_idx= 0
                present_state= 0

                for snp_n in Window_lengths:

                    if snp_n < 10:
                        # Window lengths below 10 select every column.
                        lrange= list(range(Window.shape[1]))
                        tag_l= 'full'
                    else:
                        # Advance through the 'POS' column until the position
                        # reaches snp_n, or the last row is hit.
                        # NOTE(review): subset_summary is not defined in this function.
                        # NOTE(review): the source had lost its indentation; the
                        # placement of `seq_idx += 1` inside the else branch is a
                        # reconstruction - confirm against the original file.
                        while present_state < snp_n:

                            if seq_idx >= (subset_summary.shape[0]-1):
                                present_state= len(refseq)
                                seq_idx= subset_summary.shape[0]-1
                            else:
                                present_state= subset_summary['POS'][seq_idx]
                                seq_idx += 1

                        lrange= list(range(seq_idx))
                        tag_l= str(snp_n * scale)

                    # Key suffix: assignment tag plus window label.
                    tag_here= tag_list[idx] + '.' + tag_l
                    tags.append(tag_here)

                    tag= tag_list[idx]

                    new_sim= sim + tag_here

                    pop_dict= tag_dict[tag]

                    # First pass, restricted to the selected columns.
                    pop_summary, PA_dict= count_popKmers(Window[:,lrange], mut_matrix[:,lrange], pop_dict, single= single, prop_gen_used= prop_gen_used,
                                              frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                              return_private= return_private)

                    data_kmer[new_sim]= pop_summary

                    if return_private:
                        # Second pass reuses the PA object from the first pass;
                        # its summary replaces the first one.
                        pop_summary, dummy= count_popKmers(Window[:,lrange], mut_matrix[:,lrange], pop_dict, single= single, prop_gen_used= prop_gen_used,
                                                  frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                                  PA= PA_dict)

                        data_kmer[new_sim]= pop_summary

                    # Unlike the counts above, these two use the full Window
                    # (not the lrange column subset).
                    if freq_extract:
                        pop_freqs= pop_dict_SFS(Window,pop_dict)
                        data_freqs[new_sim]= pop_freqs

                    if distances:
                        data_kmer[new_sim]['pairDist']= pop_distances_PCA(Window,pop_dict)

        if print_summ:
            # NOTE(review): time_mut and read_time are not assigned in this function.
            # Values are divided by 60 although the label says 's'.
            print('mut_matrix time: {} s'.format(time_mut / 60))
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf= time.time()
    time_elapsed= tf - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
def MC_sample_matrix_v1(min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',chrom_idx= 0, prop_gen_used= 1,
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, scale_genSize= False,
                    outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False, sim_del= 'C',
                    genome_size= 1,haps_extract= False, return_private= True):
    '''
    Run the mutation counter on a sample of simulations and on every tagged
    population assignment of each simulation.

    For every sampled simulation the VCF is read with VCF_read_filter() and the
    assignments with ind_assignment_scatter_v1(). count_popKmers() is applied
    once to the untagged assignment (stored under the simulation name), then
    once per tag (stored under sim + tag), reusing the 'array' entry of the
    latest summary and the PA object from the first pass.

    - sim_dir: directory handed to process_dir() to list the simulations.
    - sample_sim: number of simulations drawn without replacement; 0 means all.
    - chrom_idx, sim_del: the chromosome label is the text after the last
        `sim_del` in dot-separated field `chrom_idx` of the simulation name.
    - min_size, samp, stepup, outemp, indfile, haps_extract: forwarded to both
        VCF_read_filter() and ind_assignment_scatter_v1().
    - scale_genSize, collapsed, diffs, bases, ksize, ploidy: forwarded to VCF_read_filter().
    - single, prop_gen_used, frequency_range, row, col, segregating: forwarded to count_popKmers().
    - return_private: forwarded to the first count_popKmers() call; when true a
        second call is made with the returned PA object and replaces the summary.
    - freq_extract: when true, store pop_dict_SFS(Window, pop_dict) per key.
    - exclude: when true, read_exclude() is called once per simulation.
    - print_summ: when true, print timing information for every simulation.
    - count_dir, dir_launch, main_dir, muted_dir, outlog, genome_size: accepted
        but not used by this function.

    The list defaults (samp, frequency_range) are not modified here; they are
    only passed on to the helpers.

    Returns (data_kmer, data_freqs): two dicts keyed by simulation name and by
    sim + tag. Simulations whose genotype matrix is empty, or has fewer rows
    than the number of assigned individuals, are skipped.
    '''
    ti= time.time()
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer= {}
    data_freqs= {}

    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    # Keyword arguments shared by every count_popKmers() call.
    count_kwargs= dict(single= single, prop_gen_used= prop_gen_used, frequency_range= frequency_range,
                        row= row, col= col, segregating= segregating)

    for sim in sim_sub:

        ## chromosome label; note that strip('chr') removes the characters
        ## c, h and r from both ends rather than the literal prefix.
        chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        if exclude:
            # NOTE(review): the returned value was never used in this function;
            # the call is kept in case read_exclude() has side effects.
            read_exclude()

        ### read vcf
        t0= time.time()

        Window, mut_matrix, scale= VCF_read_filter(sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
            collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
            indfile= indfile,diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                          min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum(len(x) for x in pop_dict.values())

        read_time= time.time() - t0

        # Skip simulations with no data or with fewer rows than assigned individuals.
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        s0= time.time()

        pop_summary, PA_dict= count_popKmers(Window, mut_matrix, pop_dict, scale= scale,
                                    return_private= return_private, **count_kwargs)
        data_kmer[sim]= pop_summary

        if return_private:
            # Second pass re-counts using the PA object produced by the first pass.
            pop_summary, _= count_popKmers(Window, mut_matrix, pop_dict, scale= scale,
                                    PA= PA_dict, **count_kwargs)
            data_kmer[sim]= pop_summary

        if freq_extract:
            data_freqs[sim]= pop_dict_SFS(Window,pop_dict)

        # Measured from the start of counting (s0), so the read step, which is
        # reported separately as read_time, is not included.
        count_time= time.time() - s0

        for tag in tag_list:
            new_sim= sim + tag
            pop_dict= tag_dict[tag]

            # NOTE(review): pop_summary is rebound on every iteration, so from
            # the second tag on, 'array' comes from the previous tag's summary.
            # This ordering matches the original code - confirm it is intended.
            pop_summary, _= count_popKmers(pop_summary['array'], mut_matrix, pop_dict, scale= scale,
                                    PA= PA_dict,counted= True, **count_kwargs)
            data_kmer[new_sim]= pop_summary

            if freq_extract:
                data_freqs[new_sim]= pop_dict_SFS(Window,pop_dict)

        if print_summ:
            # The former 'mut_matrix time' line read a name (time_mut) that is
            # never assigned in this function; it has been dropped.
            # NOTE(review): values are divided by 60 although the label says 's'.
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf= time.time()
    time_elapsed= tf - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs