Esempio n. 1
0
def deploy_countDB(elements, command_base, npacks= 1, sim_dir= './',
                    mem= '3GB', t= '00:30:00', nodes= 2, sample_sim= 0,
                    out_db= 'out_db',command_dir= './',debug= False, deployment= 'local',
                    log_dir= './log'):
    """
    Launch a command on batches of simulations, locally or through sbatch.

    The simulations found by process_dir(sim_dir) are sub-sampled without
    replacement, split into index ranges by get_bins(), and one command line
    is built and launched per range.

    - elements: dict, script arguments to pass to the daughter script.
      Keys of length 1 are emitted as `-k`, longer keys as `--key`.
      Boolean values become bare flags (emitted only when True); every other
      value is emitted as `--key value`.
    - command_base: prefix of the command line. Arguments are appended to it
      directly, so it should already end with a separator.
    - npacks: forwarded to get_bins() together with the number of sampled sims.
    - sim_dir: directory handed to process_dir() to list simulations.
    - sample_sim: number of simulations to sample; 0 means all of them.
    - out_db, command_dir: written into the argument dict as `db` and `cwd`.
      The comma-joined simulation names of the batch are written as `sims`.
      These three keys override same-named keys in `elements`.
    - deployment: 'local' runs bash_launch(); any other value writes a batch
      file with sbatch_launch() and submits it with `sbatch`.
    - mem, t, nodes, debug: forwarded to sbatch_launch().
    - log_dir: forwarded to bash_launch().

    Returns None.
    """
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    bins= get_bins(array_len= len(sim_sub),npacks=npacks)
    for cycle in bins:
        # Work on local bounds so the object returned by get_bins() is not
        # modified in place.
        start, stop= cycle[0], cycle[1]
        if len(set(cycle)) == 1:
            # Degenerate range (all entries equal): widen it by one so the
            # slice below is not empty.
            stop += 1

        batch= sim_sub[start:stop]
        bash_name= '_'.join(batch)

        # Copy so the caller's dict is never modified.
        arg_dict= dict(elements)
        arg_dict['sims']= ','.join(batch)
        arg_dict['db']= out_db
        arg_dict['cwd']= command_dir

        pieces= [str(command_base)]
        for arg,val in arg_dict.items():
            prefix= '-' if len(arg) == 1 else '--'

            if isinstance(val,bool):
                # Booleans are bare switches; False means "omit the flag".
                if val:
                    pieces.append(prefix + arg + ' ')
            else:
                pieces.append(prefix + arg + ' {} '.format(str(val)))

        command_here= ''.join(pieces)

        if deployment == 'local':
            bash_launch(command_here,bash_name,log_dir= log_dir)
        else:
            sbatch_file= sbatch_launch(command_here,bash_name,batch_dir= '',
                mem= mem,t= t,nodes= nodes,debug= debug)

            os.system('sbatch ' + sbatch_file)
Esempio n. 2
0
def MC_sample_matrix_simple(min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False,
                    outlog= 'indy.log', row= 24,col= 4, single= False, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False,
                    distances= 'PCA',prop_gen_used= 1, scale_genSize= False, return_private= True,haps_extract= True):
    '''
    Launch the mutation counter pipeline on the original population assignments.
    - relies on count_popKmers() to count mutations per population.

    For each sampled simulation: read it with VCF_read_filter(), read the
    population assignments with ind_assignment_scatter_v1(), and store the
    count_popKmers() summary under the simulation name.

    - sim_dir: directory listed with process_dir() and passed to the readers.
    - sample_sim: number of simulations to sample without replacement;
      0 means all of them.
    - exclude: when True, read_exclude() is called once per simulation.
    - return_private: when True, count_popKmers() is called a second time with
      the PA dict returned by the first call, and that second summary is kept.
    - freq_extract: when True, pop_dict_SFS() output is stored per simulation.
    - distances: when truthy, pop_distances_PCA() output is stored under the
      'pairDist' key of the simulation summary.
    - min_size, samp, stepup, outemp, indfile, haps_extract: forwarded to
      VCF_read_filter() and ind_assignment_scatter_v1().
    - scale_genSize, collapsed, diffs, bases, ksize, ploidy: forwarded to
      VCF_read_filter().
    - single, prop_gen_used, frequency_range, row, col, segregating:
      forwarded to count_popKmers().
    - count_dir, dir_launch, main_dir, muted_dir, outlog, print_summ: accepted
      but not used in this function.

    Returns (data_kmer, data_freqs), two dicts keyed by simulation name.
    Simulations with an empty Window, or with fewer Window rows than assigned
    individuals, are skipped.
    '''
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer= {}
    data_freqs= {}

    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome: last 'C'-delimited piece of the name before the first '.'
        # NOTE(review): str.strip('chr') removes any of the characters
        # 'c', 'h', 'r' from both ends, not the literal prefix 'chr'.
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')

        if exclude:
            # NOTE(review): the return value is not used in this function;
            # the call is kept in case read_exclude() is relied on elsewhere.
            read_exclude()

        ### read vcf
        Window, mut_matrix, scale= VCF_read_filter(sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
            collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
            indfile= indfile,diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                          min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        # Skip simulations that cannot cover every assigned individual.
        total_inds= sum(len(x) for x in pop_dict.values())
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        pop_summary, PA_dict= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                  frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                  return_private= return_private)

        if return_private:
            # Second pass: feed the PA dict from the first pass back in and
            # keep this summary instead of the first one.
            pop_summary, _= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                      frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                      PA= PA_dict)

        data_kmer[sim]= pop_summary

        if freq_extract:
            data_freqs[sim]= pop_dict_SFS(Window,pop_dict)

        if distances:
            data_kmer[sim]['pairDist']= pop_distances_PCA(Window,pop_dict)

    return data_kmer, data_freqs
Esempio n. 3
0
def MC_sample_matrix_dict(pop_names, pop_lengths,min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',chrom_idx= 0,
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, genome_size= 1,
                    outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False, sim_del= 'C',
                         distances= 'PCA', Lsteps= 1,scale_genSize= False,prop_gen_used= 1,return_private= True):
    '''
    Launch the mutation counter pipeline on manipulated population assignments.
    - relies on count_popKmers() to count mutations per population.

    For each sampled simulation and each tag returned by
    ind_assignment_scatter_v1(), counts are computed on a column subset of
    Window / mut_matrix for every entry of Window_lengths, and stored under
    the key  sim + tag + '.' + window label.

    - pop_names, pop_lengths: accepted but not used in this function.
    - sim_dir: directory listed with process_dir() and passed to the readers.
    - sample_sim: number of simulations to sample without replacement;
      0 means all of them.
    - chrom_idx, sim_del: select the '.'-delimited field of the simulation
      name and the delimiter used to extract the chromosome label from it.
    - Lsteps: number of window lengths generated with np.linspace.
    - return_private: when True, count_popKmers() is called a second time with
      the PA dict from the first call and that second summary is kept.
    - freq_extract / distances: when truthy, pop_dict_SFS() /
      pop_distances_PCA() outputs are stored as well.
    - print_summ: when True, timing information is printed per simulation.

    Returns (data_kmer, data_freqs).

    NOTE(review): the body reads `haps_extract`, `refseq`, `subset_summary`,
    `time_mut` and `read_time`, none of which are parameters or locals of this
    function. Unless they are defined at module level, the corresponding
    lines raise NameError. Their intended source is not visible here.
    '''

    ti= time.time()
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    # Bookkeeping lists: appended to below, never read or returned.
    tags= []
    sim_extend= []
    chroms= []

    data_kmer= {}
    data_freqs= {}
    # sample_sim == 0 is the "use every available simulation" sentinel.
    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome
        # NOTE(review): str.strip('chr') removes any of the characters
        # 'c', 'h', 'r' from both ends, not the literal prefix 'chr'.
        chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        # `files` is assigned but not used afterwards in this function.
        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        t0= time.time()
        # NOTE(review): `haps_extract` is not defined in this function's scope.
        Window, mut_matrix, scale= VCF_read_filter(
        	sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
            collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
            indfile= indfile, diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy
            )

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                          min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        # Skip simulations with an empty Window or fewer rows than assigned individuals.
        total_inds= sum([len(x) for x in pop_dict.values()])
        if not len(Window) or Window.shape[0] < total_inds:
            continue
        ## counts for no tag sim:
        s0= time.time()
        t1= time.time()
        # NOTE(review): measured before any counting is done, so this covers
        # the read / assignment steps above rather than the counting below.
        count_time= t1- t0

        if len(tag_list):
            ###
            sim_extend.extend([sim]*len(tag_list))
            chroms.extend([chrom]*len(tag_list))
            ###
            # NOTE(review): `refseq` is not defined in this function's scope.
            Window_lengths= np.linspace(1,len(refseq),Lsteps,dtype= int)
            ###

            for idx in range(len(tag_list)):

                # Cursor state is reset per tag and carried over between
                # successive window lengths of the same tag.
                seq_idx= 0
                present_state= 0

                for snp_n in Window_lengths:
                    if snp_n < 10:
                        # Window lengths below 10 use every column.
                        lrange= list(range(Window.shape[1]))
                        tag_l= 'full'
                    else:
                        # Advance seq_idx until the 'POS' value read reaches snp_n.
                        # NOTE(review): `subset_summary` is not defined in this
                        # function's scope.
                        while present_state < snp_n:

                            if seq_idx >= (subset_summary.shape[0]-1):
                                # Last row reached: force the loop to stop.
                                present_state= len(refseq)
                                seq_idx= subset_summary.shape[0]-1
                            else:
                                present_state= subset_summary['POS'][seq_idx]
                                seq_idx += 1

                        lrange= list(range(seq_idx))
                        tag_l= str(snp_n * scale)
                    #
                    tag_here= tag_list[idx] + '.' + tag_l
                    tags.append(tag_here)
                    ##
                    tag= tag_list[idx]
                    #
                    new_sim= sim + tag_here

                    # Rebinds pop_dict to the assignment for this tag.
                    pop_dict= tag_dict[tag]

                    #########
                    #########
                    pop_summary, PA_dict= count_popKmers(Window[:,lrange], mut_matrix[:,lrange], pop_dict, single= single, prop_gen_used= prop_gen_used,
                                              frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                              return_private= return_private)

                    data_kmer[new_sim]= pop_summary

                    if return_private: 
                        # Second pass with the PA dict from the first pass;
                        # this summary replaces the one stored above.
                        pop_summary, dummy= count_popKmers(Window[:,lrange], mut_matrix[:,lrange], pop_dict, single= single, prop_gen_used= prop_gen_used,
                                                  frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                                  PA= PA_dict)
                        data_kmer[new_sim]= pop_summary


                    # NOTE(review): the two calls below receive the full Window,
                    # not the Window[:,lrange] subset used for the counts.
                    if freq_extract:
                        pop_freqs= pop_dict_SFS(Window,pop_dict)
                        data_freqs[new_sim]= pop_freqs

                    if distances:
                        data_kmer[new_sim]['pairDist']= pop_distances_PCA(Window,pop_dict)

        if print_summ:
            # NOTE(review): `time_mut` and `read_time` are not defined in this
            # function's scope. Values are divided by 60 but labelled 's'.
            print('mut_matrix time: {} s'.format(time_mut / 60))
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf= time.time()
    time_elapsed= tf - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
Esempio n. 4
0
def MC_sample_matrix_v1(min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',chrom_idx= 0, prop_gen_used= 1,
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, scale_genSize= False,
                    outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False, sim_del= 'C',
                    genome_size= 1,haps_extract= False, return_private= True):
    '''
    Launch the mutation counter pipeline on population assignments.
    - relies on count_popKmers() to count mutations per population.

    For each sampled simulation the original assignment is counted and stored
    under the simulation name. Then every tag returned by
    ind_assignment_scatter_v1() is counted and stored under  sim + tag.

    - sim_dir: directory listed with process_dir() and passed to the readers.
    - sample_sim: number of simulations to sample without replacement;
      0 means all of them.
    - chrom_idx, sim_del: select the '.'-delimited field of the simulation
      name and the delimiter used to extract the chromosome label from it.
    - exclude: when True, read_exclude() is called once per simulation.
    - return_private: when True, the base count is repeated with the PA dict
      returned by the first call and that second summary is kept.
    - freq_extract: when True, pop_dict_SFS() output is stored per key.
    - print_summ: when True, timing information is printed per simulation.
    - min_size, samp, stepup, outemp, indfile, haps_extract: forwarded to
      VCF_read_filter() and ind_assignment_scatter_v1().
    - scale_genSize, collapsed, diffs, bases, ksize, ploidy: forwarded to
      VCF_read_filter().
    - single, prop_gen_used, frequency_range, row, col, segregating:
      forwarded to count_popKmers().
    - count_dir, dir_launch, main_dir, muted_dir, outlog, genome_size:
      accepted but not used in this function.

    Returns (data_kmer, data_freqs). Simulations with an empty Window, or
    with fewer Window rows than assigned individuals, are skipped.
    '''

    ti= time.time()
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer= {}
    data_freqs= {}

    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome
        # NOTE(review): str.strip('chr') removes any of the characters
        # 'c', 'h', 'r' from both ends, not the literal prefix 'chr'.
        chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        if exclude:
            # NOTE(review): the return value is not used in this function;
            # the call is kept in case read_exclude() is relied on elsewhere.
            read_exclude()

        ### read vcf
        t0= time.time()
        Window, mut_matrix, scale= VCF_read_filter(sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
            collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
            indfile= indfile,diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                      min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum(len(x) for x in pop_dict.values())
        read_time= time.time() - t0

        # Skip simulations that cannot cover every assigned individual.
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        s0= time.time()
        pop_summary, PA_dict= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                  frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                  return_private= return_private)

        if return_private:
            # Second pass: feed the PA dict from the first pass back in and
            # keep this summary instead of the first one.
            pop_summary, _= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                      frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                      PA= PA_dict)

        data_kmer[sim]= pop_summary

        if freq_extract:
            data_freqs[sim]= pop_dict_SFS(Window,pop_dict)

        # Time of the base count only; the read step is reported separately.
        count_time= time.time() - s0

        for tag in tag_list:
            new_sim= sim + tag
            pop_dict= tag_dict[tag]

            # NOTE(review): pop_summary is rebound on every iteration, so from
            # the second tag on the 'array' passed here comes from the previous
            # tag's summary rather than from the base count. Preserved as in
            # the original; confirm against count_popKmers(counted= True).
            pop_summary, _= count_popKmers(pop_summary['array'], mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                              frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                              PA= PA_dict,counted= True)

            data_kmer[new_sim]= pop_summary

            if freq_extract:
                data_freqs[new_sim]= pop_dict_SFS(Window,pop_dict)

        if print_summ:
            # mut_matrix is returned by VCF_read_filter(), so its cost is part
            # of read_time; no separate mut_matrix timer exists.
            # NOTE(review): values are divided by 60 but labelled 's'.
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf= time.time()
    time_elapsed= tf - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs