def count_popKmers(Window, mut_matrix, pop_dict, single=True, frequency_range=[0, 1], row=24, col=4):
    '''
    Summarise mutation-type counts for every population in `pop_dict`.

    For each population the rows of `Window` listed in `pop_dict[pop]` are
    selected, columns whose within-selection frequency (column sum divided by
    the number of selected rows) lies strictly outside `frequency_range` are
    zeroed, and the result is handed to `geno_muts_v2` together with
    `mut_matrix`. The per-column totals of what `geno_muts_v2` returns are
    reshaped to `(row, col)`.

    Parameters:
        Window: 2-D genotype array; first axis is indexed by the entries of
            `pop_dict`, second axis by variant.
        mut_matrix: forwarded unchanged to `geno_muts_v2`.
        pop_dict: mapping population -> row indices into `Window`.
        single: when true, the selection is collapsed to a single 0/1 row
            marking columns with a positive sum, so each variant is counted
            at most once per population.
        frequency_range: `[lower, upper]` bounds; frequencies equal to a
            bound are kept.
        row, col: target shape of each count table; `row * col` has to match
            the number of columns returned by `geno_muts_v2`.

    Returns:
        dict with keys 'counts' (population -> `(row, col)` array), 'Nvars'
        (population -> grand total of the `geno_muts_v2` output) and 'sizes'
        (population -> `len(pop_dict[pop])`).
    '''
    counts_by_pop = {}
    nvars_by_pop = {}

    for pop in pop_dict:
        geno = Window[pop_dict[pop], :]

        # Frequency here is "column sum / number of selected rows".
        allele_freq = np.sum(geno, axis=0) / geno.shape[0]

        # Blank out variants outside the requested frequency band.
        # NOTE(review): this writes into `geno`; whether `Window` itself is
        # affected depends on whether the row selection above produced a copy
        # (index list/array) or a view (slice) - confirm against callers.
        outside = (allele_freq < frequency_range[0]) | (allele_freq > frequency_range[1])
        geno[:, outside] = 0

        if single:
            carried = np.sum(geno, axis=0) > 0
            geno = np.array(carried, dtype=int).reshape(1, len(carried))

        collapsed = geno_muts_v2(geno, mut_matrix)

        counts_by_pop[pop] = np.sum(collapsed, axis=0).reshape(row, col)
        nvars_by_pop[pop] = np.sum(collapsed)

    sizes = {name: len(members) for name, members in pop_dict.items()}

    return {
        'counts': counts_by_pop,
        'Nvars': nvars_by_pop,
        'sizes': sizes
    }
def countkmers_cofactor(pop_gen, mut_matrix, pop_ori, single=True, frequency_range=[0, 1], scale=1,
                        prop_gen_used=1, return_private=False, PA={}):
    '''
    Per-population worker for count_popKmers; lets callers dissect populations.

    Steps, in order:
      1. compute per-column frequencies of `pop_gen` (column sum / row count)
         and flag columns at or beyond either bound of `frequency_range`;
      2. if `PA` is non-empty, zero every column where `PA[pop_ori]` is 0;
      3. if `single`, collapse `pop_gen` to one 0/1 row of columns with a
         positive sum;
      4. record which columns are still non-zero (scaled by
         `scale * prop_gen_used`) - this happens BEFORE the frequency filter;
      5. zero the columns flagged in step 1 and pass the matrix to
         `geno_muts_v2`.

    Parameters:
        pop_gen: 2-D genotype array (rows x variants). It is written to in
            place in step 2, and in step 5 when `single` is false.
        mut_matrix: forwarded to `geno_muts_v2`.
        pop_ori: key used to look up this population in `PA`.
        single: collapse to a presence/absence row (see step 3).
        frequency_range: `[lower, upper]`; frequencies `<= lower` or
            `>= upper` are discarded.
        scale, prop_gen_used: multipliers applied to the segregating mask.
        return_private: not used in this function.
        PA: mapping population -> per-variant flags; an empty mapping
            disables step 2.

    Returns:
        `(pop_collapsed_mat, pop_seg_ori)`: the `geno_muts_v2` output and the
        scaled `(1, n_variants)` mask from step 4.
    '''
    allele_freq = np.sum(pop_gen, axis=0) / pop_gen.shape[0]

    # Columns to drop later; note the inclusive comparisons on both bounds.
    outside = (allele_freq <= frequency_range[0]) | (allele_freq >= frequency_range[1])

    if PA:
        flagged_zero = [site for site in range(pop_gen.shape[1]) if PA[pop_ori][site] == 0]
        pop_gen[:, flagged_zero] = 0

    if single:
        carried = np.sum(pop_gen, axis=0) > 0
        pop_gen = np.array(carried, dtype=int).reshape(1, len(carried))

    # Segregating mask is taken before the frequency filter is applied.
    still_present = np.sum(pop_gen, axis=0) > 0
    seg_row = np.array(still_present, dtype=int).reshape(1, len(still_present))
    pop_seg_ori = seg_row * scale * prop_gen_used

    pop_gen[:, outside] = 0

    pop_collapsed_mat = geno_muts_v2(pop_gen, mut_matrix)
    return pop_collapsed_mat, pop_seg_ori
def countkmers_cofactor(pop_gen, mut_matrix, mut_idx, pop_ori, single=True, frequency_range=[0, 1], scale=1,
                        prop_gen_used=1, return_private=False, PA=False):
    '''
    Per-population worker for count_popKmers; lets callers dissect populations.

    When `PA` is truthy, columns of `pop_gen` whose frequency (column sum /
    row count) is at or beyond either bound of `frequency_range` are zeroed
    in place; with the default `PA=False` no frequency filter is applied.
    A `(1, n_variants)` 0/1 mask of columns with a positive sum is then built
    and, if `single`, used in place of the genotype matrix.

    A one-row matrix is counted with `lineAssign` (given its first row,
    `mut_idx` and `nmuts=mut_matrix.shape[0]`); anything else goes through
    `geno_muts_v2(pop_gen, mut_matrix)`.

    Side effect: prints '#', the shape of the matrix that was counted, and
    the elapsed seconds of the filter and count phases on every call.

    Parameters `pop_ori`, `scale`, `prop_gen_used` and `return_private` are
    not used in this function.

    Returns:
        `(pop_collapsed_mat, pop_seg_ori)`: the counting result and the
        presence/absence mask.
    '''
    started = time.time()

    if PA:
        allele_freq = np.sum(pop_gen, axis=0) / pop_gen.shape[0]
        # Inclusive comparisons: variants exactly on a bound are dropped too.
        outside = (allele_freq <= frequency_range[0]) | (allele_freq >= frequency_range[1])
        pop_gen[:, outside] = 0

    carried = np.sum(pop_gen, axis=0) > 0
    pop_seg_ori = np.array(carried, dtype=int).reshape(1, len(carried))

    if single:
        pop_gen = pop_seg_ori

    filtered = time.time()

    if pop_gen.shape[0] != 1:
        pop_collapsed_mat = geno_muts_v2(pop_gen, mut_matrix)
    else:
        pop_collapsed_mat = lineAssign(pop_gen[0], mut_idx, nmuts=mut_matrix.shape[0])

    counted = time.time()

    # Diagnostic output, one value per line.
    for report in ('#', pop_gen.shape, filtered - started, counted - filtered):
        print(report)

    return pop_collapsed_mat, pop_seg_ori
def MC_sample_matrix_v1(min_size=80, samp=[5, 20, 10], stepup="increment", diffs=False, frequency_range=[0, 1],
                        indfile='ind_assignments.txt', outemp='ind_assignments{}.txt',
                        count_dir='./count/', dir_launch='..', main_dir='./',
                        sim_dir='mutation_counter/data/sims/',
                        muted_dir='mutation_counter/data/mutation_count/',
                        outlog='indy.log', row=24, col=4, single=True, exclude=False, print_summ=False,
                        sample_sim=0, collapsed=True, bases='ACGT', ksize=3, ploidy=2, freq_extract=False):
    '''
    Launch the mutation counter pipeline on manipulated population assignments.

    For a random subset of the simulations found under `sim_dir`, read the
    VCF and reference FASTA, build the variant -> mutation-type matrix with
    `vcf_muts_matrix_v1`, and count mutations per population with
    `count_popKmers`, once for the base assignment and once for every
    sub-sampled assignment returned by `ind_assignment_scatter_v1`.

    Parameters:
        min_size, samp, stepup, outemp, indfile: forwarded to
            `ind_assignment_scatter_v1`.
        diffs: if true, positions returned by `read_diffs` are added to the
            set of variants whose genotype is flipped.
        frequency_range, single, row, col: forwarded to `count_popKmers`.
        sim_dir: directory listed by `process_dir`; each simulation is read
            from `sim_dir + sim + '/'`.
        exclude: if true, `read_exclude()` is called for each simulation.
        print_summ: print per-simulation timing information.
        sample_sim: number of simulations to draw without replacement;
            0 means all of them.
        collapsed, bases, ksize: forwarded to `vcf_muts_matrix_v1`.
        ploidy: flipped genotypes become `ploidy - genotype`.
        freq_extract: also store `pop_dict_SFS(Window, pop_dict)` per entry.
        count_dir, dir_launch, main_dir, muted_dir, outlog: accepted for
            interface compatibility; not used in this function.

    Returns:
        `(data_kmer, data_freqs)`: dicts keyed by simulation name (and by
        simulation name + tag for sub-sampled assignments). `data_freqs` is
        empty unless `freq_extract` is true.

    NOTE(review): this file also contains a `count_popKmers` definition that
    takes a `mut_idx` positional argument; the calls below use the form
    without it - confirm which definition is in scope at call time.
    '''
    ti = time.time()

    sims = process_dir(sims_dir=sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer = {}
    data_freqs = {}

    if sample_sim == 0:
        sample_sim = len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub = np.random.choice(sims, sample_sim, replace=False)

    for sim in sim_sub:
        # Chromosome label: text after the last 'C' of the name's first
        # dot-separated field, with any 'c'/'h'/'r' characters stripped from
        # both ends.
        chrom = sim.split('.')[0].split('C')[-1].strip('chr')

        if exclude:
            # The result is not consumed here; the call is kept in case
            # read_exclude() has effects that callers rely on.
            read_exclude()

        ### read vcf
        vcf_dir = sim_dir + sim + '/'
        vcf_file = vcf_dir + sim + '_' + 'chr' + chrom + '.vcf.gz'

        read_start = time.time()
        print(sim)
        genotype, summary, _ = read_vcf_allel(vcf_file)
        read_time = time.time() - read_start

        if len(genotype) == 0:
            continue

        print(genotype.shape)

        ## read fasta: the sequence is taken from the second line of the file.
        fasta_file = vcf_dir + 'chr{}_{}.fa.gz'.format(chrom, sim)
        with gzip.open(fasta_file, 'r') as f:
            lines = [x.decode() for x in f.readlines()]
        refseq = lines[1].strip()

        ### window spanning every reported position
        positions = [int(x) for x in summary.POS]
        wstart = int(min(positions))
        wend = int(max(positions))

        genotype_parse = [
            x for x in range(summary.shape[0])
            if wstart <= int(summary.POS[x]) - 1 <= wend
        ]
        Window = genotype[:, genotype_parse]
        subset_summary = summary.loc[genotype_parse, :].reset_index()

        ## variant -> mutation type matrix
        mut_start = time.time()
        mut_matrix, flag_reverse, flag_remove = vcf_muts_matrix_v1(
            refseq, subset_summary, start=wstart, end=wend,
            ksize=ksize, bases=bases, collapse=collapsed)

        # Set lookup keeps the filter linear in the number of variants.
        removed = set(flag_remove)
        retain = [x for x in range(Window.shape[1]) if x not in removed]
        Window = Window[:, retain]
        subset_summary = subset_summary.loc[retain, :].reset_index()
        time_mut = time.time() - mut_start

        if diffs:
            sim_start = sim.split('.')[-1]
            diff_snps = read_diffs(sim, diff_dir=vcf_dir, start=int(sim_start))

            summary_diff = [x for x in range(subset_summary.shape[0])
                            if subset_summary.POS[x] in diff_snps.keys()]

            flag_reverse.extend(summary_diff)
            flag_reverse = list(set(flag_reverse))

        if flag_reverse:
            # NOTE(review): flag_reverse comes from vcf_muts_matrix_v1, which
            # ran before the `retain` filter; confirm its indices refer to
            # the filtered columns.
            Window[:, flag_reverse] = ploidy - Window[:, flag_reverse]

        # The per-individual geno_muts_v2(Window, mut_matrix) product that
        # used to be computed here was never read; count_popKmers does its
        # own per-population counting, so the call was dropped.

        tag_list, tag_dict, pop_dict = ind_assignment_scatter_v1(
            sim, dir_sim=sim_dir, min_size=min_size, samp=samp,
            stepup=stepup, outemp=outemp, indfile=indfile)

        total_inds = sum(len(x) for x in pop_dict.values())
        if Window.shape[0] < total_inds:
            # Fewer genotype rows than assigned individuals: skip.
            continue

        ## counts for no tag sim:
        count_start = time.time()
        data_kmer[sim] = count_popKmers(Window, mut_matrix, pop_dict, single=single,
                                        frequency_range=frequency_range, row=row, col=col)

        if freq_extract:
            data_freqs[sim] = pop_dict_SFS(Window, pop_dict)

        # Measured from the start of counting (previously measured from the
        # start of mutation-matrix construction by mistake).
        count_time = time.time() - count_start

        ## counts for every sub-sampled assignment
        for tag in tag_list:
            new_sim = sim + tag
            tag_pops = tag_dict[tag]

            data_kmer[new_sim] = count_popKmers(Window, mut_matrix, tag_pops, single=single,
                                                frequency_range=frequency_range, row=row, col=col)

            if freq_extract:
                data_freqs[new_sim] = pop_dict_SFS(Window, tag_pops)

        if print_summ:
            # NOTE(review): values are divided by 60 (minutes) while the
            # labels say 's'; the messages are left unchanged so existing
            # logs stay comparable.
            print('mut_matrix time: {} s'.format(time_mut / 60))
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf = time.time()
    time_elapsed = tf - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
def MC_sample_matrix(logfile, min_size= 80, samp= [5,20,10], pops= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/',
                    outlog= 'indy.log', row= 24,col= 4,exclude= False):
    '''
    Launch the mutation counter pipeline on manipulated population assignments.

    For every simulation returned by `process_dir`, read its VCF and
    reference FASTA, build a per-individual mutation-type count matrix
    (`geno_muts_v2`), and sum its rows per population, first for the base
    assignment and then for each tagged assignment returned by
    `ind_assignment_scatter_v1`.

    Parameters:
        min_size, samp, outemp, main_dir: forwarded to
            `ind_assignment_scatter_v1`.
        main_dir, sim_dir: `main_dir + sim_dir` is the directory listed for
            simulations; files are read from `sim_dir + sim + '/'`.
        row, col: shape each population's count vector is reshaped to.
        exclude: if true, `read_exclude()` is called for each simulation.
        logfile, count_dir, dir_launch, muted_dir, outlog: not used in this
            function.
        pops: not read; the name is rebound inside the tag loop.

    Returns:
        dict keyed by simulation name (and simulation name + tag), each value
        a dict with 'counts' (population -> `(row, col)` array), 'Nvars'
        (population -> total count) and 'sizes' (population -> number of
        individuals).

    NOTE(review): simulations are listed under `main_dir + sim_dir` but read
    from `sim_dir` alone; these agree only if `main_dir` resolves to the
    working directory - confirm.
    '''
    sims= process_dir(sims_dir= main_dir+sim_dir)
    print(len(sims))

    # Bookkeeping lists: filled below but never read or returned here.
    tags= []
    sim_extend= []
    chroms= []

    data= {}

    for sim in sims:

        ## chromosome: text after the last 'C' in the first dot-separated
        ## field of the name, with 'c'/'h'/'r' characters stripped from both ends.
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')
        chromosomes= [sim.split('.')[0].split('C')[1]]
        chromosome_groups = [chromosomes]

        # `files` is not used further in this function.
        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        row_info= 6
        header_info= 9
        phased= False
        vcf_dir= sim_dir + sim + '/'
        vcf_file= vcf_dir + sim + '_' + 'chr' + chrom + '.vcf.gz'

        genotype, summary, Names= read_geno_nanumv3(vcf_file, header_info= header_info,phased= phased)

        ## read fasta: the sequence is taken from the second line of the file.
        fasta_file= vcf_dir + 'chr{}_{}.fa.gz'.format(chrom,sim)

        with gzip.open(fasta_file,'r') as f:
            lines= f.readlines()
            lines= [x.decode() for x in lines]

        refseq= lines[1].strip()

        ### window spanning every reported position
        positions= [int(x) for x in summary.POS]
        wstart= int(min(positions))
        wend= int(max(positions))

        Wlen= wend - wstart
        ksize= 3 # odd.
        bases = 'ACGT'
        collapsed= True

        # Keep variants whose (POS - 1) lies inside [wstart, wend].
        genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x])-1 >= wstart and int(summary.POS[x])-1 <= wend]
        Window= genotype[:,genotype_parse]
        subset_summary= summary.loc[genotype_parse,:].reset_index()

        ## variant -> mutation type matrix
        # NOTE(review): MC_sample_matrix_v1 unpacks three values from this
        # same helper (mut_matrix, flag_reverse, flag_remove); one of the two
        # call sites does not match the helper's current return - confirm.
        mut_matrix, flag_reverse= vcf_muts_matrix_v1(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases, collapse= collapsed)

        if flag_reverse:
            # Flip flagged variants; the constant 2 presumably reflects
            # diploid genotype counts - verify against the genotype encoding.
            Window[:,flag_reverse]= 2 - Window[:,flag_reverse]

        # Per-individual mutation-type counts; rows are selected per population below.
        ind_collapsed_mat= geno_muts_v2(np.array(Window), mut_matrix)

        # NOTE(review): MC_sample_matrix_v1 calls this helper with `dir_sim=`,
        # `stepup=` and `indfile=` instead of `main_dir=` - confirm which
        # keyword set the helper accepts.
        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,main_dir= main_dir, min_size= min_size, samp= samp, outemp= outemp)
        #print(tag_list)

        ## counts for no tag sim:
        pop_counts= {
            z: np.sum(ind_collapsed_mat[pop_dict[z],:],axis= 0) for z in pop_dict.keys()
        }

        pop_counts= {
            z:g.reshape(row,col) for z,g in pop_counts.items()
        }

        num_variants= {
            z: np.sum(ind_collapsed_mat[pop_dict[z],:]) for z in pop_dict.keys()
        }

        data[sim]= {
            'counts': pop_counts,
            'Nvars': num_variants,
            'sizes': {z:len(g) for z,g in pop_dict.items()}
        }

        if len(tag_list):
            ###
            sim_extend.append(sim)
            tags.append('')
            chroms.append(chrom)

            ### one entry per tagged (sub-sampled) assignment
            for idx in range(len(tag_list)):

                # NOTE(review): these extend by the whole tag list on every
                # iteration, so the lists grow quadratically; they are not
                # read afterwards in this function.
                sim_extend.extend([sim]*len(tag_list))
                tags.extend(tag_list)
                chroms.extend([chrom]*len(tag_list))

                ##
                tag= tag_list[idx]
                # `ind_file` is not used further; note it indexes `tags`,
                # not `tag_list`.
                ind_file= outemp.format(tags[idx])
                new_sim= sim + tag

                pop_dict= tag_dict[tag]
                pop_sizes= {
                    z: len(g) for z,g in pop_dict.items()
                }

                # Rebinds the `pops` parameter; the value is not read afterwards.
                pops= list(set(pop_dict.keys()))
                ###

                pop_counts= {
                    z: np.sum(ind_collapsed_mat[pop_dict[z],:],axis= 0) for z in pop_dict.keys()
                }

                pop_counts= {
                    z:g.reshape(row,col) for z,g in pop_counts.items()
                }

                num_variants= {
                    z: np.sum(ind_collapsed_mat[pop_dict[z],:]) for z in pop_dict.keys()
                }

                data[new_sim]= {
                    'counts': pop_counts,
                    'Nvars': num_variants,
                    'sizes': {z:len(g) for z,g in pop_dict.items()}
                }

    return data
def count_popKmers(Window, mut_matrix, mut_idx, pop_dict, single=True, frequency_range=[0, 1], segregating=True,
                   scale=1, prop_gen_used=1, return_private=False, return_seg=False, pop_tag='_ss', row=32, col=3):
    '''
    Extract population mutation counts from an _ind x kmer_ mutation matrix.

    For each population, the rows of `Window` listed in `pop_dict[pop]` are
    selected (in sorted order) and counted: a one-row matrix goes through
    `lineAssign`, anything else through `geno_muts_v2`. Column totals of the
    counting result are reshaped to `(row, col)` and multiplied by
    `scale * prop_gen_used`.

    Parameters:
        Window: 2-D genotype array; first axis indexed by the entries of
            `pop_dict`, second axis by variant.
        mut_matrix: passed to `geno_muts_v2`; its first dimension is passed
            to `lineAssign` as `nmuts`.
        mut_idx: passed to `lineAssign`.
        pop_dict: mapping population -> row indices into `Window`.
        single: collapse the selection to one row of per-variant sums.
        segregating: with `single`, reduce the sums to 0/1 presence flags.
        scale, prop_gen_used: multipliers applied to counts and totals.
        return_seg: add the matrix that was counted for each population
            under the 'seg' key of the summary.
        row, col: target shape of each count table; `row * col` has to match
            the number of columns of the counting result.
        frequency_range, return_private, pop_tag: accepted for interface
            compatibility; not used by this function.

    Returns:
        `(pop_summary, PA_dict)`. `pop_summary` has keys 'counts', 'Nvars',
        'sizes' and, if requested, 'seg'. `PA_dict` is always empty in this
        version and is returned so callers can keep unpacking two values.

    Changes from the previous body: the per-population profiling locals fed
    only commented-out prints, and one of them (`ttot / (len(klist) / 1000)`)
    raised ZeroDivisionError for a population with no members; they were
    removed. The unused `pop_ori` derivation, which required population keys
    to be strings, was removed as well.
    '''
    pop_counts = {}
    num_variants = {}
    pop_seg = {}
    PA_dict = {}

    for pop, members in pop_dict.items():
        klist = sorted(members)
        pop_gen = Window[klist, :]

        if single:
            pop_gen = np.sum(pop_gen, axis=0)
            if segregating:
                pop_gen = pop_gen > 0
            pop_gen = np.array(pop_gen, dtype=int).reshape(1, len(pop_gen))

        if pop_gen.shape[0] == 1:
            # NOTE(review): countkmers_cofactor passes `pop_gen[0]` (1-D) to
            # lineAssign, whereas the 2-D one-row matrix is passed here -
            # confirm which form lineAssign expects.
            pop_collapsed_mat = lineAssign(pop_gen, mut_idx, nmuts=mut_matrix.shape[0])
        else:
            pop_collapsed_mat = geno_muts_v2(pop_gen, mut_matrix)

        pop_seg[pop] = pop_gen

        pop_summed = np.sum(pop_collapsed_mat, axis=0)
        pop_counts[pop] = pop_summed.reshape(row, col) * scale * prop_gen_used
        num_variants[pop] = np.sum(pop_collapsed_mat) * scale * prop_gen_used

    pop_summary = {
        'counts': pop_counts,
        'Nvars': num_variants,
        'sizes': {z: len(g) for z, g in pop_dict.items()}
    }

    if return_seg:
        pop_summary['seg'] = pop_seg

    return pop_summary, PA_dict