def get_mut_bias():
    out_df = open(bt.get_path() + '/data/mut_bias.txt', 'w')
    out_df.write('\t'.join(['Sample', 'Strain', 'Treatment', 'Replicate', 'Time', 'm_sample_ma']) + '\n')
    AT_GC = {}
    GC_AT = {}
    to_exclude = bt.mutations_to_exclude()
    gene_pop_matrix = {}
    directory = os.fsencode(bt.get_path() + '/data/pool_pop_seq/rebreseq_annotated')
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith('-100.gd'):
            in_df = open(os.path.join(str(directory, 'utf-8'), filename), 'r')
            pop = filename.split('.')[0]
            if pop not in AT_GC:
                AT_GC[pop] = 0
            if pop not in GC_AT:
                GC_AT[pop] = 0
            to_keep = []
            for line in in_df:
                line_split = line.strip().split()
                if line_split[0] == 'SNP':
                    to_keep.append(int(line_split[2]))
                # RA (read-alignment evidence) lines follow the SNP lines in a .gd file
                if (line_split[0] == 'RA') and (int(line_split[1]) in to_keep):
                    ref = line_split[6]
                    mut = line_split[7]
                    if (ref == 'A' and mut == 'C') or \
                            (ref == 'A' and mut == 'G') or \
                            (ref == 'T' and mut == 'C') or \
                            (ref == 'T' and mut == 'G'):
                        AT_GC[pop] += 1
                    elif (ref == 'C' and mut == 'A') or \
                            (ref == 'C' and mut == 'T') or \
                            (ref == 'G' and mut == 'A') or \
                            (ref == 'G' and mut == 'T'):
                        GC_AT[pop] += 1
                    else:
                        continue
            in_df.close()
    AT_GC_list = list(bt.common_entries(GC_AT, AT_GC))
    AT_GC_dict = {}
    for x in AT_GC_list:
        if (x[1] == 0) and (x[2] == 0):
            continue
        # m = (G+C -> A+T) / (A+T -> G+C), add-one pseudocounts, scaled by the ancestral bias
        AT_GC_dict[x[0]] = round(((x[1] + 1) / (x[2] + 1)) / bt.get_bacillus_mut_bias(), 3)
    for key, value in AT_GC_dict.items():
        print(key, value)
        key_split = re.split(r'[-_]+', key)
        out_df.write('\t'.join([key, key_split[1][2], key_split[1][1],
                                key_split[1][3], key_split[2], str(value)]) + '\n')
    out_df.close()
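# Illustrative sketch, not part of the pipeline: get_mut_bias() reports a pseudocounted
# ratio of G/C->A/T to A/T->G/C counts, normalized by an ancestral bias value. The helper
# below is a hypothetical standalone version that takes plain integers and a float instead
# of relying on the bt module.
def _example_mut_bias_ratio(GC_AT_count, AT_GC_count, ancestral_bias):
    """Return m_sample / m_ancestor with add-one pseudocounts, rounded to 3 decimals."""
    m_sample = (GC_AT_count + 1) / (AT_GC_count + 1)
    return round(m_sample / ancestral_bias, 3)

# e.g. 12 G/C->A/T and 3 A/T->G/C calls with an ancestral bias of 2.0:
# _example_mut_bias_ratio(12, 3, 2.0) == 1.625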
def clean_kaas(strain):
    IN_kaas_path = bt.get_path() + '/data/reference_assemblies_task2/KAAS/' + strain + '_KAAS_result_ko'
    IN_kaas = pd.read_csv(IN_kaas_path, sep='\t',
                          names=['protein_id', 'KO', 'species', 'phylum_genus', 'num'])
    IN_kaas_subset = IN_kaas.loc[IN_kaas['KO'] != 'K_NA']
    OUT_path = bt.get_path() + '/data/reference_assemblies_task2/KAAS/' + strain + '_KAAS_clean.txt'
    OUT = open(OUT_path, 'w')
    header = ['protein_id', 'KEGG_Orthology', 'species', 'phylum', 'genus', 'num']
    OUT.write('\t'.join(header) + '\n')
    count = 0
    for index, row in IN_kaas_subset.iterrows():
        KO_split = row['KO'].split(',')
        phylum_genus_split = row['phylum_genus'].strip().split('-')
        for KO in KO_split:
            if len(phylum_genus_split) == 1:
                # KEGG uses 'Others' for an unknown genus
                genus = 'Others'
                if 'Other ' in phylum_genus_split[0]:
                    # replace spaces so the phylum field stays a single token
                    phylum = phylum_genus_split[0].replace(' ', '-')
                else:
                    phylum = phylum_genus_split[0].strip()
            else:
                genus = phylum_genus_split[1].strip()
                phylum = phylum_genus_split[0].strip()
            row_out = [row['protein_id'], KO, row['species'],
                       phylum, genus, str(int(row['num']))]
            OUT.write('\t'.join(row_out) + '\n')
        count += 1
    OUT.close()
def module_to_KO(strain):
    kaas_directory = bt.get_path() + '/data/reference_assemblies_task2/MAPLE/' + strain + '_MAPLE_result/KAAS'
    data = [['KEGG_Orthology', 'Pathway_ID']]
    bad_chars = '()-+,-'
    rgx = re.compile('[%s]' % bad_chars)
    for filename in os.listdir(kaas_directory):
        if filename.endswith("_matrix.txt"):
            for line in open(os.path.join(kaas_directory, filename), 'r'):
                line_strip_split = line.strip().split()
                if len(line_strip_split) > 2 and 'M' in line_strip_split[0]:
                    if '_' in line_strip_split[0]:
                        pathway = line_strip_split[0].split('_')[0]
                    else:
                        pathway = line_strip_split[0]
                    ko_genes = line_strip_split[2:]
                    for ko_gene in ko_genes:
                        test_set_member = [bad_char for bad_char in bad_chars if bad_char in ko_gene]
                        if len(test_set_member) > 0:
                            ko_gene_clean = rgx.sub('', ko_gene)
                            ko_gene_clean_split = ['K' + e for e in ko_gene_clean.split('K') if e]
                            for split_gene in ko_gene_clean_split:
                                if 'M' in split_gene:
                                    continue
                                data.append([split_gene, pathway])
                        else:
                            if 'K' in ko_gene:
                                data.append([ko_gene, pathway])
    df = pd.DataFrame(data[1:], columns=data[0])
    OUT_path = bt.get_path() + '/data/reference_assemblies_task2/MAPLE/MAPLE_modules/' + strain + '_KO_to_M.txt'
    df.to_csv(OUT_path, sep='\t', index=False)
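# Illustrative sketch, not part of the pipeline: module_to_KO() cleans concatenated KO
# identifiers by deleting the punctuation in bad_chars and re-prefixing each fragment
# with 'K'. The input string below is hypothetical.
def _example_split_ko_field(ko_field='(K00001,K00002)'):
    import re
    rgx = re.compile('[%s]' % '()-+,-')
    cleaned = rgx.sub('', ko_field)                    # 'K00001K00002'
    return ['K' + e for e in cleaned.split('K') if e]  # ['K00001', 'K00002']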
def get_pop_by_gene_matrix():
    # only day-100 samples for now
    to_exclude = bt.mutations_to_exclude()
    gene_pop_matrix = {}
    directory = os.fsencode(bt.get_path() + '/data/pool_pop_seq/rebreseq_annotated')
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith('-100.gd'):
            in_df = open(os.path.join(str(directory, 'utf-8'), filename), 'r')
            pop = filename.split('.')[0]
            for line in in_df:
                line_split = line.strip().split()
                if (line_split[0] not in bt.get_to_keep()) or \
                        (line_split[8].split('=')[1] == 'intergenic') or \
                        (line_split[3] + '_' + line_split[4] in to_exclude):
                    continue
                # clean locus tags
                locus_tag = [s for s in line_split if 'locus_tag=' in s][0].split('=')[1]
                locus_tag_clean = re.sub('[][]', '', locus_tag)
                locus_tag_clean_split = re.findall(r"[\w']+", locus_tag_clean)
                for locus in locus_tag_clean_split:
                    if locus in gene_pop_matrix:
                        if pop in gene_pop_matrix[locus]:
                            gene_pop_matrix[locus][pop] += 1
                        else:
                            gene_pop_matrix[locus][pop] = 1
                    else:
                        gene_pop_matrix[locus] = {}
                        gene_pop_matrix[locus][pop] = 1
            in_df.close()
    df = pd.DataFrame.from_dict(gene_pop_matrix)
    df = df.fillna(0)
    df_out = bt.get_path() + '/data/pool_pop_seq/gene_by_pop.txt'
    df.to_csv(df_out, sep='\t', index=True)
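# Illustrative sketch, not part of the pipeline: get_pop_by_gene_matrix() accumulates a
# dict-of-dicts keyed by locus tag and then by population; pandas.DataFrame.from_dict()
# turns the outer keys into columns and the inner keys into the row index, and fillna(0)
# zeroes the missing gene-population combinations. The names below are made up.
def _example_gene_by_pop_table():
    import pandas as pd
    gene_pop_matrix = {
        'locus_A': {'pop_1': 2, 'pop_2': 1},
        'locus_B': {'pop_1': 1},
    }
    # rows = populations, columns = locus tags, absent counts become 0
    return pd.DataFrame.from_dict(gene_pop_matrix).fillna(0)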
def clean_GBK():
    IN_path = bt.get_path() + '/data/Bacillus_subtilis_NCIB_3610/GCF_002055965.1_ASM205596v1_genomic.gbff'
    genome = SeqIO.parse(IN_path, "genbank")
    df_out = open(bt.get_path() + '/data/gene_table.txt', 'w')
    header = ['LocusTag', 'protein_id', 'Gene', 'Type', 'Size', 'Start', 'Stop', 'GC', 'Sequence', 'Fold_1',
              'Fold_2', 'Fold_2_S', 'Fold_2_V', 'Fold_3', 'Fold_4', 'N', 'S', 'Spore_associated']
    df_out.write('\t'.join(header) + '\n')
    gene_features = ['CDS', 'tRNA', 'rRNA', 'ncRNA', 'tmRNA']
def clean_bPTR():
    directory = os.fsencode(bt.get_path() + '/data/bPTR')
    df_out = open(bt.get_path() + '/data/bPTR_clean.txt', 'w')
    header = ['Sample', 'Strain', 'Treatment', 'Replicate', 'Time', 'bPTR']
    df_out.write('\t'.join(header) + '\n')
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if not filename.endswith('.tsv'):
            continue
        bPTR_path = os.path.join(str(directory, 'utf-8'), filename)
        for i, line in enumerate(open(bPTR_path, 'r')):
            # skip the header line of each bPTR output file
            if i == 0:
                continue
            f_clean = filename.split('.')[0]
            f_clean_split = re.split(r'[-_]+', f_clean)
            out_line = [f_clean, f_clean_split[1][1], f_clean_split[1][0], f_clean_split[1][2],
                        f_clean_split[2], line.split()[-1]]
            df_out.write('\t'.join(out_line) + '\n')
    df_out.close()
def get_json_coverage():
    df_out = open(bt.get_path() + '/data/bacillus_coverage.txt', 'w')
    directory = bt.get_path() + '/data/rebreseq_json/'
    header = ['Sample', 'Strain', 'Treatment', 'Replicate', 'Time', 'CP020102', 'CP020103']
    df_out.write('\t'.join(header) + '\n')
    for filename in os.listdir(directory):
        if not filename.endswith(".json"):
            continue
        sample = filename.split('.')[0]
        pop = sample.split('_')[0]
        strain = pop[1]
        treat = pop[0]
        rep = pop[2]
        time = sample.split('_')[1]
        with open(directory + filename) as f:
            data = json.load(f)
        CP020102_cov = data['references']['reference']['CP020102']['coverage_average']
        CP020103_cov = data['references']['reference']['CP020103']['coverage_average']
        df_out.write('\t'.join([sample, strain, treat, rep, time,
                                str(CP020102_cov), str(CP020103_cov)]) + '\n')
    df_out.close()
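# Illustrative sketch, not part of the pipeline: get_json_coverage() reads the per-replicon
# mean coverage from a breseq summary JSON through the nested key path used above. The
# file name and replicon ID arguments here are hypothetical defaults.
def _example_mean_coverage(json_path='summary.json', replicon='CP020102'):
    import json
    with open(json_path) as f:
        data = json.load(f)
    return data['references']['reference'][replicon]['coverage_average']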
def plot_mut_bias():
    df = pd.read_csv(bt.get_path() + '/data/mut_bias.txt', sep='\t', header='infer', index_col=0)
    strains = ['B', 'S']
    B_1 = df[(df.Strain == 'B') & (df.Treatment == 0)]['m_sample_ma'].tolist()
    B_10 = df[(df.Strain == 'B') & (df.Treatment == 1)]['m_sample_ma'].tolist()
    B_100 = df[(df.Strain == 'B') & (df.Treatment == 2)]['m_sample_ma'].tolist()
    S_1 = df[(df.Strain == 'S') & (df.Treatment == 0)]['m_sample_ma'].tolist()
    S_10 = df[(df.Strain == 'S') & (df.Treatment == 1)]['m_sample_ma'].tolist()
    S_100 = df[(df.Strain == 'S') & (df.Treatment == 2)]['m_sample_ma'].tolist()
    fig, ax = plt.subplots()
    ax.margins(0.05)  # optional: adds 5% padding to the autoscaling
    ax.plot([0] * len(B_1), B_1, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['0'], alpha=0.9)
    ax.plot([0.5] * len(S_1), S_1, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['0'], alpha=0.9, markeredgewidth=2, mfc='none')
    ax.plot([1.5] * len(B_10), B_10, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['1'], alpha=0.9)
    ax.plot([2] * len(S_10), S_10, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['1'], alpha=0.9, markeredgewidth=2, mfc='none')
    ax.plot([3] * len(B_100), B_100, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['2'], alpha=0.9)
    ax.plot([3.5] * len(S_100), S_100, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['2'], alpha=0.9, markeredgewidth=2, mfc='none')
    ax.text(0, 6.8,
            r'$m=\frac{\mathrm{G + C} \rightarrow \mathrm{A + T} }{\mathrm{A + T} \rightarrow \mathrm{G + C} }$',
            fontsize=18)
    labels = [item.get_text() for item in ax.get_xticklabels()]
    labels[1] = ' 1-Day'
    labels[4] = ' 10-Day'
    labels[7] = ' 100-Day'
    plt.axhline(y=1, color='grey', linestyle='--', lw=4)
    plt.ylim([0, 8])
    ax.set_xticklabels(labels, fontsize=18)
    ax.set_ylabel(r'$m_{pop} / m_{Ancestor}$', fontsize=20)
    legend_elements = [Line2D([0], [0], marker='o', color='w', label=r'$\mathrm{Wild-type}$',
                              markerfacecolor='k', markersize=14),
                       Line2D([0], [0], marker='o', color='w', label=r'$\mathrm{\Delta spo0A}$',
                              markerfacecolor='none', markersize=12, markeredgewidth=2,
                              markeredgecolor='k')]
    ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(0.33, 0.8),
              frameon=False, prop={'size': 11})
    fig.savefig(bt.get_path() + '/figs/mut.png', bbox_inches='tight', dpi=600)
    plt.close()
def plot_bPTR():
    df = pd.read_csv(bt.get_path() + '/data/bPTR_clean.txt', sep='\t', header='infer', index_col=0)
    strains = ['B', 'S']
    B_1 = df[(df.Strain == 'B') & (df.Treatment == 0)]['bPTR'].tolist()
    B_10 = df[(df.Strain == 'B') & (df.Treatment == 1)]['bPTR'].tolist()
    B_100 = df[(df.Strain == 'B') & (df.Treatment == 2)]['bPTR'].tolist()
    S_1 = df[(df.Strain == 'S') & (df.Treatment == 0)]['bPTR'].tolist()
    S_10 = df[(df.Strain == 'S') & (df.Treatment == 1)]['bPTR'].tolist()
    S_100 = df[(df.Strain == 'S') & (df.Treatment == 2)]['bPTR'].tolist()
    fig, ax = plt.subplots()
    ax.margins(0.05)  # optional: adds 5% padding to the autoscaling
    ax.plot([0] * len(B_1), B_1, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['0'], alpha=0.9)
    ax.plot([0.5] * len(S_1), S_1, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['0'], alpha=0.9, markeredgewidth=2, mfc='none')
    ax.plot([1.5] * len(B_10), B_10, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['1'], alpha=0.9)
    ax.plot([2] * len(S_10), S_10, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['1'], alpha=0.9, markeredgewidth=2, mfc='none')
    ax.plot([3] * len(B_100), B_100, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['2'], alpha=0.9)
    ax.plot([3.5] * len(S_100), S_100, marker='o', linestyle='',
            ms=14, color=bt.get_colors()['2'], alpha=0.9, markeredgewidth=2, mfc='none')
    labels = [item.get_text() for item in ax.get_xticklabels()]
    labels[1] = ' 1-Day'
    labels[4] = ' 10-Day'
    labels[7] = ' 100-Day'
    ax.set_xticklabels(labels, fontsize=18)
    ax.set_ylabel('Peak-to-trough coverage ratio', fontsize=16)
    legend_elements = [Line2D([0], [0], marker='o', color='w', label=r'$\mathrm{Wild-type}$',
                              markerfacecolor='k', markersize=14),
                       Line2D([0], [0], marker='o', color='w', label=r'$\mathrm{\Delta spo0A}$',
                              markerfacecolor='none', markersize=12, markeredgewidth=2,
                              markeredgecolor='k')]
    ax.legend(handles=legend_elements, loc='upper right', frameon=False, prop={'size': 11})
    fig.savefig(bt.get_path() + '/figs/bPTR.png', bbox_inches='tight', dpi=600)
    plt.close()
def clean_GBK_old():
    IN_path = bt.get_path() + '/data/Bacillus_subtilis_NCIB_3610/GCF_002055965.1_ASM205596v1_genomic.gbff'
    genome = SeqIO.parse(IN_path, "genbank")
    df_out = open(bt.get_path() + '/data/gene_table.txt', 'w')
    header = ['LocusTag', 'protein_id', 'Gene', 'Type', 'Size', 'Start', 'Stop', 'GC', 'Sequence', 'Fold_1',
              'Fold_2', 'Fold_2_S', 'Fold_2_V', 'Fold_3', 'Fold_4', 'N', 'S', 'Spore_associated']
    df_out.write('\t'.join(header) + '\n')
    types_keep = ['CDS', 'rRNA', 'tRNA', 'tmRNA']
    total = []
    total1 = []
    for record in genome:
        if 'chromosome' in record.description:
            descript = record.description
            descript_split = descript.split(' ')
            descript_split_index = descript_split.index('chromosome')
            chrom = descript_split[descript_split_index] + '_' + descript_split[descript_split_index + 1].strip(',')
        elif 'plasmid' in record.description:
            descript = record.description
            descript_split = descript.split(' ')
            descript_split_index = descript_split.index('plasmid')
            chrom = descript_split[descript_split_index] + '_' + descript_split[descript_split_index + 1].strip(',')
        else:
            chrom = 'Genome'
        for f in record.features:
            total.append(f)
            if f.type not in types_keep:
                continue
            total1.append(f)
            if 'gene' in f.qualifiers:
                gene = f.qualifiers["gene"][0]
                gene = gene.replace(" ", "_")
            else:
                gene = 'nan'
            locus_tag = f.qualifiers["locus_tag"][0]
            if 'protein_id' in f.qualifiers:
                protein_id = f.qualifiers["protein_id"][0]
            else:
                protein_id = 'nan'
            if 'product' in f.qualifiers:
                if ('spore' in f.qualifiers['product'][0]) or ('sporulation' in f.qualifiers['product'][0]):
                    spore_associated = True
                else:
                    spore_associated = False
            else:
                spore_associated = False
            size = f.location.end - f.location.start
            seq = str(f.extract(record.seq))
            if f.strand == -1:
                seq = seq[::-1]
            GC = round((seq.count('G') + seq.count('C')) / len(seq), 4)
            if f.type == "CDS":
                start_rf = int(f.qualifiers['codon_start'][0]) - 1
                codons = [seq[i + start_rf: i + start_rf + 3] for i in range(0, len(seq), 3)]
                nuc_list = ['A', 'C', 'G', 'T']
                # keep only complete codons made of unambiguous nucleotides
                codons = [x for x in codons if (len(x) == 3) and (len(np.setdiff1d(list(x), nuc_list)) == 0)]
                fold_1 = 0
                fold_2 = 0
                fold_3 = 0
                fold_4 = 0
                fold_2_V = 0
                fold_2_S = 0
                N = 0
                for codon in codons:
                    codon_list = list(codon)
                    N_codon = 0
                    for g in range(3):
                        fold_count = 0
                        fold_2_S_i = 0
                        fold_2_V_i = 0
                        for nuc in nuc_list:
                            codon_mut_list = list(codon_list)
                            if codon_mut_list[g] == nuc:
                                continue
                            codon_mut_list[g] = nuc
                            codon_mut = "".join(codon_mut_list)
                            S_V = bt.get_ts_tv_dict()[(codon_mut[g], codon[g])]
                            if bt.get_codon_dict()[codon_mut] == bt.get_codon_dict()[codon]:
                                fold_count += 1
                                if S_V == 'S':
                                    fold_2_S_i += 1
                                elif S_V == 'V':
                                    fold_2_V_i += 1
                        if fold_count == 0:
                            fold_1 += 1
                        elif fold_count == 1:
                            fold_2 += 1
                            if fold_2_S_i == 1 and fold_2_V_i == 0:
                                fold_2_S += 1
                            elif fold_2_S_i == 0 and fold_2_V_i == 1:
                                fold_2_V += 1
                            else:
                                continue
                        elif fold_count == 2:
                            fold_3 += 1
                        elif fold_count == 3:
                            fold_4 += 1
                        N_codon += (3 - fold_count) / 3
                    N += N_codon
                # synonymous sites, calculated following
                # http://bioinformatics.cvr.ac.uk/blog/calculating-dnds-for-ngs-datasets/
                S = (3 * len(codons)) - N
                N = round(N, 2)
                S = round(S, 2)
                # Fold_2_S and Fold_2_V follow Comeron (1995)
                out_line = [locus_tag, protein_id, gene, f.type, size, GC, chrom, fold_1, fold_2,
                            fold_2_S, fold_2_V, fold_3, fold_4, N, S, spore_associated]
            else:
                out_line = [locus_tag, protein_id, gene, f.type, size, GC, chrom, 'nan', 'nan',
                            'nan', 'nan', 'nan', 'nan', 'nan', 'nan', spore_associated]
            print(locus_tag)
            df_out.write('\t'.join([str(x) for x in out_line]) + '\n')
    df_out.close()
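# Illustrative sketch, not part of the pipeline: clean_GBK_old() classifies each codon
# position by how many of its three possible point mutations are synonymous (0 synonymous
# changes -> non-degenerate, 3 -> four-fold degenerate), then tallies nonsynonymous sites
# as (3 - fold_count) / 3 per position. The standalone version below substitutes
# Biopython's standard codon table for bt.get_codon_dict().
def _example_position_degeneracy(codon='GGA', position=2):
    from Bio.Data import CodonTable
    forward = CodonTable.unambiguous_dna_by_name["Standard"].forward_table
    synonymous = 0
    for nuc in ['A', 'C', 'G', 'T']:
        if nuc == codon[position]:
            continue
        mutant = codon[:position] + nuc + codon[position + 1:]
        aa_mut = forward.get(mutant)   # None for stop codons
        aa_ref = forward.get(codon)
        if aa_mut is not None and aa_mut == aa_ref:
            synonymous += 1
    return synonymous  # 3: the third position of 'GGA' is four-fold degenerate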