axis.set_ylabel('Fixed mutations') axis.set_xlim([-1, 22]) if population_idx == 5: axis.set_xlabel('Clones') ######################################## # # Now do the plotting (focal first, then rest) # ######################################## theory_times = numpy.arange(0, 121) * 500 gene_names, start_positions, end_positions, promoter_start_positions, promoter_end_positions, gene_sequences, strands = parse_file.parse_gene_list( ) gene_name_position_map = { gene_names[i]: (start_positions[i], end_positions[i]) for i in xrange(0, len(gene_names)) } state_color_map = { parse_file.clade_hmm_states['FB']: '0.7', parse_file.clade_hmm_states['FM']: '#7a0177', parse_file.clade_hmm_states['Fm']: '#f768a1' } for metapopulation_idx in xrange(0, 2): for population_idx in xrange(0, 6):
mutator_axis.text(0.6, 1.5e05, figure_utils.get_panel_label('b'), fontsize=6, fontweight='bold') #### # # Do calculation # #### excluded_types = set(['sv', 'indel']) reference_sequence = parse_file.parse_reference_genome() gene_data = parse_file.parse_gene_list() repeat_data = parse_file.parse_repeat_list() mask_data = parse_file.parse_mask_list() position_gene_map, effective_gene_lengths, substitution_specific_synonymous_fraction = parse_file.create_annotation_map( gene_data, repeat_data, mask_data) #Ltot = 4.4e06 Ltot = len(reference_sequence) - effective_gene_lengths['masked'] sys.stderr.write("Ltot = %d\n" % Ltot) for population_group in ['nonmutators', 'mutators']: if population_group == 'nonmutators': populations = parse_file.complete_nonmutator_lines color = figure_utils.nonmutator_group_color
def _read_significant_multiplicities(path):
    """Parse a comma-separated parallel-genes table.

    The first line is treated as a header and skipped. For every other line
    the first field is used as the gene identifier and the second-to-last
    field is converted to ``float``.

    Returns:
        A ``(gene_ids, values)`` tuple: ``gene_ids`` is the list of
        identifiers in file order, ``values`` maps identifier -> float (the
        last occurrence wins if an identifier is repeated).
    """
    gene_ids = []
    values = {}
    with open(path, "r") as handle:
        next(handle, None)  # header line
        for line in handle:
            items = line.strip().split(",")
            gene_ids.append(items[0])
            values[items[0]] = float(items[-2])
    return gene_ids, values


def plot_within_taxon_paralleliism(taxon, slope_null=1):
    """Draw the gene-multiplicity figure for one taxon and save it as a JPEG.

    For every treatment in ``pt.treatments`` that has a
    ``parallel_genes_<treatment><taxon>.txt`` file under
    ``<pt.get_path()>/data/timecourse_final/`` the function loads the matching
    convergence matrix and draws:

    * top-left: the observed survival curve of gene multiplicity (dashed
      steps) on top of the null expectation (dotted), annotated with the total
      parallelism statistic and its p-value;
    * top-middle: per gene, the summed ``calculate_fixed_weight(L, f_max)``
      divided by the number of mutations, against gene multiplicity;
    * top-right: a three-set Venn diagram of the significant genes of
      treatments ``'0'``, ``'1'`` and ``'2'``;
    * bottom row: one scatter per treatment pair comparing gene
      multiplicities, with significant genes outlined in black.

    The figure is written to ``<pt.get_path()>/figs/multiplicity_<taxon>.jpg``.

    Args:
        taxon: Taxon identifier used to build file names and to look up the
            marker and title in ``pt``.
        slope_null: Currently unused; kept so existing callers keep working.

    Returns:
        None.
    """
    fig = plt.figure(figsize=(12, 8))

    ax_multiplicity = plt.subplot2grid((2, 3), (0, 0), colspan=1)
    ax_mult_freq = plt.subplot2grid((2, 3), (0, 1), colspan=1)
    ax_venn = plt.subplot2grid((2, 3), (0, 2), colspan=1)

    ax_multiplicity.set_xscale('log', base=10)
    ax_multiplicity.set_yscale('log', base=10)
    ax_multiplicity.set_xlabel('Gene multiplicity, ' + r'$m$', fontsize=14)
    ax_multiplicity.set_ylabel('Fraction mutations ' + r'$\geq m$',
                               fontsize=14)
    ax_multiplicity.text(-0.1, 1.07, pt.sub_plot_labels[0], fontsize=18,
                         fontweight='bold', ha='center', va='center',
                         transform=ax_multiplicity.transAxes)
    ax_multiplicity.set_ylim([0.001, 1.1])
    ax_multiplicity.set_xlim([0.07, 130])

    ax_mult_freq.set_xscale('log', base=10)
    ax_mult_freq.set_yscale('log', base=10)
    ax_mult_freq.set_xlabel('Gene multiplicity, ' + r'$m$', fontsize=14)
    ax_mult_freq.set_ylabel(
        'Mean maximum allele frequency, ' + r'$\overline{f}_{max}$',
        fontsize=11)
    ax_mult_freq.text(-0.1, 1.07, pt.sub_plot_labels[1], fontsize=18,
                      fontweight='bold', ha='center', va='center',
                      transform=ax_mult_freq.transAxes)

    ax_venn.axis('off')
    ax_venn.text(-0.1, 1.07, pt.sub_plot_labels[2], fontsize=18,
                 fontweight='bold', ha='center', va='center',
                 transform=ax_venn.transAxes)

    alpha_treatment_dict = {'0': 0.5, '1': 0.5, '2': 0.8}
    # treatment -> list of significant gene ids
    significant_multiplicity_dict = {}
    # gene id -> {treatment: value read from the parallel-genes table}
    significant_multiplicity_values_dict = {}
    # gene id -> {treatment: multiplicity from the parallelism statistics}
    multiplicity_dict = {}
    all_mults = []
    all_freqs = []
    treatments_in_taxon = []
    label_y_axes = [0.3, 0.2, 0.1]

    data_directory = pt.get_path() + '/data/timecourse_final/'

    for treatment_idx, treatment in enumerate(pt.treatments):
        significant_path = data_directory + 'parallel_genes_%s.txt' % (
            treatment + taxon)
        if not os.path.exists(significant_path):
            continue
        treatments_in_taxon.append(treatment)

        gene_ids, values = _read_significant_multiplicities(significant_path)
        significant_multiplicity_dict[treatment] = gene_ids
        for gene_id, value in values.items():
            significant_multiplicity_values_dict.setdefault(
                gene_id, {})[treatment] = value

        populations = [
            treatment + taxon + replicate for replicate in pt.replicates
        ]

        # Load convergence matrix
        convergence_matrix = parse_file.parse_convergence_matrix(
            data_directory + ("%s_convergence_matrix.txt" %
                              (treatment + taxon)))
        gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations, Lmin=100)
        G, pvalue = mutation_spectrum_utils.calculate_total_parallelism(
            gene_parallelism_statistics)
        sys.stdout.write("Total parallelism for %s = %g (p=%g)\n" %
                         (treatment + taxon, G, pvalue))

        # One entry per observed mutation, holding its gene's multiplicity.
        predictors = []
        ax_mult_freqs_x = []
        ax_mult_freqs_y = []
        for gene_name in convergence_matrix.keys():
            m = gene_parallelism_statistics[gene_name]['multiplicity']
            multiplicity_dict.setdefault(gene_name, {})[treatment] = m

            n = 0
            nf_max = 0
            for population in populations:
                for t, L, f, f_max in convergence_matrix[gene_name][
                        'mutations'][population]:
                    predictors.append(m)
                    n += 1
                    nf_max += timecourse_utils.calculate_fixed_weight(
                        L, f_max)

            # nf_max only grows alongside n, so n >= 1 whenever this holds.
            if nf_max > 0:
                ax_mult_freqs_x.append(m)
                ax_mult_freqs_y.append(nf_max / n)

        predictors = np.sort(np.asarray(predictors))

        null_survival_function = mutation_spectrum_utils.NullMultiplicitySurvivalFunction.from_parallelism_statistics(
            gene_parallelism_statistics)

        # default base is 10
        theory_ms = np.logspace(-2, 2, 100)
        theory_survivals = null_survival_function(theory_ms)
        theory_survivals /= theory_survivals[0]

        sys.stderr.write("Done!\n")

        ax_multiplicity.plot(theory_ms, theory_survivals, lw=3,
                             color=pt.get_colors(treatment), alpha=0.8,
                             ls=':', zorder=1)
        ax_multiplicity.plot(
            predictors,
            (len(predictors) - np.arange(0, len(predictors))) * 1.0 /
            len(predictors),
            lw=3, color=pt.get_colors(treatment), alpha=0.8, ls='--',
            label=str(int(10**int(treatment))) + '-day',
            drawstyle='steps', zorder=2)

        if pvalue < 0.001:
            pretty_pvalue = r'$\ll 0.001$'
        else:
            pretty_pvalue = '=' + str(round(pvalue, 4))

        g_score_p_label = r'$\Delta \ell_{{{}}}=$'.format(
            str(10**int(treatment))) + str(round(
                G, 3)) + ', ' + r'$P$' + pretty_pvalue

        ax_multiplicity.text(0.26, label_y_axes[treatment_idx],
                             g_score_p_label, fontsize=7, ha='center',
                             va='center', color='k',
                             transform=ax_multiplicity.transAxes)

        ax_mult_freq.scatter(ax_mult_freqs_x, ax_mult_freqs_y,
                             color=pt.get_colors(treatment),
                             edgecolors=pt.get_colors(treatment),
                             marker=pt.plot_species_marker(taxon),
                             alpha=alpha_treatment_dict[treatment])

        all_mults.extend(ax_mult_freqs_x)
        all_freqs.extend(ax_mult_freqs_y)

    # make treatment pairs
    treatments_in_taxon.sort(key=float)
    ccv = ColorConverter()

    for i in range(0, len(treatments_in_taxon)):
        for j in range(i + 1, len(treatments_in_taxon)):
            treatment_i = treatments_in_taxon[i]
            treatment_j = treatments_in_taxon[j]

            ax_mult_i_j = plt.subplot2grid((2, 3), (1, i + j - 1), colspan=1)
            ax_mult_i_j.set_xscale('log', base=10)
            ax_mult_i_j.set_yscale('log', base=10)
            ax_mult_i_j.set_xlabel(str(10**int(treatment_i)) +
                                   '-day gene multiplicity, ' + r'$m$',
                                   fontsize=14)
            ax_mult_i_j.set_ylabel(str(10**int(treatment_j)) +
                                   '-day gene multiplicity, ' + r'$m$',
                                   fontsize=14)
            ax_mult_i_j.plot([0.05, 200], [0.05, 200], lw=3, c='grey',
                             ls='--', zorder=1)
            ax_mult_i_j.set_xlim([0.05, 200])
            ax_mult_i_j.set_ylim([0.05, 200])
            ax_mult_i_j.text(-0.1, 1.07, pt.sub_plot_labels[2 + i + j],
                             fontsize=18, fontweight='bold', ha='center',
                             va='center', transform=ax_mult_i_j.transAxes)

            # Genes hit in both treatments; a gene missing from one
            # treatment's matrix counts as multiplicity 0 and is dropped.
            multiplicity_pair = []
            for gene_name in sorted(multiplicity_dict):
                m_i = multiplicity_dict[gene_name].get(treatment_i, 0)
                m_j = multiplicity_dict[gene_name].get(treatment_j, 0)
                if (m_i > 0) and (m_j > 0):
                    multiplicity_pair.append((m_i, m_j))

            significant_multiplicity_pair = [
                (significant_multiplicity_values_dict[gene_name][treatment_i],
                 significant_multiplicity_values_dict[gene_name][treatment_j])
                for gene_name in sorted(significant_multiplicity_values_dict)
                if (treatment_i in
                    significant_multiplicity_values_dict[gene_name]) and
                (treatment_j in
                 significant_multiplicity_values_dict[gene_name])
            ]

            # get mean colors
            color_1 = np.array(ccv.to_rgb(pt.get_colors(treatment_i)))
            color_2 = np.array(ccv.to_rgb(pt.get_colors(treatment_j)))
            mix_color = 0.7 * (color_1 + color_2)
            mix_color = np.min([mix_color, [1.0, 1.0, 1.0]], 0)

            if (treatment_i == '0') and (treatment_j == '1'):
                mix_color = 'gold'

            mult_i = [x[0] for x in multiplicity_pair]
            mult_j = [x[1] for x in multiplicity_pair]
            ax_mult_i_j.scatter(mult_i, mult_j,
                                marker=pt.plot_species_marker(taxon),
                                facecolors=mix_color, edgecolors='none',
                                alpha=0.8, s=90, zorder=2)

            mult_significant_i = [x[0] for x in significant_multiplicity_pair]
            mult_significant_j = [x[1] for x in significant_multiplicity_pair]
            ax_mult_i_j.scatter(mult_significant_i, mult_significant_j,
                                marker=pt.plot_species_marker(taxon),
                                facecolors=mix_color, edgecolors='k', lw=1.5,
                                alpha=0.7, s=90, zorder=3)

            mult_ij = mult_significant_i + mult_significant_j + mult_i + mult_j
            # With no shared genes, keep the default [0.05, 200] limits.
            if mult_ij:
                ax_mult_i_j.set_xlim([min(mult_ij) * 0.5, max(mult_ij) * 1.5])
                ax_mult_i_j.set_ylim([min(mult_ij) * 0.5, max(mult_ij) * 1.5])

    # venn3 computes the seven exclusive region sizes itself when handed the
    # three sets; a treatment without a parallel-genes file is an empty set.
    venn_sets = [
        set(significant_multiplicity_dict.get(venn_treatment, ()))
        for venn_treatment in ('0', '1', '2')
    ]
    if any(venn_sets):
        venn3(subsets=venn_sets, ax=ax_venn, set_labels=('', '', ''),
              set_colors=(pt.get_colors('0'), pt.get_colors('1'),
                          pt.get_colors('2')))
        venn3_circles(subsets=venn_sets, ax=ax_venn, linestyle='dashed')

    if all_mults:
        ax_mult_freq.set_xlim([min(all_mults) * 0.5, max(all_mults) * 1.5])
        ax_mult_freq.set_ylim([min(all_freqs) * 0.5, max(all_freqs) * 1.5])

    fig.suptitle(pt.latex_dict[taxon], fontsize=30)
    fig.subplots_adjust(wspace=0.3)  #hspace=0.3, wspace=0.5
    fig_name = pt.get_path() + "/figs/multiplicity_%s.jpg" % taxon
    fig.savefig(fig_name, format='jpg', bbox_inches="tight", pad_inches=0.4,
                dpi=600)
    plt.close()
# mutual information / joing entropy joint_entropy = stats.entropy(array_1,array_2) standardized_gene_overlap = {} for taxon in pt.taxa: #if taxon == 'J': # continue gene_dict = {} N_significant_genes_dict = {} gene_data = parse_file.parse_gene_list(taxon) gene_names, gene_start_positions, gene_end_positions, promoter_start_positions, promoter_end_positions, gene_sequences, strands, genes, features, protein_ids = gene_data #locus_tag_to_gene_dict = {} #for gene_name_idx, gene_name in enumerate(gene_names): # gene = genes[gene_name_idx] # if gene == '': # continue # locus_tag_to_gene_dict[gene_name] = genes[gene_name_idx] if taxon == 'J': treatments_convergence = ['0', '1'] else: treatments_convergence = ['0', '1', '2']
import phik np.random.seed(123456789) # to-do: re-do analysis for enriched genes in *either* treatment you're comparing # read in nonsignificant genes and add those conts in.. permutations_divergence = 10000 treatment_pairs = [['0','1'],['0','2'],['1','2']] gene_data_B = parse_file.parse_gene_list('B') gene_names_B, gene_start_positions_B, gene_end_positions_B, promoter_start_positions_B, promoter_end_positions_B, gene_sequences_B, strands_B, genes_B, features_B, protein_ids_B = gene_data_B gene_name_dict = dict(zip(gene_names_B, genes_B )) protein_id_dict = dict(zip(gene_names_B, protein_ids_B )) significant_multiplicity_dict = {} significant_n_mut_dict = {} gene_size_dict = {} gene_mean_size_dict = {} for taxon in pt.taxa: significant_multiplicity_dict[taxon] = {} significant_n_mut_dict[taxon] = {} gene_size_dict[taxon] = {} gene_data = parse_file.parse_gene_list(taxon)
def _split_series(field, cast):
    """Convert a space-separated string into a numpy array using ``cast``."""
    return np.asarray([cast(x) for x in field.split(' ')])


def process_output():
    """Filter SNP timecourses and write the retained mutations per sample.

    For every ``treatment``/``strain``/``rep`` combination this reads
    ``<sample>_snp_timecourse.bz`` and ``<sample>_depth_timecourse.bz`` from
    under ``pt.get_path()``, filters the SNP records and writes the survivors
    to ``data/timecourse_prelim_poly/<sample>_snp_final.txt`` as a
    tab-separated table with the columns
    ``Contig, Locus_tag, Site, Ancestral, Derived, Final_freq``.

    A SNP record is comma-separated; fields 3, 4 and 5 are space-separated
    integer series (``t_pm``, ``A_pm``, ``D_pm``). A record is kept when all
    of the following hold:

    * after dropping timepoints with ``D_pm < 5``, at least five timepoints
      remain;
    * elements 2 and 4 of the best ``get_likelihood`` result are non-zero;
    * none of the ``get_test_statistics`` outputs is NaN;
    * the frequency ``A_pm / D_pm`` is not above 0.9 at both the first and
      the last timepoint;
    * the frequency at the reference timepoint exceeds 0.05 (the
      second-to-last timepoint for samples ``0C1`` and ``0C4``, the last one
      otherwise).

    Raises:
        ValueError: If a depth timecourse file contains no lines, so that no
            median-depth series is available for the sample.
    """
    header = [
        'Contig', 'Locus_tag', 'Site', 'Ancestral', 'Derived', 'Final_freq'
    ]

    for strain in strains:
        parse_gene_list = pf.parse_gene_list(taxon=strain)
        # Convert the gene coordinates once per strain rather than once per
        # gene for every SNP.
        gene_spans = [
            (gene_name, int(start), int(stop))
            for gene_name, start, stop in zip(
                parse_gene_list[0], parse_gene_list[1], parse_gene_list[2])
        ]

        for treatment in treatments:
            for rep in reps:
                print('%s%s%s' % (treatment, strain, rep))
                sample = '%s%s%s' % (treatment, strain, rep)

                snp_timecourse_path = pt.get_path(
                ) + '/data/timecourse_snp/' + '%s%s%s_snp_timecourse.bz' % (
                    treatment, strain, rep)
                depth_timecourse_path = pt.get_path(
                ) + '/data/timecourse_depth/' + '%s%s%s_depth_timecourse.bz' % (
                    treatment, strain, rep)

                # Only the last line of the depth file determines the median
                # depth series that is used below.
                last_depth_line = None
                with bz2.open(depth_timecourse_path, "rt") as depth_file:
                    for last_depth_line in depth_file:
                        pass
                if last_depth_line is None:
                    # Otherwise the series of the previous sample would be
                    # silently reused.
                    raise ValueError('Empty depth timecourse file: %s' %
                                     depth_timecourse_path)
                depth_split = [x.strip() for x in last_depth_line.split(',')]
                D_pt_median = _split_series(depth_split[-1], float)

                if (sample == '0C1') or (sample == '0C4'):
                    last_timepoint = -2
                else:
                    last_timepoint = -1

                file_mutations = []
                with bz2.open(snp_timecourse_path, "rt") as snp_file:
                    for snp in snp_file:
                        snp_split = [x.strip() for x in snp.split(',')]
                        t_pm = _split_series(snp_split[3], int)
                        A_pm = _split_series(snp_split[4], int)
                        D_pm = _split_series(snp_split[5], int)

                        if len(t_pm) == 1:
                            continue

                        # remove D_pmt < 5; np.delete returns copies, so the
                        # per-sample D_pt_median is left untouched
                        low_depth = np.flatnonzero(D_pm < 5)
                        t_pm = np.delete(t_pm, low_depth)
                        A_pm = np.delete(A_pm, low_depth)
                        D_pt_median_kept = np.delete(D_pt_median, low_depth)
                        D_pm = np.delete(D_pm, low_depth)

                        d_pm = D_pm / D_pt_median_kept

                        # don't look at trajectories with fewer than four
                        # timepoints after the first one
                        if len(t_pm) < 5:
                            continue

                        l_list = [
                            get_likelihood(t_, t=t_pm, d_pm=d_pm)
                            for t_ in t_pm[1:-2]
                        ]
                        max_l = sorted(l_list, key=lambda x: x[-1])[-1]

                        # NOTE: the likelihood-ratio threshold that used these
                        # values is disabled, but this filter is kept so the
                        # set of reported mutations does not change.
                        if (max_l[4] == 0) or (max_l[2] == 0):
                            continue

                        C_star_mut, I_mut, T_mut = get_test_statistics(
                            t_pm, A_pm, D_pm)
                        if np.isnan(np.sum([C_star_mut, I_mut, T_mut])):
                            continue

                        f_pm = A_pm / D_pm
                        if (f_pm[0] > 0.9) and (f_pm[-1] > 0.9):
                            continue

                        if f_pm[last_timepoint] <= 0.05:
                            continue

                        allele_split = snp_split[2].split('->')
                        anc = allele_split[0]
                        der = allele_split[1]
                        site = int(snp_split[1])

                        genes_names_site_merged = ",".join(
                            gene_name for gene_name, start, stop in gene_spans
                            if start <= site <= stop)

                        # NOTE(review): the threshold above uses
                        # f_pm[last_timepoint] but the reported frequency is
                        # always f_pm[-1] -- confirm this is intended for
                        # samples 0C1 and 0C4.
                        file_mutations.append([
                            snp_split[0], genes_names_site_merged,
                            snp_split[1], anc, der,
                            str(f_pm[-1])
                        ])

                # out file
                mutation_filename = '%s%s%s_snp_final.txt' % (treatment,
                                                              strain, rep)
                mutation_path = (pt.get_path() +
                                 '/data/timecourse_prelim_poly/' +
                                 mutation_filename)
                with open(mutation_path, 'w') as mutation_file:
                    mutation_file.write('\t'.join(header) + '\n')
                    for mutation in file_mutations:
                        mutation_file.write("\t".join(mutation))
                        mutation_file.write("\n")