def likelihood_subsample(taxon, treatment, ntot_subsample=50, fmax_cutoff=0.8, fmin_cutoff=0.0, subsamples=10000):
    """Summarise repeated subsampled total-parallelism draws for one group.

    Loads the convergence matrix of ``treatment + taxon``, computes the
    gene parallelism statistics with ``fmax_min=fmax_cutoff`` and calls
    ``calculate_subsampled_total_parallelism`` ``subsamples`` times.

    Parameters:
        taxon: taxon label; appended to ``treatment`` to name the group.
        treatment: treatment label.
        ntot_subsample: forwarded to the subsampling routine (the original
            comment describes it as the minimum number of mutations).
        fmax_cutoff: forwarded as ``fmax_min`` to the statistics routine.
        fmin_cutoff: accepted for interface compatibility; not used here.
        subsamples: number of subsampled draws.

    Returns:
        dict with keys ``'G_mean'`` (mean of the draws), ``'G_025'`` and
        ``'G_975'`` (the sorted draws at indices ``int(0.025 * subsamples)``
        and ``int(0.975 * subsamples)``).
    """
    group = treatment + taxon

    matrix_path = pt.get_path() + '/data/timecourse_final/' + ("%s_convergence_matrix.txt" % group)
    convergence_matrix = parse_file.parse_convergence_matrix(matrix_path)

    populations = [group + replicate for replicate in pt.replicates]
    gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
        convergence_matrix, populations, fmax_min=fmax_cutoff)

    # Draw all subsamples and keep them in ascending order so the interval
    # bounds can be read off by index.
    sorted_draws = sorted(
        mutation_spectrum_utils.calculate_subsampled_total_parallelism(
            gene_parallelism_statistics, ntot_subsample=ntot_subsample)
        for _ in range(subsamples))

    return {
        'G_mean': np.mean(sorted_draws),
        'G_025': sorted_draws[int(0.025 * subsamples)],
        'G_975': sorted_draws[int(0.975 * subsamples)],
    }
def parse_reference_genome(taxon):
    """Return the reference sequence of ``taxon`` as one upper-case string.

    The file path is ``pt.get_path() + '/' + pt.get_ref_gbff_dict(taxon)``.
    Files ending in ``gbk`` or ``gbff`` are read as GenBank flat files: the
    sequence is taken from the numbered lines between each ``ORIGIN`` marker
    and the ``//`` record terminator. Any other file is read as FASTA, where
    the first line and every later line starting with ``>`` are treated as
    headers and skipped.

    Parameters:
        taxon: key understood by ``pt.get_ref_gbff_dict``.

    Returns:
        str: all sequence chunks concatenated and upper-cased (records of a
        multi-record file are joined in file order).
    """
    filename = pt.get_path() + '/' + pt.get_ref_gbff_dict(taxon)
    reference_sequences = []

    with open(filename, "r") as file:
        if filename.endswith(('gbk', 'gbff')):
            # GenBank flat file
            origin_reached = False
            for line in file:
                if line.startswith("ORIGIN"):
                    origin_reached = True
                elif line.startswith("//"):
                    # End of record: stop collecting so that annotation
                    # lines of the next record are never read as sequence.
                    origin_reached = False
                if origin_reached:
                    items = line.split()
                    # Sequence lines look like "<offset> chunk chunk ...";
                    # blank lines yield no items and are ignored.
                    if items and items[0].isdigit():
                        reference_sequences.extend(items[1:])
        else:
            # FASTA file
            file.readline()  # header
            for line in file:
                if line.startswith(">"):
                    continue  # header of a further record
                reference_sequences.append(line.strip())

    return "".join(reference_sequences).upper()
def parse_simulation_output():
    """Print active and dormant clone totals for each saved simulation entry.

    Unpickles ``<pt.get_path()>/data/simulations/test.dat``, which is
    expected to be a dict of dicts. For every outer value, the
    ``'n_clone_active'`` and ``'n_clone_dormant'`` fields of its inner
    entries are summed and the two totals are printed on one line.

    Returns:
        None.
    """
    saved_data_file = '%s/data/simulations/test.dat' % (pt.get_path())

    # NOTE: pickle.load can execute arbitrary code; only use this on
    # simulation output produced locally.
    with open(saved_data_file, "rb") as handle:
        sampled_timepoints = pickle.load(handle)

    for value in sampled_timepoints.values():
        N = sum(entry['n_clone_active'] for entry in value.values())
        M = sum(entry['n_clone_dormant'] for entry in value.values())
        print(N, M)
def calculate_likelihood_ratio_fmax(taxon, treatment, ntot_subsample=50, fmax_partition=0.8, subsamples=10000):
    """Load parallelism statistics partitioned at ``fmax_partition``.

    Reads the convergence matrix of ``treatment + taxon`` and computes the
    gene parallelism statistics with ``fmax_min=fmax_partition``.

    Parameters:
        taxon: taxon label; appended to ``treatment`` to name the group.
        treatment: treatment label.
        ntot_subsample: not used by the code visible here.
        fmax_partition: forwarded as ``fmax_min`` to the statistics routine.
        subsamples: not used by the code visible here.

    Returns:
        None. NOTE(review): the body stops after initialising an empty
        result list, so the function looks unfinished -- confirm.
    """
    convergence_matrix = parse_file.parse_convergence_matrix(
        pt.get_path() + '/data/timecourse_final/' +
        ("%s_convergence_matrix.txt" % (treatment + taxon)))

    populations = [
        treatment + taxon + replicate for replicate in pt.replicates
    ]

    # The threshold is this function's own ``fmax_partition`` parameter.
    gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
        convergence_matrix, populations, fmax_min=fmax_partition)

    G_subsample_list = []
def parse_well_mixed_state_timecourse(population):
    """Read the well-mixed state timecourse file of ``population``.

    The file is comma-separated. Row 1 holds the sampling times, rows 2-5
    hold per-time summary counts (unborn, extinct, fixed, polymorphic) that
    are not returned, and every remaining row is one state trajectory.

    Parameters:
        population: population label used to build the file name.

    Returns:
        tuple ``(times, states)`` where ``times`` is a float array and
        ``states`` is a list of float arrays, one per remaining row.
    """
    haplotype_filename = pt.get_path() + '/data/timecourse_final/' + ('%s_well_mixed_state_timecourse.txt' % population)

    with open(haplotype_filename, "r") as file:
        times = numpy.array([float(item) for item in file.readline().split(",")])

        # Skip the four summary rows; their values are not part of the result.
        for _ in range(4):
            file.readline()

        # Blank lines (e.g. a trailing newline) carry no trajectory.
        states = [
            numpy.array([float(item) for item in line.split(",")])
            for line in file
            if line.strip()
        ]

    return times, states
def parse_annotated_timecourse(population, only_passed=True, min_coverage=5):
    """Parse the annotated mutation timecourse file of ``population``.

    File layout (comma-separated): a header line whose columns 16, 18, ...
    contain ``<label>:<time>``; a depth line whose columns 17, 19, ... hold
    the average depth at each time; then one line per mutation with 16
    annotation columns followed by (alt count, depth) pairs per timepoint.
    Timepoints below 1,000,000 are returned as population samples; those
    above 1,000,000 are returned as clone samples with 1,000,000 subtracted.

    Parameters:
        population: population label used to build the file name.
        only_passed: if True, keep only mutations whose column 15 is
            ``PASS``.
        min_coverage: alt counts and depths are zeroed at timepoints where
            either the mutation's own depth or the average depth is below
            this value.

    Returns:
        tuple ``(mutations, depth_tuple)``. ``mutations`` is a list sorted
        by location (ties keep file order) of tuples
        ``(location, gene_name, allele, var_type, codon, position_in_codon,
        fold_count, test_statistic, pvalue, cutoff_idx, depth_fold_change,
        depth_change_pvalue, pop_times, pop_alts, pop_depths, clone_times,
        clone_alts, clone_depths)``. ``depth_tuple`` is
        ``(population_avg_depth_times, population_avg_depths,
        clone_avg_depth_times, clone_avg_depths)``. An empty list is
        returned when no mutation is kept.
    """
    mutations = []
    timecourse_filename = pt.get_path() + '/data/timecourse_final/' + ("%s_annotated_timecourse.txt" % population)

    with open(timecourse_filename, "r") as file:
        # Header line: sampling times.
        items = file.readline().strip().split(",")
        times = numpy.array(
            [int(items[i].split(":")[1]) for i in range(16, len(items), 2)])

        # Depth line: average depth per timepoint.
        items = file.readline().strip().split(",")
        avg_depths = numpy.array(
            [float(items[i + 1]) for i in range(16, len(items), 2)])

        is_population = times < 1000000
        is_clone = times > 1000000

        population_avg_depth_times = times[is_population]
        population_avg_depths = avg_depths[is_population]
        clone_avg_depth_times = times[is_clone] - 1000000
        clone_avg_depths = avg_depths[is_clone]

        for line in file:
            if not line.strip():
                continue  # tolerate blank lines

            items = line.strip().split(",")
            location = int(items[0])
            gene_name = items[1].strip()
            allele = items[2].strip()
            var_type = items[3].strip()
            codon = items[4].strip()

            # These two fields stay strings when marked 'None'/'unknown'.
            position_in_codon = items[5].strip()
            if position_in_codon not in ('None', 'unknown'):
                position_in_codon = int(position_in_codon)
            fold_count = items[6].strip()
            if fold_count not in ('None', 'unknown'):
                fold_count = int(fold_count)

            test_statistic = float(items[7])
            pvalue = float(items[8])
            cutoff_idx = int(items[9])
            depth_fold_change = float(items[10])
            depth_change_pvalue = float(items[11])
            passed = items[15].strip() == 'PASS'

            alts = numpy.array(
                [int(float(items[i])) for i in range(16, len(items), 2)])
            depths = numpy.array(
                [int(float(items[i + 1])) for i in range(16, len(items), 2)])

            # zero out timepoints with individual coverage lower than some threshold
            covered = (depths >= min_coverage) * (avg_depths >= min_coverage)
            alts *= covered
            depths *= covered

            if passed or (not only_passed):
                mutations.append(
                    (location, gene_name, allele, var_type, codon,
                     position_in_codon, fold_count, test_statistic, pvalue,
                     cutoff_idx, depth_fold_change, depth_change_pvalue,
                     times[is_population], alts[is_population],
                     depths[is_population], times[is_clone] - 1000000,
                     alts[is_clone], depths[is_clone]))

    # Sort by position only: comparing whole tuples would fall through to
    # mixed int/str fields and numpy arrays on ties, and unzipping a sorted
    # zip fails when the list is empty.
    mutations.sort(key=lambda mutation: mutation[0])

    return mutations, (population_avg_depth_times, population_avg_depths, clone_avg_depth_times, clone_avg_depths)
def parse_gene_list(taxon, reference_sequence=None):
    """Extract gene-like features from the GenBank reference of ``taxon``.

    Features of type CDS, tRNA, rRNA, ncRNA and tmRNA are named by their
    ``locus_tag``; ``regulatory`` features are named
    ``<regulatory_class>_<running index>``. Features carrying a ``pseudo``
    qualifier and all other feature types are skipped. Compound ``join``
    locations are split into ``<locus_tag>_1``, ``_2`` (and ``_3``) pieces;
    only two-part joins and three-part joins on the forward strand are
    supported, other joins are dropped. CDS pieces whose length is not a
    multiple of three are reported on stdout and skipped.

    Parameters:
        taxon: key understood by ``pt.get_ref_gbff_dict``.
        reference_sequence: ignored; the sequence of each parsed record is
            used instead. Kept for interface compatibility.

    Returns:
        tuple ``(gene_names, start_positions, end_positions,
        promoter_start_positions, promoter_end_positions, gene_sequences,
        strands, genes, features, protein_ids)`` ordered by start position.
        The four position entries are numpy arrays, the rest are lists.
        All entries are empty when no feature qualifies.
    """
    filename = pt.get_path() + '/' + pt.get_ref_gbff_dict(taxon)
    gene_features = ['CDS', 'tRNA', 'rRNA', 'ncRNA', 'tmRNA']

    # One tuple per kept feature piece, in the order of the returned columns.
    records = []
    count_riboswitch = 0

    for rec in SeqIO.parse(filename, "genbank"):
        reference_sequence = rec.seq

        for feat in rec.features:
            if 'pseudo' in feat.qualifiers:
                continue

            if feat.type in gene_features:
                locus_tag = feat.qualifiers['locus_tag'][0]
            elif feat.type == "regulatory":
                locus_tag = feat.qualifiers["regulatory_class"][0] + '_' + str(count_riboswitch)
                count_riboswitch += 1
            else:
                # includes "source" and "gene" features
                continue

            location_str = str(feat.location)
            locations = re.findall(r"[\w']+", location_str)

            # for frameshifts, split each CDS separately and merge later
            split_list = []
            if 'join' in locations:
                # Strand symbols in order of appearance. A plain character
                # scan is used because '+' on its own is not a valid regular
                # expression.
                strand_symbols = [char for char in location_str if char in '+-']

                if len(strand_symbols) == 2:
                    piece_symbols = strand_symbols
                elif strand_symbols.count('+') == 3:
                    piece_symbols = ['+', '+', '+']
                else:
                    # unsupported join layout: feature is dropped
                    piece_symbols = []

                # locations[0] is the token 'join'; coordinates follow in
                # (start, stop) pairs.
                for piece_idx, piece_symbol in enumerate(piece_symbols):
                    split_list.append([
                        locus_tag + '_' + str(piece_idx + 1),
                        int(locations[1 + 2 * piece_idx]),
                        int(locations[2 + 2 * piece_idx]),
                        piece_symbol,
                    ])
            else:
                # Simple location "[start:stop](strand)".
                split_list.append([
                    locus_tag,
                    int(locations[0]),
                    int(locations[1]),
                    location_str[-2],
                ])

            gene = feat.qualifiers.get('gene', [""])[0]
            protein_id = feat.qualifiers.get('protein_id', [""])[0]

            for piece_tag, start, stop, strand_symbol in split_list:
                if feat.type == 'CDS':
                    gene_sequence = str(reference_sequence[start:stop])
                else:
                    gene_sequence = ""

                if strand_symbol == '+':
                    # by arbitrary definition, we treat the 100bp upstream as promoters
                    promoter_start = start - 100
                    promoter_end = start - 1
                    strand = 'forward'
                else:
                    promoter_start = stop + 1
                    promoter_end = stop + 100
                    strand = 'reverse'

                if gene_sequence != "" and len(gene_sequence) % 3 != 0:
                    print(piece_tag, start, "Not a multiple of 3")
                    continue

                # dont need to check if gene names are unique because we're
                # using locus tags
                records.append(
                    (piece_tag, start, stop, promoter_start, promoter_end,
                     gene_sequence, strand, gene, feat.type, protein_id))

    # Stable sort by start position.
    records.sort(key=lambda record: record[1])

    if records:
        columns = [list(column) for column in zip(*records)]
    else:
        columns = [[] for _ in range(10)]

    (gene_names, start_positions, end_positions, promoter_start_positions,
     promoter_end_positions, gene_sequences, strands, genes, features,
     protein_ids) = columns

    return gene_names, numpy.array(start_positions), numpy.array(end_positions), numpy.array(promoter_start_positions), numpy.array(promoter_end_positions), gene_sequences, strands, genes, features, protein_ids
maple_annotation_dict = {} kegg_maple_map_all_taxa = {} treatment_count_dict = {} for treatment in treatments: treatment_count_dict[treatment] = 0 for taxon in taxa: protein_id_kegg_dict = {} protein_id_kegg = open( pt.get_path() + '/data/reference_assemblies_task2/MAPLE/%s_MAPLE_result/query.fst.ko' % taxon, 'r') # make protein ID => KEGG map for i, line in enumerate(protein_id_kegg): line = line.strip() items = line.split("\t") protein_id = items[0] if items[1] != 'K_NA': protein_id_kegg_dict[items[0]] = items[1] significant_genes_path = pt.get_path( ) + '/data/timecourse_final/parallel_genes_%s.txt' % (treatment + taxon) if os.path.exists(significant_genes_path) == False: continue
import scipy.stats as stats
import parse_file
import timecourse_utils
import mutation_spectrum_utils

# Fixed seed so any random draws made later in the script are repeatable.
np.random.seed(123456789)

treatments = pt.treatments
replicates = pt.replicates

# Ten evenly spaced shades sampled from the 'Blues' and 'Reds' colormaps.
color_range = np.linspace(0.0, 1.0, 10)
rgb_blue = cm.get_cmap('Blues')(color_range)
rgb_red = cm.get_cmap('Reds')(color_range)

# Tab-separated sporulation assay table in long format; only rows with
# Time_hours <= 400 are kept.
path_IN = pt.get_path() + '/data/spore_assay/Sporulation_170912_long.txt'
IN = pd.read_csv(path_IN, sep='\t')
IN = IN.loc[IN['Time_hours'] <= 400]

# Day 100: rows for populations 0B1 and 2B1.
IN_0B1_100 = IN.loc[(IN['Pop'] == '0B1') & (IN['Day'] == 100)]
IN_2B1_100 = IN.loc[(IN['Pop'] == '2B1') & (IN['Day'] == 100)]

# Mean and standard deviation of 'Vegetative_percent' at each 'Time_hours'
# value, returned as data frames (reset_index).
IN_mean_0B1_100 = IN_0B1_100['Vegetative_percent'].groupby(
    IN_0B1_100['Time_hours']).mean().reset_index()
IN_mean_2B1_100 = IN_2B1_100['Vegetative_percent'].groupby(
    IN_2B1_100['Time_hours']).mean().reset_index()
IN_std_0B1_100 = IN_0B1_100['Vegetative_percent'].groupby(
    IN_0B1_100['Time_hours']).std().reset_index()
IN_std_2B1_100 = IN_2B1_100['Vegetative_percent'].groupby(
    IN_2B1_100['Time_hours']).std().reset_index()

# Day 500
IN_0B1_500 = IN.loc[(IN['Pop'] == '0B1') & (IN['Day'] == 500)]
def plot_mutation_trajectory_taxon(taxon):
    """Plot mutation-accumulation trajectories for every treatment of a taxon.

    One column is drawn per treatment and five rows per column: number of
    mutations M(t), ratio M(t)/M(t-1), fixed mutations, median mutation
    frequency and number of mutations at time t. Each non-ignored replicate
    population is one line; the treatment average is overlaid as a dashed
    black line in the first two rows. The figure is saved to
    ``<pt.get_path()>/figs/rate_<taxon>.pdf``.

    Parameters:
        taxon: taxon label. ``'J'`` uses treatments ``'0'`` and ``'2'``
            only; every other taxon uses ``pt.treatments``.

    Returns:
        None.
    """
    if taxon == 'J':
        treatments = ['0', '2']
        sub_plot_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
        sub_plot_count_step = 2
        dim = (6, 15)
    else:
        treatments = pt.treatments
        sub_plot_labels = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o'
        ]
        sub_plot_count_step = 3
        dim = (10, 15)

    sys.stderr.write("Loading mutation data...\n")

    mutation_trajectories = {}
    fixed_mutation_trajectories = {}
    delta_mutation_trajectories = {}
    median_trajectories = {}
    n_muts_trajectories = {}

    for treatment in treatments:
        for replicate in pt.replicates:
            population = treatment + taxon + replicate
            if population in pt.populations_to_ignore:
                continue

            sys.stderr.write("Processing %s...\t" % population)

            times, Ms, fixed_Ms = parse_file.get_mutation_fixation_trajectories(
                population)
            times_, medians_log10, num_muts = parse_file.get_mutation_fixation_trajectories_median_freq_and_mut_number(
                population)

            # A float here is replaced by an all-zero trajectory so the
            # panel can still be drawn.
            if isinstance(fixed_Ms, float):
                fixed_Ms = np.asarray([0] * len(times))

            fixed_mutation_trajectories[population] = (times, fixed_Ms)
            # Trajectories are stored on a log10 scale and back-transformed
            # when plotted.
            mutation_trajectories[population] = (times, np.log10(Ms))
            delta_mutation_trajectories[population] = (
                times[1:], np.log10(Ms[1:] / Ms[:-1]))
            median_trajectories[population] = (times_, medians_log10)
            n_muts_trajectories[population] = (times_, num_muts)

            sys.stderr.write("analyzed %d mutations!\n" % len(Ms))

    fig = plt.figure(figsize=dim)

    for column_count, treatment in enumerate(treatments):
        panel_axes = [
            plt.subplot2grid((5, len(treatments)), (row, column_count),
                             colspan=1) for row in range(5)
        ]
        (ax_t_vs_M, ax_t_vs_delta_M, ax_t_vs_F, ax_t_vs_median_freq,
         ax_t_vs_number_muts) = panel_axes

        # Panel letters run along each row, so a row advances the label
        # index by the number of columns.
        for row, ax in enumerate(panel_axes):
            ax.text(-0.1,
                    1.07,
                    sub_plot_labels[column_count + sub_plot_count_step * row],
                    fontsize=14,
                    fontweight='bold',
                    ha='center',
                    va='center',
                    transform=ax.transAxes)

        ax_t_vs_M.set_yscale('log', base=10)
        ax_t_vs_M.tick_params(axis='x', labelsize=8)
        ax_t_vs_delta_M.set_yscale('log', base=10)
        ax_t_vs_median_freq.set_yscale('log', base=10)
        ax_t_vs_median_freq.yaxis.set_tick_params(labelsize=8)
        ax_t_vs_number_muts.set_yscale('log', base=10)
        ax_t_vs_number_muts.tick_params(axis='y', labelsize=8)

        # Shared line style. The marker comes from the keyword only; the
        # format string carries just the line style, so matplotlib does not
        # see the marker defined twice.
        trajectory_style = dict(color=pt.get_colors(treatment),
                                marker=pt.plot_species_marker(taxon),
                                fillstyle=pt.plot_species_fillstyle(taxon),
                                alpha=1,
                                markersize=7,
                                linewidth=3,
                                markeredgewidth=1.5,
                                zorder=1)

        treatment_taxon_populations = []
        all_medians = []
        all_numbers = []

        for replicate in pt.replicates:
            population = treatment + taxon + replicate
            if population in pt.populations_to_ignore:
                continue

            Mts, Ms = mutation_trajectories[population]
            fixed_Mts, fixed_Ms = fixed_mutation_trajectories[population]
            deltaMts, deltaMs = delta_mutation_trajectories[population]
            median_trajectories_ts, median_trajectories_ = median_trajectories[
                population]
            n_muts_trajectories_ts, n_muts_trajectories_ = n_muts_trajectories[
                population]

            # back transform to format plot axes
            ax_t_vs_M.plot(Mts, 10**Ms, '-', **trajectory_style)
            ax_t_vs_delta_M.plot(deltaMts,
                                 10**deltaMs,
                                 color=pt.get_colors(treatment),
                                 marker=pt.plot_species_marker(taxon),
                                 fillstyle=pt.plot_species_fillstyle(taxon))
            ax_t_vs_F.plot(fixed_Mts, fixed_Ms, '-', **trajectory_style)
            ax_t_vs_median_freq.plot(median_trajectories_ts,
                                     10**median_trajectories_, '-',
                                     **trajectory_style)
            ax_t_vs_number_muts.plot(n_muts_trajectories_ts,
                                     n_muts_trajectories_, '-',
                                     **trajectory_style)

            all_medians.extend(median_trajectories_.tolist())
            all_numbers.extend(n_muts_trajectories_.tolist())
            treatment_taxon_populations.append(population)

        ax_t_vs_delta_M.axhline(y=1, c='grey', linestyle=':', lw=3, zorder=1)

        # Limits and averages need at least one plotted population.
        if treatment_taxon_populations:
            ax_t_vs_median_freq.set_ylim(
                [10**(min(all_medians)) * 0.8, 10**(max(all_medians)) * 1.2])
            ax_t_vs_number_muts.set_ylim(
                [min(all_numbers) * 0.8,
                 max(all_numbers) * 1.2])

            avg_Mts, avg_Ms = timecourse_utils.average_trajectories([
                mutation_trajectories[population]
                for population in treatment_taxon_populations
            ])
            avg_deltaMts, avg_deltaMs = timecourse_utils.average_trajectories([
                delta_mutation_trajectories[population]
                for population in treatment_taxon_populations
            ])

            ax_t_vs_M.plot(avg_Mts,
                           10**avg_Ms,
                           '--',
                           color='k',
                           marker=" ",
                           alpha=1,
                           linewidth=4,
                           zorder=2)
            ax_t_vs_delta_M.plot(avg_deltaMts,
                                 10**avg_deltaMs,
                                 '--',
                                 color='k',
                                 marker=" ",
                                 alpha=1,
                                 linewidth=4,
                                 zorder=2)

        # keep them on the same y axes
        if taxon == 'C':
            ax_t_vs_delta_M.set_ylim([0.2, 42])
        elif taxon == 'D':
            ax_t_vs_delta_M.set_ylim([0.2, 20])

        ax_t_vs_M.set_title(str(10**int(treatment)) + '-day transfers',
                            fontsize=17)

        # Legend and y-axis labels only on the left-most column.
        if column_count == 0:
            legend_elements = [
                Line2D([0], [0],
                       ls='--',
                       color='k',
                       lw=1.5,
                       label=r'$\overline{M}(t)$')
            ]
            ax_t_vs_M.legend(handles=legend_elements,
                             loc='lower right',
                             fontsize=8)

            ax_t_vs_M.set_ylabel('Mutations, ' + r'$M(t)$', fontsize=15)
            ax_t_vs_F.set_ylabel('Fixed mutations', fontsize=15)
            ax_t_vs_delta_M.set_ylabel('Change in mutations,\n' +
                                       r'$M(t)/M(t-1)$',
                                       fontsize=15)
            ax_t_vs_median_freq.set_ylabel(
                'Median mutation freq.\nat time $t$', fontsize=15)
            ax_t_vs_number_muts.set_ylabel('Number of mutations\nat time $t$',
                                           fontsize=15)

    fig.text(0.53, 0.05, 'Days, ' + r'$t$', ha='center', fontsize=28)
    fig.suptitle(pt.latex_genus_dict[taxon], fontsize=30)

    fig_name = pt.get_path() + '/figs/rate_%s.pdf' % taxon
    fig.savefig(fig_name,
                format='pdf',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
def plot_within_taxon_paralleliism(taxon, slope_null=1):
    """Plot within-taxon gene-level parallelism across treatments.

    Top row: (a) observed (dashed) and null (dotted) survival curves of
    gene multiplicity per treatment, annotated with the total parallelism
    statistic and its p-value; (b) per-gene summed fixed weight of ``f_max``
    divided by the number of mutations, against multiplicity; (c) a Venn
    diagram of the significant gene lists of treatments '0', '1' and '2'.
    Bottom row: one multiplicity-vs-multiplicity scatter per pair of
    treatments, with significant genes outlined in black. Treatments whose
    ``parallel_genes_<treatment><taxon>.txt`` file does not exist are
    skipped. The figure is saved to
    ``<pt.get_path()>/figs/multiplicity_<taxon>.jpg``.

    Parameters:
        taxon: taxon label.
        slope_null: not used by the active code; kept for interface
            compatibility.

    Returns:
        None.
    """
    fig = plt.figure(figsize=(12, 8))

    # NOTE(review): the parsed gene list is not used anywhere below; the
    # call is kept only because its side effects cannot be seen from here.
    # Confirm and remove.
    parse_file.parse_gene_list(taxon)

    ax_multiplicity = plt.subplot2grid((2, 3), (0, 0), colspan=1)
    ax_mult_freq = plt.subplot2grid((2, 3), (0, 1), colspan=1)
    ax_venn = plt.subplot2grid((2, 3), (0, 2), colspan=1)

    ax_multiplicity.set_xscale('log', base=10)
    ax_multiplicity.set_yscale('log', base=10)
    ax_multiplicity.set_xlabel('Gene multiplicity, ' + r'$m$', fontsize=14)
    ax_multiplicity.set_ylabel('Fraction mutations ' + r'$\geq m$',
                               fontsize=14)
    ax_multiplicity.text(-0.1,
                         1.07,
                         pt.sub_plot_labels[0],
                         fontsize=18,
                         fontweight='bold',
                         ha='center',
                         va='center',
                         transform=ax_multiplicity.transAxes)
    ax_multiplicity.set_ylim([0.001, 1.1])
    ax_multiplicity.set_xlim([0.07, 130])

    ax_mult_freq.set_xscale('log', base=10)
    ax_mult_freq.set_yscale('log', base=10)
    ax_mult_freq.set_xlabel('Gene multiplicity, ' + r'$m$', fontsize=14)
    ax_mult_freq.set_ylabel('Mean maximum allele frequency, ' +
                            r'$\overline{f}_{max}$',
                            fontsize=11)
    ax_mult_freq.text(-0.1,
                      1.07,
                      pt.sub_plot_labels[1],
                      fontsize=18,
                      fontweight='bold',
                      ha='center',
                      va='center',
                      transform=ax_mult_freq.transAxes)

    ax_venn.axis('off')
    ax_venn.text(-0.1,
                 1.07,
                 pt.sub_plot_labels[2],
                 fontsize=18,
                 fontweight='bold',
                 ha='center',
                 va='center',
                 transform=ax_venn.transAxes)

    alpha_treatment_dict = {'0': 0.5, '1': 0.5, '2': 0.8}
    # treatment -> list of significant gene names
    significant_multiplicity_dict = {}
    # gene name -> {treatment: value in the second-to-last file column}
    significant_multiplicity_values_dict = {}
    # gene name -> {treatment: multiplicity}
    multiplicity_dict = {}
    all_mults = []
    all_freqs = []
    treatments_in_taxon = []
    label_y_axes = [0.3, 0.2, 0.1]

    for treatment_idx, treatment in enumerate(pt.treatments):
        significan_multiplicity_taxon_path = pt.get_path(
        ) + '/data/timecourse_final/parallel_genes_%s.txt' % (treatment +
                                                               taxon)
        if not os.path.exists(significan_multiplicity_taxon_path):
            continue
        treatments_in_taxon.append(treatment)

        significan_multiplicity_list = []
        with open(significan_multiplicity_taxon_path, "r") as handle:
            handle.readline()  # header
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                items = line.split(",")
                significan_multiplicity_list.append(items[0])
                significant_multiplicity_values_dict.setdefault(
                    items[0], {})[treatment] = float(items[-2])
        significant_multiplicity_dict[treatment] = significan_multiplicity_list

        populations = [
            treatment + taxon + replicate for replicate in pt.replicates
        ]

        # Load convergence matrix
        convergence_matrix = parse_file.parse_convergence_matrix(
            pt.get_path() + '/data/timecourse_final/' +
            ("%s_convergence_matrix.txt" % (treatment + taxon)))

        gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations, Lmin=100)

        G, pvalue = mutation_spectrum_utils.calculate_total_parallelism(
            gene_parallelism_statistics)

        sys.stdout.write("Total parallelism for %s = %g (p=%g)\n" %
                         (treatment + taxon, G, pvalue))

        # One multiplicity entry per observed mutation (survival curve).
        predictors = []
        ax_mult_freqs_x = []
        ax_mult_freqs_y = []

        for gene_name in convergence_matrix.keys():
            m = gene_parallelism_statistics[gene_name]['multiplicity']
            multiplicity_dict.setdefault(gene_name, {})[treatment] = m

            n = 0
            nf_max = 0
            for population in populations:
                for t, L, f, f_max in convergence_matrix[gene_name][
                        'mutations'][population]:
                    predictors.append(m)
                    n += 1
                    nf_max += timecourse_utils.calculate_fixed_weight(
                        L, f_max)

            # nf_max > 0 implies at least one mutation, so n is non-zero.
            if nf_max > 0:
                ax_mult_freqs_x.append(m)
                ax_mult_freqs_y.append(nf_max / n)

        predictors = np.sort(np.asarray(predictors))

        null_survival_function = mutation_spectrum_utils.NullMultiplicitySurvivalFunction.from_parallelism_statistics(
            gene_parallelism_statistics)

        # default base is 10
        theory_ms = np.logspace(-2, 2, 100)
        theory_survivals = null_survival_function(theory_ms)
        theory_survivals /= theory_survivals[0]

        sys.stderr.write("Done!\n")

        ax_multiplicity.plot(theory_ms,
                             theory_survivals,
                             lw=3,
                             color=pt.get_colors(treatment),
                             alpha=0.8,
                             ls=':',
                             zorder=1)

        ax_multiplicity.plot(
            predictors,
            (len(predictors) - np.arange(0, len(predictors))) * 1.0 /
            len(predictors),
            lw=3,
            color=pt.get_colors(treatment),
            alpha=0.8,
            ls='--',
            label=str(int(10**int(treatment))) + '-day',
            drawstyle='steps',
            zorder=2)

        if pvalue < 0.001:
            pretty_pvalue = r'$\ll 0.001$'
        else:
            pretty_pvalue = '=' + str(round(pvalue, 4))

        g_score_p_label = r'$\Delta \ell_{{{}}}=$'.format(
            str(10**int(treatment))) + str(round(
                G, 3)) + ', ' + r'$P$' + pretty_pvalue

        ax_multiplicity.text(0.26,
                             label_y_axes[treatment_idx],
                             g_score_p_label,
                             fontsize=7,
                             ha='center',
                             va='center',
                             color='k',
                             transform=ax_multiplicity.transAxes)

        ax_mult_freq.scatter(ax_mult_freqs_x,
                             ax_mult_freqs_y,
                             color=pt.get_colors(treatment),
                             edgecolors=pt.get_colors(treatment),
                             marker=pt.plot_species_marker(taxon),
                             alpha=alpha_treatment_dict[treatment])

        all_mults.extend(ax_mult_freqs_x)
        all_freqs.extend(ax_mult_freqs_y)

    # make treatment pairs
    treatments_in_taxon.sort(key=float)

    for i in range(0, len(treatments_in_taxon)):
        for j in range(i + 1, len(treatments_in_taxon)):
            treatment_i = treatments_in_taxon[i]
            treatment_j = treatments_in_taxon[j]

            # Index pairs (0,1), (0,2), (1,2) map to columns 0, 1, 2.
            ax_mult_i_j = plt.subplot2grid((2, 3), (1, i + j - 1), colspan=1)
            ax_mult_i_j.set_xscale('log', base=10)
            ax_mult_i_j.set_yscale('log', base=10)
            ax_mult_i_j.set_xlabel(str(10**int(treatment_i)) +
                                   '-day gene multiplicity, ' + r'$m$',
                                   fontsize=14)
            ax_mult_i_j.set_ylabel(str(10**int(treatment_j)) +
                                   '-day gene multiplicity, ' + r'$m$',
                                   fontsize=14)
            ax_mult_i_j.plot([0.05, 200], [0.05, 200],
                             lw=3,
                             c='grey',
                             ls='--',
                             zorder=1)
            ax_mult_i_j.set_xlim([0.05, 200])
            ax_mult_i_j.set_ylim([0.05, 200])

            ax_mult_i_j.text(-0.1,
                             1.07,
                             pt.sub_plot_labels[2 + i + j],
                             fontsize=18,
                             fontweight='bold',
                             ha='center',
                             va='center',
                             transform=ax_mult_i_j.transAxes)

            # Genes with non-zero multiplicity in both treatments; a gene
            # missing from one treatment counts as zero.
            multiplicity_pair = [
                (per_treatment[treatment_i], per_treatment[treatment_j])
                for _, per_treatment in sorted(multiplicity_dict.items())
                if (per_treatment.get(treatment_i, 0) > 0) and (
                    per_treatment.get(treatment_j, 0) > 0)
            ]

            significant_multiplicity_pair = [
                (per_treatment[treatment_i], per_treatment[treatment_j])
                for _, per_treatment in sorted(
                    significant_multiplicity_values_dict.items())
                if (treatment_i in per_treatment) and (
                    treatment_j in per_treatment)
            ]

            # get mean colors
            ccv = ColorConverter()
            color_1 = np.array(ccv.to_rgb(pt.get_colors(treatment_i)))
            color_2 = np.array(ccv.to_rgb(pt.get_colors(treatment_j)))
            mix_color = 0.7 * (color_1 + color_2)
            mix_color = np.min([mix_color, [1.0, 1.0, 1.0]], 0)

            if (treatment_i == '0') and (treatment_j == '1'):
                mix_color = 'gold'

            mult_i = [x[0] for x in multiplicity_pair]
            mult_j = [x[1] for x in multiplicity_pair]
            ax_mult_i_j.scatter(mult_i,
                                mult_j,
                                marker=pt.plot_species_marker(taxon),
                                facecolors=mix_color,
                                edgecolors='none',
                                alpha=0.8,
                                s=90,
                                zorder=2)

            mult_significant_i = [x[0] for x in significant_multiplicity_pair]
            mult_significant_j = [x[1] for x in significant_multiplicity_pair]
            ax_mult_i_j.scatter(mult_significant_i,
                                mult_significant_j,
                                marker=pt.plot_species_marker(taxon),
                                facecolors=mix_color,
                                edgecolors='k',
                                lw=1.5,
                                alpha=0.7,
                                s=90,
                                zorder=3)

            # Tighten the limits around the data when there is any.
            mult_ij = mult_significant_i + mult_significant_j + mult_i + mult_j
            if mult_ij:
                ax_mult_i_j.set_xlim([min(mult_ij) * 0.5, max(mult_ij) * 1.5])
                ax_mult_i_j.set_ylim([min(mult_ij) * 0.5, max(mult_ij) * 1.5])

    # venn3 expects the sizes of the seven mutually exclusive regions in
    # the order (Abc, aBc, ABc, abC, AbC, aBC, ABC), not the raw set sizes.
    # A treatment without a significant-gene file contributes an empty set.
    set_0 = set(significant_multiplicity_dict.get('0', []))
    set_1 = set(significant_multiplicity_dict.get('1', []))
    set_2 = set(significant_multiplicity_dict.get('2', []))
    subset_tuple = (len(set_0 - set_1 - set_2),
                    len(set_1 - set_0 - set_2),
                    len((set_0 & set_1) - set_2),
                    len(set_2 - set_0 - set_1),
                    len((set_0 & set_2) - set_1),
                    len((set_1 & set_2) - set_0),
                    len(set_0 & set_1 & set_2))

    if any(subset_tuple):
        venn3(subsets=subset_tuple,
              ax=ax_venn,
              set_labels=('', '', ''),
              set_colors=(pt.get_colors('0'), pt.get_colors('1'),
                          pt.get_colors('2')))
        venn3_circles(subsets=subset_tuple, ax=ax_venn, linestyle='dashed')

    if all_mults:
        ax_mult_freq.set_xlim([min(all_mults) * 0.5, max(all_mults) * 1.5])
        ax_mult_freq.set_ylim([min(all_freqs) * 0.5, max(all_freqs) * 1.5])

    fig.suptitle(pt.latex_dict[taxon], fontsize=30)
    fig.subplots_adjust(wspace=0.3)

    fig_name = pt.get_path() + "/figs/multiplicity_%s.jpg" % taxon
    fig.savefig(fig_name,
                format='jpg',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
if treatment + taxon in pt.treatment_taxa_to_ignore: sys.stderr.write( "Skipping %s, too few surviving replicates ...\n" % (treatment + taxon)) continue populations = [ treatment + taxon + replicate for replicate in pt.replicates ] sys.stderr.write("Analyzing %s level parallelism for %s...\n" % (level, treatment + taxon)) # Load convergence matrix convergence_matrix = parse_file.parse_convergence_matrix( pt.get_path() + '/data/timecourse_final/' + ("%s_convergence_matrix.txt" % (treatment + taxon))) # Calculate basic parallellism statistics gene_parallelism_statistics_minor = mutation_spectrum_utils.calculate_parallelism_statistics( convergence_matrix, populations, fmax_max=0.5) gene_parallelism_statistics_major = mutation_spectrum_utils.calculate_parallelism_statistics( convergence_matrix, populations, fmax_min=0.5) # Do same thing for multiplicity statistic pooled_multiplicities_minor = numpy.array([ gene_parallelism_statistics_minor[gene_name]['multiplicity'] for gene_name in gene_parallelism_statistics_minor.keys() ]) pooled_multiplicities_minor.sort() pooled_multiplicities_major = numpy.array([
def _squared_correlation_of_relative_multiplicity(count_matrix, length_scaling):
    """Return the squared Pearson correlation between the two rows of a count matrix.

    Each count is multiplied by ``length_scaling`` (one factor per column),
    every row is normalised to sum to one, and the correlation coefficient
    between the two normalised rows is squared.
    """
    scaled = count_matrix * length_scaling
    relative = scaled / scaled.sum(axis=1)[:, None]
    return np.corrcoef(relative[0, :], relative[1, :])[1, 0]**2


def calculate_divergence_correlations():
    """Compare gene-level mutation counts between treatment pairs and pickle the results.

    For every pair in the module-level ``treatment_pairs`` and every taxon,
    the genes present under both treatments in ``significant_n_mut_dict`` are
    collected, their mutation counts are rescaled by mean gene size over gene
    size, and the squared Pearson correlation of the relative values is
    computed. A null distribution of ``permutations_divergence`` squared
    correlations is built from tables drawn with
    ``phik.simulation.sim_2d_data_patefield`` (presumably randomised tables
    with the observed margins -- confirm against the phik documentation).

    The stored P value is the fraction of null values strictly below the
    observed one (with a +1 pseudocount in numerator and denominator); the
    Z score standardises the observed value against the null.

    Results go to ``<pt.get_path()>/data/divergence_pearsons.pickle`` as
    ``{(treatment_a, treatment_b): {taxon: {...}}}``. Returns None.
    """
    sys.stdout.write("Starting divergence tests...\n")

    divergence_dict = {}

    for treatment_pair in treatment_pairs:

        first_treatment = treatment_pair[0]
        second_treatment = treatment_pair[1]
        pair_key = (first_treatment, second_treatment)
        pair_results = {}
        divergence_dict[pair_key] = pair_results

        # pairs that involve treatment '1' use a fixed taxon list
        taxa_to_test = ['B','C','D','F','P'] if '1' in treatment_pair else pt.taxa

        for taxon in taxa_to_test:

            # keep only genes that have a count under both treatments
            shared_genes = []
            counts_first = []
            counts_second = []
            for gene_name, counts_by_treatment in significant_n_mut_dict[taxon].items():
                if first_treatment not in counts_by_treatment:
                    continue
                if second_treatment not in counts_by_treatment:
                    continue
                shared_genes.append(gene_name)
                counts_first.append(int(counts_by_treatment[first_treatment]))
                counts_second.append(int(counts_by_treatment[second_treatment]))

            gene_sizes = np.asarray([gene_size_dict[taxon][gene_name] for gene_name in shared_genes])
            length_scaling = gene_mean_size_dict[taxon] / gene_sizes

            n_matrix = np.asarray([counts_first, counts_second])
            observed_r2 = _squared_correlation_of_relative_multiplicity(n_matrix, length_scaling)

            null_r2 = np.empty(permutations_divergence)
            for k in range(permutations_divergence):

                if (k > 0) and (k % 2000 == 0):
                    sys.stdout.write("%d iterations\n" % (k))

                randomized_counts = phik.simulation.sim_2d_data_patefield(n_matrix)
                null_r2[k] = _squared_correlation_of_relative_multiplicity(randomized_counts, length_scaling)

            Z_corr = (observed_r2 - np.mean(null_r2)) / np.std(null_r2)
            # one-sided: how often the null falls below the observed value
            n_null_below = int((null_r2 < observed_r2).sum())
            P_corr = (n_null_below + 1) / (permutations_divergence + 1)

            pair_results[taxon] = {
                'pearsons_corr_squared': observed_r2,
                'P_value': P_corr,
                'Z_corr': Z_corr,
            }

            sys.stdout.write("%d vs %d-day, %s: rho^2=%f, P=%f, Z=%f\n" % (10**int(first_treatment), 10**int(second_treatment), taxon, observed_r2, P_corr, Z_corr))

    sys.stdout.write("Dumping pickle......\n")
    output_path = pt.get_path()+'/data/divergence_pearsons.pickle'
    with open(output_path, 'wb') as handle:
        pickle.dump(divergence_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    sys.stdout.write("Done!\n")
# gene = genes[gene_name_idx] # if gene == '': # continue # locus_tag_to_gene_dict[gene_name] = genes[gene_name_idx] if taxon == 'J': treatments_convergence = ['0', '1'] else: treatments_convergence = ['0', '1', '2'] for treatment in treatments_convergence: genes_significant_file_path = pt.get_path() +'/data/timecourse_final/' + ("parallel_%ss_%s.txt" % ('gene', treatment+taxon)) genes_nonsignificant_file_path = pt.get_path() +'/data/timecourse_final/' + ("parallel_not_significant_%ss_%s.txt" % ('gene', treatment+taxon)) if os.path.exists(genes_significant_file_path) == False: continue genes_significant_file = open(genes_significant_file_path, 'r') first_line_significant = genes_significant_file.readline() N_significant_genes = 0 genes = [] for line in genes_significant_file: line_split = line.strip().split(', ') gene_name = line_split[0]
-0.01, 'Maximum observed allele frequency, ' + r'$f_{max}$', ha='center', va='center', fontsize=18) fig.text(-0.01, 0.5, r'$pN/pS$' + ' for mutations ' + r'$\geq f_{max}$', ha='center', va='center', rotation='vertical', fontsize=18) fig.subplots_adjust(hspace=0.4, wspace=0.6) #hspace=0.3, wspace=0.5 fig.tight_layout() fig.savefig(pt.get_path() + '/figs/dn_ds_fmax.pdf', format='pdf', bbox_inches="tight", pad_inches=0.4, dpi=600) plt.close() record_strs = [ ",".join( ['treatment_pair', 'taxon', 'tree_name', 'mean_absolute_difference']) ] msd_dict = {} for taxon in taxa: if taxon == 'J':
# loop through taxa and get M(700) for all reps in each treatment for treatment in pt.treatments: fmax_dict[treatment] = {} for taxon in taxa: if taxon == 'J': treatments = ['0', '2'] else: treatments = pt.treatments for treatment in treatments: convergence_matrix = parse_file.parse_convergence_matrix( pt.get_path() + '/data/timecourse_final/' + ("%s_convergence_matrix.txt" % (treatment + taxon))) f_max_all = [] #for population in populations: for replicate in pt.replicates: population = treatment + taxon + replicate if population in pt.populations_to_ignore: continue for gene_name in sorted(convergence_matrix.keys()): for t, L, f, f_max in convergence_matrix[gene_name][ 'mutations'][population]:
gene_dict = {} gene_data = parse_file.parse_gene_list('B') gene_names, gene_start_positions, gene_end_positions, promoter_start_positions, promoter_end_positions, gene_sequences, strands, genes, features, protein_ids = gene_data locus_tag_to_gene_dict = {} for gene_name_idx, gene_name in enumerate(gene_names): gene = genes[gene_name_idx] if gene == '': continue locus_tag_to_gene_dict[gene_name] = genes[gene_name_idx] for taxon in taxa: for treatment in treatments: genes_significant_file_path = pt.get_path( ) + '/data/timecourse_final/' + ("parallel_%ss_%s.txt" % ('gene', treatment + taxon)) output_notsignificant_file_path = pt.get_path( ) + '/data/timecourse_final/' + ( "parallel_not_significant_%ss_%s.txt" % ('gene', treatment + taxon)) if os.path.exists(genes_significant_file_path) == False: continue genes_significant_file = open(genes_significant_file_path, 'r') genes_notsignificant_file = open(output_notsignificant_file_path, 'r') first_line_significant = genes_significant_file.readline() first_line_notsignificant = genes_notsignificant_file.readline() for line in genes_significant_file:
def _polymorphic_trajectory(mutation, state_Ls, times):
    """Return ``(freqs, delta_times, delta_freqs)`` for one mutation, or None.

    ``mutation`` is one 18-field record from
    ``parse_file.parse_annotated_timecourse``; ``state_Ls`` is the matching
    state trajectory and ``times`` the population's time axis.

    The timepoints are masked with ``timecourse_utils.mask_timepoints`` and
    frequencies estimated with ``timecourse_utils.estimate_frequencies``. The
    window between the first and last masked timepoint whose state equals 3
    (presumably the polymorphic state -- confirm against parse_file) is then
    extracted. None is returned when fewer than ``min_trajectory_length``
    timepoints are in state 3.

    ``delta_freqs[k]`` is ``f(t_{k+1}) - f(t_k)`` within the window and
    ``delta_times[k]`` is ``t_k``. ``freqs`` is the full (unwindowed)
    frequency array.
    """
    (_, _, _, var_type, _, _, _, _, _, cutoff_idx, depth_fold_change,
     depth_change_pvalue, mutation_times, alts, depths, _, _, _) = mutation

    good_idx, filtered_alts, filtered_depths = timecourse_utils.mask_timepoints(
        mutation_times, alts, depths, var_type, cutoff_idx,
        depth_fold_change, depth_change_pvalue)
    freqs = timecourse_utils.estimate_frequencies(filtered_alts, filtered_depths)

    # the population-level time axis is used for every mutation, as before
    masked_times = times[good_idx]
    masked_freqs = freqs[good_idx]
    masked_state_Ls = state_Ls[good_idx]

    P_idx = np.where(masked_state_Ls == 3)[0]
    if len(P_idx) < min_trajectory_length:
        return None

    first_P = P_idx[0]
    last_P = P_idx[-1]
    masked_freqs_P = masked_freqs[first_P:last_P + 1]
    masked_times_P = masked_times[first_P:last_P + 1]

    # delta_f = f_t_plus_1 - f_t, labelled by the earlier timepoint
    delta_freqs = masked_freqs_P[1:] - masked_freqs_P[:-1]
    delta_times = masked_times_P[:-1]
    return freqs, delta_times, delta_freqs


def run_analyses():
    """Collect frequency-change statistics per treatment and taxon and pickle them.

    For every treatment in ``['0', '1', '2']``, every taxon in the
    module-level ``taxa`` and every replicate in ``replicates``, the annotated
    timecourse is parsed and, for each mutation that passes the
    trajectory-length filter, the following are accumulated:

    * ``abs_delta_f`` -- absolute frequency change between consecutive
      timepoints where both frequencies are non-zero;
    * ``ratio_f``     -- ratio of consecutive non-zero frequencies;
    * ``r2``          -- squared Pearson correlation of frequency changes for
      every pair of mutations sharing at least three timepoints.

    The arrays are stored as ``result[treatment][taxon][statistic]`` in
    ``<pt.get_path()>/data/mutation_dynamics.pickle``. Returns None.

    Changes from the previous version: populations without mutations are
    skipped instead of raising IndexError; each mutation is masked once per
    population rather than once per pair (assumes the timecourse_utils helpers
    are pure functions of their arguments -- TODO confirm); the length
    sanity-check now prints both lengths.
    """
    r2s_obs_dict = {}

    for treatment in ['0', '1', '2']:

        r2s_obs_dict[treatment] = {}

        for taxon in taxa:

            r2s_all = []
            ratio_f_all = []
            abs_delta_f_all = []

            for replicate in replicates:

                population = treatment + taxon + replicate
                sys.stderr.write("Processing %s...\n" % population)

                mutations, depth_tuple = parse_file.parse_annotated_timecourse(
                    population)
                state_times, state_trajectories = parse_file.parse_well_mixed_state_timecourse(
                    population)

                if len(mutations) == 0:
                    # nothing to analyse; mutations[0] below would raise
                    sys.stderr.write("No mutations in %s, skipping...\n" % population)
                    continue

                # field 12 of a mutation record is its time axis
                times = mutations[0][12]

                # process every mutation exactly once; None marks a filtered one
                trajectories = [
                    _polymorphic_trajectory(mutation, state_trajectories[mutation_idx], times)
                    for mutation_idx, mutation in enumerate(mutations)
                ]

                for mutation_idx_i, trajectory_i in enumerate(trajectories):

                    if trajectory_i is None:
                        continue
                    freqs_i, delta_times_i, delta_freqs_i = trajectory_i

                    # consecutive-timepoint statistics, skipping zero frequencies
                    for freqs_i_k, freqs_i_l in zip(freqs_i[1:], freqs_i[:-1]):
                        if (freqs_i_k == 0) or (freqs_i_l == 0):
                            continue
                        abs_delta_f_all.append(np.absolute(freqs_i_k - freqs_i_l))
                        ratio_f_all.append(freqs_i_k / freqs_i_l)

                    for mutation_idx_j in range(mutation_idx_i + 1, len(trajectories)):

                        trajectory_j = trajectories[mutation_idx_j]
                        if trajectory_j is None:
                            continue
                        _, delta_times_j, delta_freqs_j = trajectory_j

                        intersect_times = np.intersect1d(delta_times_i, delta_times_j)
                        if len(intersect_times) < 3:
                            continue

                        intersect_idx_i = [
                            np.where(delta_times_i == intersect_time)[0][0]
                            for intersect_time in intersect_times
                        ]
                        intersect_delta_i = delta_freqs_i[intersect_idx_i]

                        intersect_idx_j = [
                            np.where(delta_times_j == intersect_time)[0][0]
                            for intersect_time in intersect_times
                        ]
                        intersect_delta_j = delta_freqs_j[intersect_idx_j]

                        if len(intersect_delta_i) != len(intersect_delta_j):
                            print(len(intersect_delta_i), len(intersect_delta_j))

                        r2 = stats.pearsonr(intersect_delta_i, intersect_delta_j)[0]**2
                        r2s_all.append(r2)

            r2s_obs_dict[treatment][taxon] = {
                'r2': np.asarray(r2s_all),
                'ratio_f': np.asarray(ratio_f_all),
                'abs_delta_f': np.asarray(abs_delta_f_all),
            }

    with open(pt.get_path() + '/data/mutation_dynamics.pickle', 'wb') as handle:
        pickle.dump(r2s_obs_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
#r2s_obs_dict[treatment + taxon]['r2'] = r2s_all #r2s_obs_dict[treatment + taxon]['ratio_f'] = ratio_f_all #r2s_obs_dict[treatment + taxon]['abs_delta_f'] = abs_delta_f_all r2s_obs_dict[treatment][taxon] = {} r2s_obs_dict[treatment][taxon]['r2'] = r2s_all r2s_obs_dict[treatment][taxon]['ratio_f'] = ratio_f_all r2s_obs_dict[treatment][taxon]['abs_delta_f'] = abs_delta_f_all with open(pt.get_path() + '/data/mutation_dynamics.pickle', 'wb') as handle: pickle.dump(r2s_obs_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) #run_analyses() with open(pt.get_path() + '/data/mutation_dynamics.pickle', 'rb') as handle: r2s_obs_dict = pickle.load(handle) analyses = ['abs_delta_f', 'ratio_f', 'r2'] # get KS distance ks_dict = {} p_value_list = [] for analysis in analyses: ks_dict[analysis] = {} for treatment_idx, treatment in enumerate(pt.treatments): ks_dict[analysis][treatment] = {} D, p_value = stats.ks_2samp(r2s_obs_dict[treatment]['B'][analysis], r2s_obs_dict[treatment]['S'][analysis]) ks_dict[analysis][treatment]['D'] = D ks_dict[analysis][treatment]['p_value'] = p_value
replist, repnum = scipy.stats.find_repeats(X[i]) for t in repnum: ties += t * (t * t - 1) c = 1 - ties / float(k * (k * k - 1) * n) Q /= c # Approximate the p-value ddof1 = k - 1 p_unc = scipy.stats.chi2.sf(Q, ddof1) # Create output dataframe stats = pd.DataFrame({'Source': within, 'ddof1': ddof1, 'Q': np.round(Q, 3), 'p-unc': p_unc, }, index=['Friedman']) col_order = ['Source', 'ddof1', 'Q', 'p-unc'] stats = stats.reindex(columns=col_order) stats.dropna(how='all', axis=1, inplace=True) return stats data = pd.read_csv(pt.get_path() +'/data/rm_anova.csv', sep=',' ) print(data)
ntot_subsample = 50 subsamples = 10000 # ntot_subsample minimum number of mutations G_subsample_dict = {} G_all_mutations_dict = {} for taxon in ['B', 'S']: for treatment in treatments: # Load convergence matrix convergence_matrix = parse_file.parse_convergence_matrix( pt.get_path() + '/data/timecourse_final/' + ("%s_convergence_matrix.txt" % (treatment + taxon))) populations = [ treatment + taxon + replicate for replicate in replicates ] gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics( convergence_matrix, populations, Lmin=100) G_subsample_list = [] for i in range(subsamples): G_subsample = mutation_spectrum_utils.calculate_subsampled_total_parallelism( gene_parallelism_statistics, ntot_subsample=ntot_subsample)
def run_simulation():
    """Simulate clonal evolution with a dormant pool and pickle sampled snapshots.

    Every generation:

    1. ``int(c * N)`` active individuals are drawn at random to become dormant
       and ``int(c * K * M)`` dormant individuals to become active;
    2. clones receive a Poisson number of new mutations (mean
       ``mu * L * n_active``, capped at the clone's active size); each mutant
       founds a new single-individual clone whose fitness is the parent's
       plus a pre-drawn exponential effect for the mutated site;
    3. the active population is resampled to size ``N`` with a multinomial
       draw weighted by ``n_active * exp(s)``.

    A deep copy of the clone table is stored after every 330th completed
    generation (330, 660, ..., 3300) and the resulting
    ``{generation: clone_table}`` dict is pickled to
    ``<pt.get_path()>/data/simulations/test2.dat``. Returns None.

    Fixes relative to the previous version:

    * ``numpy.int`` (removed in NumPy 1.24) is no longer used;
    * the dormant label array is now built with an integer dtype (the old
      code cast the active array twice and the dormant one never);
    * the dormant->active step guards the ``'n_clone_active'`` key it is
      about to increment (it used to test ``'n_clone_dormant'``);
    * snapshots are deep copies, so later generations no longer rewrite the
      per-clone dicts stored for earlier timepoints;
    * the sampling check uses the number of completed generations, so the
      final timepoint (3300) is actually recorded;
    * the per-snapshot totals no longer rebind ``N`` and ``M``.
    """
    # local import: keeps this block independent of the file-level imports
    import copy

    # weird sampling going on
    # mutation rate from Lynch paper, assume 10% sites are beneficial
    #mu = (3.28*10**-10 ) * 0.1
    #L = 4292969
    # keep order of magnitude for convenience
    mu = (1.0 * 10**-10)
    L = 1000000
    N = 10**6
    M = 10
    K = N / M
    c = 0.00001
    s_scale = 10**-3

    # average time in a dormant state = M
    # rate of entering dormancy, per-capita = c
    # rate of exiting dormancy, per-capita = c*K
    n_active_to_dormant = int(c * N)
    n_dormant_to_active = int(c * K * M)
    if n_active_to_dormant != n_dormant_to_active:
        print("Unqueal number of individuals switching states!!")

    # double mutants slow the simulation so we're assuming single mutants:
    # a mutant individual leaves its clone carrying exactly one new site
    generations = 3300
    generations_to_sample = {330 * i for i in range(1, 11)}
    sampled_timepoints = {}

    n_clone_lineages = 0
    clone_size_dict = {
        n_clone_lineages: {
            'n_clone_active': N,
            'n_clone_dormant': M,
            's': 1,
            'mutations': set(),
        }
    }

    # pre-assign fitness benefits to all sites
    all_sites = set(range(L))
    fitness_effects = numpy.random.exponential(scale=s_scale, size=L)

    for generation in range(generations):

        # one label per individual: each clone id is repeated as many times
        # as the clone has active (or dormant) members
        clone_ids = numpy.asarray(list(clone_size_dict), dtype=int)
        active_counts = [
            clone_size_dict[clone_i]['n_clone_active']
            for clone_i in clone_size_dict
        ]
        dormant_counts = [
            clone_size_dict[clone_i].get('n_clone_dormant', 0)
            for clone_i in clone_size_dict
        ]
        clone_labels_active = numpy.repeat(clone_ids, active_counts)
        clone_labels_dormant = numpy.repeat(clone_ids, dormant_counts)

        # number of dormant individuals not constant???
        print(generation, len(clone_labels_active), len(clone_labels_dormant))

        # draw both sets of switching individuals before applying either move
        active_to_dormant_sample = numpy.random.choice(
            clone_labels_active, size=n_active_to_dormant, replace=False)
        active_to_dormant_sample_bincount = numpy.bincount(
            active_to_dormant_sample)
        active_to_dormant_sample_bincount_nonzero = numpy.nonzero(
            active_to_dormant_sample_bincount)[0]

        dormant_to_active_sample = numpy.random.choice(
            clone_labels_dormant, size=n_dormant_to_active, replace=False)
        dormant_to_active_sample_bincount = numpy.bincount(
            dormant_to_active_sample)
        dormant_to_active_sample_bincount_nonzero = numpy.nonzero(
            dormant_to_active_sample_bincount)[0]

        for active_to_dormant_clone_i, active_to_dormant_n_clone_i in zip(
                active_to_dormant_sample_bincount_nonzero,
                active_to_dormant_sample_bincount[
                    active_to_dormant_sample_bincount_nonzero]):
            clone = clone_size_dict[active_to_dormant_clone_i]
            clone['n_clone_active'] -= active_to_dormant_n_clone_i
            clone.setdefault('n_clone_dormant', 0)
            clone['n_clone_dormant'] += active_to_dormant_n_clone_i

        for dormant_to_active_clone_i, dormant_to_active_n_clone_i in zip(
                dormant_to_active_sample_bincount_nonzero,
                dormant_to_active_sample_bincount[
                    dormant_to_active_sample_bincount_nonzero]):
            clone = clone_size_dict[dormant_to_active_clone_i]
            clone['n_clone_dormant'] -= dormant_to_active_n_clone_i
            clone.setdefault('n_clone_active', 0)
            clone['n_clone_active'] += dormant_to_active_n_clone_i

        # now move on to evolution
        for clone_i in list(clone_size_dict):

            # NOTE(review): as written, every clone without dormant members
            # skips the mutation step (and is deleted if it also has no
            # active members). The two keys may have been meant the other
            # way round -- behaviour is left unchanged pending confirmation.
            if (clone_size_dict[clone_i]['n_clone_dormant'] == 0):
                if (clone_size_dict[clone_i]['n_clone_active'] == 0):
                    del clone_size_dict[clone_i]
                    continue
                else:
                    continue

            n_clone_i = clone_size_dict[clone_i]['n_clone_active']

            # mutation step
            # lineage size can't be negative
            n_mutations_clone = min(numpy.random.poisson(mu * L * n_clone_i),
                                    n_clone_i)
            if n_mutations_clone == 0:
                continue

            # remove these individuals from the clone
            clone_size_dict[clone_i]['n_clone_active'] -= n_mutations_clone

            # all individuals in the clone have the same mutations
            # so just sample from nonmutated sites in the ancestral clone
            non_mutated_sites = all_sites - clone_size_dict[clone_i]['mutations']

            # sample without replacement
            mutated_sites = numpy.random.choice(list(non_mutated_sites),
                                                size=n_mutations_clone,
                                                replace=False)

            for mutated_site in mutated_sites:
                n_clone_lineages += 1
                new_mutations = clone_size_dict[clone_i]['mutations'].copy()
                new_mutations.add(mutated_site)
                clone_size_dict[n_clone_lineages] = {
                    'n_clone_active': 1,
                    'n_clone_dormant': 0,
                    's': clone_size_dict[clone_i]['s'] + fitness_effects[mutated_site],
                    'mutations': new_mutations,
                }

        # selection: resample the active population to size N
        sampling_numerator = numpy.asarray([
            clone_size_dict[clone_i]['n_clone_active'] *
            numpy.exp(clone_size_dict[clone_i]['s'])
            for clone_i in clone_size_dict.keys()
        ])
        sampling_probability = sampling_numerator / sum(sampling_numerator)
        clone_sizes_after_selection = numpy.random.multinomial(
            N, sampling_probability)

        for clone_i, clone_i_size in zip(list(clone_size_dict),
                                         clone_sizes_after_selection):
            clone_size_dict[clone_i]['n_clone_active'] = clone_i_size

        if generation % 100 == 0:
            sys.stderr.write("%d generations...\n" % generation)

        # `generation` is zero-based; sample on the count of completed
        # generations so that the last timepoint (== generations) is reached
        completed_generations = generation + 1
        if completed_generations in generations_to_sample:
            # deep copy: the per-clone dicts keep changing in later generations
            sampled_timepoints[completed_generations] = copy.deepcopy(clone_size_dict)

            n_active_total = sum(
                clone['n_clone_active'] for clone in clone_size_dict.values())
            n_dormant_total = sum(
                clone['n_clone_dormant'] for clone in clone_size_dict.values())
            print(completed_generations, n_active_total, n_dormant_total)

    saved_data_file = '%s/data/simulations/test2.dat' % (pt.get_path())
    with open(saved_data_file, 'wb') as outfile:
        pickle.dump(sampled_timepoints,
                    outfile,
                    protocol=pickle.HIGHEST_PROTOCOL)
from collections import Counter from itertools import combinations import scipy.stats as stats import pandas as pd import phylo_tools as pt import parse_file import timecourse_utils import mutation_spectrum_utils import phylo_tools as pt import json json_path = pt.get_path() + '/data/rebreseq_json/' coverages_all = [] for filename in os.listdir(json_path): if filename.endswith(".json"): filepath = '%s%s' % (json_path, filename) with open(filepath) as f: data = json.load(f) #print(data.keys()) #print()
markersize=10, color='w', markerfacecolor=pt.colors_dict['1']), Line2D([0], [0], marker='o', markersize=10, color='w', markerfacecolor=pt.colors_dict['2']) ] axes[0].legend(custom_lines, ['1-day', '10-days', '100-day'], loc='upper right') fig.subplots_adjust(hspace=0.4, wspace=0.6) #hspace=0.3, wspace=0.5 fig.tight_layout() fig.savefig(pt.get_path() + '/figs/mutation_spectra_pca.pdf', format='pdf', bbox_inches="tight", pad_inches=0.4, dpi=600) plt.close() reject, pvals_corrected, alphacSidak, alphacBonf = multitest.multipletests( anova_pvalues, alpha=0.05, method='fdr_bh') fig = plt.figure(figsize=(9, 6)) gs = gridspec.GridSpec(nrows=2, ncols=3) all_subplot_counts = 0 dn_ds_count = 0 for taxon_list_idx, taxon_list in enumerate([['B', 'C', 'D'], ['F', 'J', 'P']]):
fmax_cutoffs = np.asarray([0, 0.2, 0.4, 0.6, 0.8]) G_dict_all = {} taxa = ['B', 'C', 'D', 'F', 'J', 'P'] treatments = ['0', '1'] ntotal_dict = {} for taxon in taxa: sys.stdout.write("Sub-sampling taxon: %s\n" % (taxon)) G_dict_all[taxon] = {} if taxon == 'J': ntotal = 50 else: # calculate ntot for all frequency cutoffs convergence_matrix = parse_file.parse_convergence_matrix( pt.get_path() + '/data/timecourse_final/' + ("%s_convergence_matrix.txt" % ('1' + taxon))) populations = ['1' + taxon + replicate for replicate in pt.replicates] gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics( convergence_matrix, populations, fmax_min=max(fmax_cutoffs)) ntotal = 0 for gene_i, gene_parallelism_statistics_i in gene_parallelism_statistics.items( ): ntotal += gene_parallelism_statistics_i['observed'] ntotal_dict[taxon] = ntotal for treatment in treatments: if treatment + taxon in pt.treatment_taxa_to_ignore: continue G_dict_all[taxon][treatment] = {}
for treatment in treatments: pvalues = [] for replicate in replicates: population = treatment + taxon + replicate if population in pt.populations_to_ignore: continue if population in pt.samples_to_remove: times_to_ignore = pt.samples_to_remove[population] else: times_to_ignore = None #file = open(input_filename_template % population,"r") likelihood_filename = '%s_likelihood_timecourse.bz' % (population) likelihood_timecourse_path = pt.get_path( ) + '/data/timecourse_likelihood/' + likelihood_filename file = bz2.open(likelihood_timecourse_path, "rt") file.readline() # depth line! for line in file: items = line.split(",") location = int(items[1]) total_times = np.array( [float(subitem) for subitem in items[3].split()]) total_alts = np.array( [float(subitem) for subitem in items[4].split()]) total_depths = np.array( [float(subitem) for subitem in items[5].split()]) if 'None' in items[pvalue_idx].split()[0]:
all_poly_list.append((position, allele)) num_processed_mutations += 1 t = timecourse_utils.calculate_appearance_time( masked_times, masked_freqs, masked_state_Ls) convergence_matrix[identifier]['mutations'][population].append( (t, masked_state_Ls[-1], masked_freqs[-1], max(masked_freqs))) sys.stderr.write("processed %d mutations!\n" % num_processed_mutations) # Print it out output_filename = pt.get_path() + '/data/timecourse_final/' + ( "%s_convergence_matrix.txt" % (treatment + taxon)) convergence_matrix_file = open(output_filename, "w") # Header convergence_matrix_file.write( ", ".join(["Identifier"] + ["Size"] + [population for population in populations])) for identifier in sorted(convergence_matrix.keys()): length = convergence_matrix[identifier]['length'] mutations = convergence_matrix[identifier]['mutations'] convergence_matrix_file.write("\n")
def calculate_genome_length(taxon=None):
    """Return the summed length of all records in the taxon's reference FASTA.

    The FASTA path is ``pt.get_path()`` joined with the entry for ``taxon`` in
    ``pt.get_ref_fna_dict()``. Each record returned by ``readFASTA()`` is
    indexed at position 1 for its sequence (presumably records are
    ``(header, sequence)`` pairs -- confirm against ``pt.classFASTA``).
    """
    fasta_path = pt.get_path() +'/'+ pt.get_ref_fna_dict()[taxon]
    contigs = pt.classFASTA(fasta_path).readFASTA()

    genome_length = 0
    for contig in contigs:
        genome_length += len(contig[1])
    return genome_length
ax.set_xticklabels(['1-day', '10-days', '100-days'], fontweight='bold', fontsize=12) legend_elements = [ Line2D([0], [0], color='none', marker='o', label=pt.latex_dict['B'], markerfacecolor='k', markersize=13), Line2D([0], [0], marker='o', color='none', label=pt.latex_dict['S'], markerfacecolor='w', markersize=13, markeredgewidth=2) ] # Create the figure ax.legend(handles=legend_elements, loc='upper right') fig.subplots_adjust(hspace=0.3, wspace=0.5) fig_name = pt.get_path() + '/figs/plot_dn_ds.jpg' fig.savefig(fig_name, format='jpg', bbox_inches="tight", pad_inches=0.4, dpi=600) plt.close()
def merge_metadata(): # first get dictionary for barcodes one and two for GSF2124, GSF2056 GSF_files = [ 'GSF2056-run1-plates1-2-demultiplexing-summary', 'GSF2056-run2-plates3-4-demultiplexing-summary', 'SampleSheet-GSF2124-run3-plates1-2', 'GSF2124 Lennon Run 3 Plates 3-4 Run Summary Sorted', 'GSF2124-run5-plates5-6-demultiplexing-summay' ] ignore_lines = [ 'Undetermined', 'Sample', 'Lane', 'Sample_ID', ' Chemistry', 'Description', 'Assay', 'Application', 'Workflow', 'Date', 'Experiment Name', 'IEMFileVersion', 'Lane Summary', '"GSF2124 Lennon Plates 5-6', 'GSF2124-plates5-6-run5 Summary', '', 'Chemistry', 'GSF2056-run2-plates3-4 Lennon Summary', 'GSF2056-run1-plates1-2 Lennon/Shoemaker Summary' ] GSF_bc_dict = {} df_out = open( pt.get_path() + '/data/library_metadata/' + 'new_sample_names.txt', 'w') meta_path = open( pt.get_path() + '/data/library_metadata/' + 'sample_names.txt', 'r') for GSF_file in GSF_files: GSF_file_ = open( pt.get_path() + '/data/library_metadata/' + GSF_file + '.csv', 'r') for GSF_line in GSF_file_: GSF_line = GSF_line.strip() #.split(',') if len(GSF_line) < 20: continue GSF_line = GSF_line.split(',') if GSF_line[0] in ignore_lines: continue if GSF_line[2] == 'Undetermined': continue if GSF_line[0] == '1': GSF_line = GSF_line[2:] GSF_bc_dict[GSF_line[0]] = {} if '+' in GSF_line[1]: BC_split = GSF_line[1].split('+') GSF_BC1 = BC_split[0] GSF_BC2 = BC_split[1] else: GSF_BC1 = GSF_line[5] GSF_BC2 = GSF_line[7] GSF_bc_dict[GSF_line[0]]['BC1'] = GSF_BC1 GSF_bc_dict[GSF_line[0]]['BC2'] = GSF_BC2 for line in meta_path: line = line.strip() line_dash = line.split('/') run = line_dash[0] if 'GSF' not in run: run = 'HCGS' + run if '_' in run: run = run.replace('_', '-') file_name = line_dash[-1] file_name_spl = re.split('-|_', file_name) if file_name_spl[0] == 'GSF2124': gsf_bc_key = file_name.rsplit('_', 3)[0] BC1 = GSF_bc_dict[gsf_bc_key]['BC1'] BC2 = GSF_bc_dict[gsf_bc_key]['BC2'] if len(file_name_spl) == 9: pop = file_name_spl[4] day = 
file_name_spl[5][1:] R = file_name_spl[-2] elif len(file_name_spl) == 10: pop = file_name_spl[3] + file_name_spl[4] + file_name_spl[5] day = file_name_spl[6] R = file_name_spl[-2] elif len(file_name_spl) == 11: pop = file_name_spl[4] + file_name_spl[5] + file_name_spl[6] day = file_name_spl[7] R = file_name_spl[-2] elif file_name_spl[0] == 'GSF2056': gsf_bc_key = file_name.rsplit('_', 3)[0] BC1 = GSF_bc_dict[gsf_bc_key]['BC1'] BC2 = GSF_bc_dict[gsf_bc_key]['BC2'] if len(file_name_spl) == 13: pop = file_name_spl[3] + file_name_spl[7] + file_name_spl[8] day = file_name_spl[9] R = file_name_spl[-2] end = file_name_spl[-1] elif len(file_name_spl) == 14: pop = file_name_spl[4] + file_name_spl[8] + file_name_spl[9] day = file_name_spl[10] R = file_name_spl[-2] end = file_name_spl[-1] elif 'HCGS' in run: if len(file_name_spl) == 8: pop = file_name_spl[0] day = file_name_spl[1] BC1 = file_name_spl[3] BC2 = file_name_spl[4] R = file_name_spl[-2] end = file_name_spl[-1] elif len(file_name_spl) == 9: pop = file_name_spl[1] + file_name_spl[2] day = file_name_spl[3][1:] BC1 = file_name_spl[4] BC2 = file_name_spl[5] R = file_name_spl[-2] end = file_name_spl[-1] elif len(file_name_spl) == 6: pop = file_name_spl[0][1:] day = '100' BC1 = file_name_spl[1] BC2 = file_name_spl[2] R = file_name_spl[4] end = file_name_spl[-1] elif len(file_name_spl) == 7: pop = file_name_spl[0] day = file_name_spl[1] BC1 = file_name_spl[2] BC2 = file_name_spl[3] R = file_name_spl[5] end = file_name_spl[-1] if 'L' in pop: pop = pop.replace('L', '') new_name = '_'.join([run, pop, day, BC1, BC2, R, end])