# Compares HMM-inferred vs true mutation appearance/transit times: collects per-population
# values, computes error distributions (HMM minus truth), and their unnormalized survival curves.
# NOTE(review): newlines appear stripped — multiple statements are fused onto one physical line
# (SyntaxError as written); restore original line breaks from the source file.
# NOTE(review): the leading .append() calls and the focal-population check presumably sit inside
# a per-population loop whose header is outside this excerpt — confirm before reformatting.
# Python 2 print statements ("print len(...)") — this file targets Python 2.
hmm_appearance_times.append(hmm_appearance_time) true_appearance_times.append(true_appearance_time) hmm_transit_times.append(hmm_transit_time) true_transit_times.append(true_transit_time) if population_idx == focal_population_idx: focal_population_empirical_freqs = empirical_freqs hmm_appearance_times = numpy.array(hmm_appearance_times) true_appearance_times = numpy.array(true_appearance_times) hmm_transit_times = numpy.array(hmm_transit_times) true_transit_times = numpy.array(true_transit_times) transit_errors = hmm_transit_times - true_transit_times transit_error_xs, transit_error_survivals = stats_utils.calculate_unnormalized_survival_from_vector( transit_errors) appearance_errors = hmm_appearance_times - true_appearance_times error_xs, error_survivals = stats_utils.calculate_unnormalized_survival_from_vector( appearance_errors) bins = numpy.arange(-20, 21) * 500 + 250 print len(total_snp_trajectories), "populations" print len(appearance_errors), "trajectories" #### # # Set up figure # ####
# Plots the per-species genomic coverage CDF, then loads pangenome data and builds survival
# curves for median/marker/mean marker coverages (clipped into [0.2, 1e4] for log plotting).
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): `sample_coverage_histogram`, `median_coverages`, `median_marker_coverages`,
# and `mean_marker_coverages` are defined outside this excerpt — presumably computed earlier
# in the script; verify before reuse.
#x0 = marker_coverages[i] xs, CDFs = stats_utils.calculate_unnormalized_CDF_from_histogram(sample_coverage_histogram) pylab.plot(xs, CDFs[-1]-CDFs, '-') pylab.semilogx([1],[1]) pylab.xlabel('Coverage, D') pylab.ylabel('Fraction sites with coverage >= D') #pylab.xlim([1e-01,1e01]) pylab.savefig('%s/%s_genomic_coverage_distribution.pdf' % (parse_midas_data.analysis_directory,species_name),bbox_inches='tight',transparent=True) pylab.figure(2) median_coverages.sort() median_coverage_xs, median_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_coverages, min_x=0.1, max_x=10000) # Load gene coverage information for species_name sys.stderr.write("Loading pangenome data for %s...\n" % species_name) gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name) sys.stderr.write("Done!\n") marker_coverages = numpy.clip(marker_coverages, 2e-01,1e04) marker_coverages.sort() marker_coverage_xs, marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_coverages, min_x=0.1, max_x=10000) median_marker_coverage_xs, median_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_marker_coverages, min_x=0.1, max_x=10000) mean_marker_coverage_xs, mean_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(mean_marker_coverages, min_x=0.1, max_x=10000)
# Prints a per-divergence-threshold table of polymorphic vs phylogenetically inconsistent
# variant counts, then plots survival curves of polymorphic/inconsistent SNP frequencies.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the bare tuple `variant_type, polymorphic_variant_types[i][variant_type],
# inconsistent_variant_types[i][variant_type]` is an expression with no effect — it looks
# like a missing `print` (Python 2) at the start of that statement; confirm against the
# original script.
# NOTE(review): the leading prints index with `[i]`, suggesting an enclosing loop over i
# whose header is outside this excerpt; `inconsistent_freqs[1]` (hard-coded index 1) may be
# deliberate (second d threshold only) — verify.
print "d=", max_ds[i] print "Site", "Polymorphic", "Inconsistent" for variant_type in sorted(polymorphic_variant_types[i].keys()): variant_type, polymorphic_variant_types[i][ variant_type], inconsistent_variant_types[i][variant_type] print "" pylab.figure(4, figsize=(3.42, 2)) pylab.suptitle(species_name) for i in xrange(0, len(polymorphic_freqs)): if len(polymorphic_freqs[i]) == 0: continue xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( polymorphic_freqs[i]) pylab.step(xs, ns * 1.0 / ns[0], '-', label='Polymorphic ($d=%g$)' % max_ds[i]) print 'Polymorphic (d=%g), n=%g' % (max_ds[i], ns[0]) if len(inconsistent_freqs[1]) > 0: xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( inconsistent_freqs[1]) pylab.step(xs, ns * 1.0 / ns[0], 'r-', linewidth=2, label=('Inconsistent ($d=%g$)' % max_ds[1]))
# Loads HMP subject/sample metadata, computes the number of timepoints per subject, and
# plots the survival curve "num subjects with >= T timepoints" on a semilog-y axis.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the final parse_global_marker_gene_coverages(...) call is cut off at the
# chunk boundary — its arguments continue outside this excerpt.
combination_type = "sample" # Load subject and sample metadata sys.stderr.write("Loading HMP metadata...\n") subject_sample_map = parse_midas_data.parse_subject_sample_map() sys.stderr.write("Done!\n") # Calculate num timepoints per sample num_timepoints_per_subject = [] for subject in subject_sample_map.keys(): num_timepoints_per_subject.append(len(subject_sample_map[subject].keys())) num_timepoints_per_subject.sort() num_timepoints_per_subject = numpy.array(num_timepoints_per_subject) num_timepoints, num_subjects = stats_utils.calculate_unnormalized_survival_from_vector( num_timepoints_per_subject) pylab.figure(1, figsize=(5, 3)) pylab.step(num_timepoints + 0.25, num_subjects, where='pre') pylab.semilogy([0], [1]) pylab.xlim([0.5, 9]) pylab.ylim([0.3, 300]) pylab.xlabel('Num timepoints, $T$') pylab.ylabel('Num subjects with $\geq T$') print len(num_timepoints_per_subject), max(num_timepoints_per_subject) pylab.savefig('%s/num_timepoints_per_subject.pdf' % parse_midas_data.analysis_directory, bbox_inches='tight') # Load marker gene coverages species_coverage_matrix, sample_list, species_list = parse_midas_data.parse_global_marker_gene_coverages(
# Builds CDF-style curves (1 - survival) of singleton divergences `ds` vs a random null
# `random_ds`, then scans dstar thresholds to compute the fraction nonsynonymous (vs>0.5=1D).
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): leading appends presumably sit inside a loop outside this excerpt; the final
# `if less_idxs.sum() > 1:` suite continues past the chunk boundary.
# Note `print(vs > 0.5).sum(), "1D"` parses in Python 2 as `print (vs>0.5).sum(), "1D"`.
ds.append(d) vs.append(v) random_ds = numpy.array(random_ds) ds = numpy.array(ds) vs = numpy.array(vs) sys.stderr.write("Done!\n") print len(ds), "total singletons" print(vs > 0.5).sum(), "1D" print(vs < 0.5).sum(), "4D" # Now plot them. xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(random_ds) d_axis.step(xs, 1 - ns * 1.0 / ns[0], '-', color='0.7') xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(ds) d_axis.step(xs, 1 - ns * 1.0 / ns[0], 'b-') d_axis.semilogx([1e-05, 2e-05], [1, 1]) dstars = numpy.logspace(-4, -2, 20) fraction_nonsynonymous = [] for dstar in dstars: less_idxs = (ds <= dstar) if less_idxs.sum() > 1:
# Styles the gene-difference axis and plots CDFs (1 - survival) of SNP/gene differences for
# closely related (low-divergence) host pairs vs normal-divergence pairs.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): suspected copy-paste bug — `snp_difference_axis.step(...)` is fed
# `low_divergence_gene_differences`, identical to the gene_difference_axis call below;
# it presumably should use `low_divergence_snp_differences` (which is built here but never
# plotted in this excerpt). Confirm against the original script before changing.
# The final calculate_unnormalized_survival_from_vector( call is cut off at the chunk edge.
gene_difference_axis.spines['right'].set_visible(False) gene_difference_axis.get_xaxis().tick_bottom() gene_difference_axis.get_yaxis().tick_left() gene_difference_axis.semilogx([1, 1]) gene_difference_axis.set_xlim([1, 1e04]) gene_difference_axis.set_ylim([0, 1.174]) low_divergence_snp_differences = numpy.array(low_divergence_snp_differences) low_divergence_gene_differences = numpy.array(low_divergence_gene_differences) low_divergence_clock_null_gene_differences = numpy.array( low_divergence_clock_null_gene_differences) normal_divergence_gene_differences = numpy.array( normal_divergence_gene_differences) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( low_divergence_gene_differences, min_x=0.1, max_x=1e04) snp_difference_axis.step(xs, 1 - ns * 1.0 / ns[0], 'r-', label='Closely\nrelated', zorder=1) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( low_divergence_gene_differences, min_x=0.1, max_x=1e04) gene_difference_axis.step(xs, 1 - ns * 1.0 / ns[0], 'r-', label='Closely\nrelated', zorder=1) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
# Clips pooled within/between-host SNP change distributions into [0.1, 1e8] (so zeros plot on
# log axes), shades the modification-difference region, and steps their survival curves.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the final pooled_snp_axis.step(xs, ns, ... call is cut off at the chunk edge.
pooled_snp_change_distribution = numpy.clip(pooled_snp_change_distribution, 1e-01, 1e08) pooled_between_snp_change_distribution = numpy.clip( pooled_between_snp_change_distribution, 1e-01, 1e08) pooled_min_between_snp_change_distribution = numpy.clip( pooled_min_between_snp_change_distribution, 1e-01, 1e08) pooled_snp_axis.fill_between([modification_difference_threshold, 1e05], [1, 1], [1e03, 1e03], color='0.8') #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_between_snp_change_distribution, min_x=1e-02, max_x=1e09) #pooled_snp_axis.step(xs,ns,'-',color='r',linewidth=0.5, alpha=0.5, label='Between-host', where='mid') xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( pooled_min_between_snp_change_distribution, min_x=1e-02, max_x=1e09) pooled_snp_axis.step(xs, ns, '-', color='r', linewidth=0.5, alpha=0.5, label='Between-host', where='mid') xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( pooled_snp_change_distribution, min_x=1e-02, max_x=1e09) pooled_snp_axis.step(xs, ns,
# Scatter-plots (jittered) haploid vs high-coverage sample counts, builds a binomial
# bootstrap null (pooled average haploid probability pavg) with 100 replicates, and overlays
# survival curves of null vs observed per-sample haploid fractions.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): `normal` and `binomial` are presumably numpy.random draws imported elsewhere
# in the script — the bootstrap is therefore stochastic (no visible seed); verify.
# The trailing `for species_idx ...:` loop body continues past the chunk boundary.
haploid_distribution_axis.plot(sample_highcoverage_counts+normal(0,0.1,size=sample_highcoverage_counts.shape), sample_haploid_counts+normal(0,0.1,size=sample_highcoverage_counts.shape),'.',color=haploid_color,alpha=0.5,markersize=2) pavg = sample_haploid_counts.sum()*1.0/sample_highcoverage_counts.sum() num_bootstraps=100 bootstrapped_haploid_countss = [] for bootstrap_idx in xrange(0,num_bootstraps): bootstrapped_haploid_countss.append( binomial(sample_highcoverage_counts, pavg) ) pooled_bootstrapped_haploid_fractions = [] for bootstrap_idx in xrange(0,num_bootstraps): pooled_bootstrapped_haploid_fractions.extend( bootstrapped_haploid_countss[bootstrap_idx][sample_highcoverage_counts>=1]*1.0/sample_highcoverage_counts[sample_highcoverage_counts>=1] ) pooled_bootstrapped_haploid_fractions = numpy.array( pooled_bootstrapped_haploid_fractions ) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_bootstrapped_haploid_fractions ) haploid_cdf_axis.step(xs,ns*1.0/ns[0],'-',color='0.7',label='Null') xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(sample_haploid_fractions[sample_highcoverage_counts>=1]) haploid_cdf_axis.step(xs,ns*1.0/ns[0],'-',color=haploid_color,label='Obs') haploid_cdf_axis.set_xlim([0,1]) haploid_cdf_axis.legend(loc='upper right',frameon=False,numpoints=1) ######### # # Haploid distribution # for species_idx in xrange(0,len(num_haploid_samples)):
# Computes fractional gene prevalences (copynum >= 0.3) on unique, sufficiently-covered
# samples, then builds survival curves for the pan-genome and for reference / metaphlan2 /
# MIDAS-marker gene subsets (the comparison plots themselves are commented out).
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): `marker_genes`, the *_gene_idxs boolean masks, and `min_coverage` are
# defined outside this excerpt; the boolean-mask multiplication `(...)*(...)` acts as a
# logical AND over sample index arrays.
print marker_genes print marker_gene_idxs.sum() sample_idxs = (parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples))*(marker_coverages>=min_coverage) prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(gene_depth_matrix[:,sample_idxs], marker_coverages[sample_idxs],min_copynum=0.3) reference_prevalences = prevalences[reference_gene_idxs] metaphlan2_prevalences = prevalences[metaphlan2_gene_idxs] marker_prevalences = prevalences[marker_gene_idxs] print marker_prevalences pangenome_xs, pangenome_survivals = stats_utils.calculate_unnormalized_survival_from_vector(prevalences, min_x=0, max_x=1) reference_xs, reference_survivals = stats_utils.calculate_unnormalized_survival_from_vector(reference_prevalences, min_x=0, max_x=1) metaphlan2_xs, metaphlan2_survivals = stats_utils.calculate_unnormalized_survival_from_vector(metaphlan2_prevalences, min_x=0, max_x=1) marker_xs, marker_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_prevalences, min_x=0, max_x=1) pylab.figure(1,figsize=(3.42,4)) pylab.title(species_name) #pylab.step(pangenome_xs, pangenome_survivals/pangenome_survivals[0],label='Pan-genome') #pylab.step(reference_xs, reference_survivals/reference_survivals[0],label='Reference') #pylab.step(metaphlan2_xs, metaphlan2_survivals/metaphlan2_survivals[0],label='Metaphlan2') #pylab.step(marker_xs, marker_survivals/marker_survivals[0],label='MIDAS Marker') #pylab.ylim([1e-02,1])
# Sets up a single-panel prevalence figure and plots normalized survival curves ("fraction
# of genes with prevalence >= p") for the pan-genome and for between-host gene differences,
# with an extra curve for the low-divergence subset when it is non-empty.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the final calculate_unnormalized_survival_from_vector( call inside the
# `if len(...) > 0:` suite is cut off at the chunk boundary.
# Set up figure prevalence_fig = plt.figure(figsize=(3.42, 2)) # Set up grids to hold figure panels outer_grid = gridspec.GridSpec(1, 1) prevalence_axis = plt.Subplot(prevalence_fig, outer_grid[0]) prevalence_fig.add_subplot(prevalence_axis) prevalence_fig.suptitle(species_name, fontsize=7) prevalence_axis.set_ylabel('Fraction genes $\geq p$') prevalence_axis.set_xlabel('Prevalence of gene, $p$') prevalence_axis.set_xlim([0, 1]) prevalence_axis.set_ylim([0, 1]) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( pangenome_prevalences) prevalence_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='Total pan genome') xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( between_host_gene_prevalences) prevalence_axis.step(xs, ns * 1.0 / ns[0], 'r-', label='Between host differences') if len(low_divergence_between_host_gene_prevalences) > 0: print low_divergence_between_host_gene_prevalences print low_divergence_between_host_gene_prevalences.mean() print len(low_divergence_between_host_gene_prevalences), len( between_host_gene_prevalences) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
# Computes a bootstrap KS p-value for each variant type's restricted appearance-time
# distribution, then plots normalized survival curves of pooled / restricted / per-variant-type
# appearance times.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the leading statements index by `var_type`, suggesting an enclosing
# per-variant-type loop whose header is outside this excerpt; `restricted_appearance_times`
# (second arg to calculate_ks_distance) is also defined outside this view — verify it is the
# intended comparison distribution.
bootstrapped_restricted_appearance_times[var_type] = numpy.array(bootstrapped_restricted_appearance_times[var_type]) D = stats_utils.calculate_ks_distance(observed_restricted_appearance_times[var_type], restricted_appearance_times) pvalue = ((bootstrapped_restricted_kss[var_type]>=D).sum()+1.0)/(len(bootstrapped_restricted_kss[var_type])+1.0) sys.stdout.write('%s: %g\n' % (var_type, pvalue)) ###### # # Now do plotting # ###### all_ts, all_survivals = stats_utils.calculate_unnormalized_survival_from_vector(pooled_appearance_times, min_x=-1000,max_x=100000) time_axis.step(all_ts, all_survivals/all_survivals[0], color='k', label='All types') #missense_time_axis.step(all_ts, all_survivals/all_survivals[0], color='k', label='All') restricted_ts, restricted_survivals = stats_utils.calculate_unnormalized_survival_from_vector(restricted_appearance_times, min_x=-1000,max_x=100000) #missense_time_axis.step(all_ts, restricted_survivals/restricted_survivals[0], color='k', label='All (excluding sv)',alpha=0.5) for var_type in parse_file.var_types: color = figure_utils.get_var_type_color(var_type) vartype_ts, vartype_survivals = stats_utils.calculate_unnormalized_survival_from_vector(observed_appearance_times[var_type], min_x=-1000, max_x=100000) time_axis.step(vartype_ts, vartype_survivals/vartype_survivals[0], color=color, alpha=0.7) #, label=var_type)
# Plots the pooled site-frequency spectra (4D synonymous vs 1D nonsynonymous, each rescaled
# by f(1-f) and normalized), saves them, then plots normalized survival curves of
# phylogenetically inconsistent SNP frequencies vs the unlinked null expectation.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
pylab.plot(mafs, synonymous_sfs*mafs*(1-mafs)/(synonymous_sfs*mafs*(1-mafs)).sum(), 'b.-',label='4D') pylab.plot(mafs, nonsynonymous_sfs*mafs*(1-mafs)/(nonsynonymous_sfs*mafs*(1-mafs)).sum(),'r.-',label='1D') pylab.xlim([0,0.5]) pylab.legend(loc='upper right',frameon=False,fontsize=6) pylab.savefig('%s/%s_pooled_sfs.pdf' % (parse_midas_data.analysis_directory, species_name), bbox_inches='tight') pylab.savefig('%s/%s_pooled_sfs.png' % (parse_midas_data.analysis_directory, species_name), bbox_inches='tight', dpi=300) pylab.figure(2,figsize=(3.42,2)) pylab.suptitle(species_name) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(polymorphic_freqs) #pylab.step(xs,ns*1.0/ns[0],'b-',label='All polymorphisms') if len(null_inconsistent_freqs)>0: xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(null_inconsistent_freqs) pylab.step(xs,ns*1.0/ns[0],'-',color='0.7',linewidth=0.5, label=('Unlinked expectation')) if len(inconsistent_freqs)>0: xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(inconsistent_freqs) pylab.step(xs,ns*1.0/ns[0],'r-',label=('Inconsistent ($d=%g$)' % max_clade_d)) pylab.xlim([0,0.5]) pylab.ylim([0,1]) pylab.xlabel('Within-clade MAF, $f$') pylab.ylabel('SNPs $\geq f$') pylab.legend(loc='upper right', frameon=False,fontsize=6) pylab.savefig('%s/%s_phylogenetically_inconsistent_sfs.pdf' % (parse_midas_data.analysis_directory,species_name),bbox_inches='tight') pylab.savefig('%s/%s_phylogenetically_inconsistent_sfs.png' % (parse_midas_data.analysis_directory,species_name),bbox_inches='tight',dpi=300)
# Accumulates pairwise and closest (minimum) SNP substitution rates per sample, then plots
# their normalized survival curves on a step plot.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): this excerpt begins mid-condition (`snp_samples[j]]:`) and ends mid-call —
# the enclosing double loop over (i, j) and the final pylab.step arguments are outside this
# view; do not reconstruct without the original file.
snp_samples[j]]: pair_snp_substitution_rates.append( snp_substitution_matrix[i, j]) if snp_substitution_matrix[i, j] < min_substitution_rate: min_substitution_rate = snp_substitution_matrix[i, j] closest_snp_substitution_rates.append(min_substitution_rate) all_closest_rates.extend(closest_snp_substitution_rates) all_pair_rates.extend(pair_snp_substitution_rates) print numpy.sort(all_closest_rates) print numpy.sort(all_pair_rates) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( all_closest_rates, min_x=1e-06, max_x=1e09) pylab.step(xs, ns / ns[0], '-', color='r', linewidth=0.5, alpha=0.5, label='Between-host', where='mid', zorder=2) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( all_pair_rates, min_x=1e-06, max_x=1e09) pylab.step(xs, ns / ns[0], '-',
# Sets up a prevalence axis and plots CDFs (1 - survival) of expected-number p-values for
# within-host present genes vs between-host gene changes (gene-loss analysis).
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the final calculate_unnormalized_survival_from_vector( call is cut off at
# the chunk boundary.
################################################### pylab.figure(figsize=(6, 6)) prevalence_axis = pylab.subplot(111) prevalence_axis.set_ylabel('Fraction genes $\leq p$', labelpad=2) prevalence_axis.set_xlabel('Expected number, $p$', labelpad=2) prevalence_axis.set_xlim([0, 20]) #prevalence_axis.set_ylim([0,1.1]) prevalence_axis.spines['top'].set_visible(False) prevalence_axis.spines['right'].set_visible(False) prevalence_axis.get_xaxis().tick_bottom() prevalence_axis.get_yaxis().tick_left() xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( numpy.asarray(p_val_arrays['losses']['present'])) prevalence_axis.step(xs, 1 - ns * 1.0 / ns[0], 'b-', label='Within-host present genes', zorder=2) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( numpy.asarray(p_val_arrays['losses']['pangenome'])) prevalence_axis.step(xs, 1 - ns * 1.0 / ns[0], 'r-', label='Between-host gene changes', zorder=1) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
# Pools per-gene log p-values (minor/major), builds null log-p survival functions from the
# parallelism statistics, and finds the FDR threshold p* — the smallest observed p where
# (null survival / observed survival) < FDR — plus the number of significant genes.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the leading `if .. >= nmin:` presumably sits inside a per-gene loop, and the
# final major-threshold expression is cut off at the chunk boundary; `pooled_pvalues_minor`,
# nmin, and FDR are defined outside this excerpt. The `continue` implies an enclosing loop.
if gene_parallelism_statistics_major[gene_name]['observed'] >= nmin: print(gene_parallelism_statistics_major[gene_name]) pooled_pvalues_major.append(gene_logpvalues_major[gene_name]) pooled_pvalues_major = numpy.asarray(pooled_pvalues_major) if len(pooled_pvalues_major) == 0: continue pooled_pvalues_major.sort() print(pooled_pvalues_major) null_pvalue_survival_minor = mutation_spectrum_utils.NullGeneLogpSurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics_minor, nmin=nmin) null_pvalue_survival_major = mutation_spectrum_utils.NullGeneLogpSurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics_major, nmin=nmin) observed_ps_minor, observed_pvalue_survival_minor = stats_utils.calculate_unnormalized_survival_from_vector( pooled_pvalues_minor, min_x=-4) observed_ps_major, observed_pvalue_survival_major = stats_utils.calculate_unnormalized_survival_from_vector( pooled_pvalues_major, min_x=-8) # Pvalue version threshold_idx_minor = numpy.nonzero( (null_pvalue_survival_minor(observed_ps_minor) * 1.0 / observed_pvalue_survival_minor) < FDR)[0][0] pstar_minor = observed_ps_minor[ threshold_idx_minor] # lowest value where this is true num_significant_minor = observed_pvalue_survival_minor[ threshold_idx_minor] # Pvalue version for minor threshold_idx_major = numpy.nonzero( (null_pvalue_survival_major(observed_ps_major) * 1.0 /
# Maps subject-pair index sets onto the gene hamming matrix, then computes normalized
# survival curves for within-subject (timepoint) distances, between-subject distances, and
# the per-gene presence counts.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the .sort() calls before each survival computation look redundant if
# calculate_unnormalized_survival_from_vector sorts internally — harmless, but confirm.
# Calculate which pairs of idxs belong to the same sample, which to the same subject # and which to different subjects high_coverage_same_sample_idxs, high_coverage_same_subject_idxs, high_coverage_diff_subject_idxs = parse_midas_data.calculate_subject_pairs( subject_sample_map, high_coverage_samples) same_sample_idxs = parse_midas_data.apply_sample_index_map_to_indices( sample_idx_map, high_coverage_same_sample_idxs) same_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices( sample_idx_map, high_coverage_same_subject_idxs) diff_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices( sample_idx_map, high_coverage_diff_subject_idxs) hamming_timepoints = gene_hamming_matrix[same_subject_idxs] hamming_timepoints.sort() hamming_timepoints_dns, hamming_timepoints_survivals = stats_utils.calculate_unnormalized_survival_from_vector( hamming_timepoints, min_x=0.1, max_x=1e05) hamming_timepoints_survivals /= hamming_timepoints_survivals[0] hamming_between = gene_hamming_matrix[diff_subject_idxs] hamming_between.sort() hamming_between_dns, hamming_between_survivals = stats_utils.calculate_unnormalized_survival_from_vector( hamming_between, min_x=0.1, max_x=1e05) hamming_between_survivals /= hamming_between_survivals[0] gene_counts = gene_presence_matrix.sum(axis=0) gene_counts.sort() gene_count_ns, gene_count_survivals = stats_utils.calculate_unnormalized_survival_from_vector( gene_counts, min_x=0.1, max_x=1e05) gene_count_survivals /= gene_count_survivals[0] print "Median gene count=", numpy.median(gene_presence_matrix.sum(axis=0))
# Histograms gene prevalences for low-divergence between-host and within-host changes
# (histogram plots commented out), then plots CDFs (1 - survival) of within-host vs
# between-host gene prevalences.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the final calculate_unnormalized_survival_from_vector( call is cut off at
# the chunk boundary; prevalence_bins and the prevalence arrays are defined outside view.
print low_divergence_between_host_gene_prevalences.mean() print len(low_divergence_between_host_gene_prevalences), len( between_host_gene_prevalences) h = numpy.histogram(low_divergence_between_host_gene_prevalences, bins=prevalence_bins)[0] #prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'r.-',label=('d<%g' % modification_divergence_threshold), alpha=0.5,markersize=3) h = numpy.histogram(within_host_gene_prevalences, bins=prevalence_bins)[0] #prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'b.-',label='Within-host',markersize=3) print len(within_host_gene_prevalences), "within-host changes" # CDF version xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( within_host_gene_prevalences) prevalence_axis.step(xs, 1 - ns * 1.0 / ns[0], 'b-', label='Within-host', zorder=2) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( between_host_gene_prevalences) prevalence_axis.step(xs, 1 - ns * 1.0 / ns[0], 'r-', label='Between-host', zorder=1) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
# Computes gene-level parallelism statistics from the convergence matrix, then builds
# observed-vs-null survival curves for the gene G scores (G = n*g) and for the gene
# multiplicity statistic.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the trailing "# Do same thing for num hits" section continues past the
# chunk boundary.
# Calculate basic parallellism statistics gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics( convergence_matrix, populations) # Calculate G score for entire gene (G=n*g) gene_G_scores = mutation_spectrum_utils.calculate_G_scores( gene_parallelism_statistics) pooled_G_scores = numpy.asarray(list(gene_G_scores.values())) pooled_G_scores.sort() null_G_survival = mutation_spectrum_utils.NullGeneGSurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics) observed_Gs, observed_G_survival = stats_utils.calculate_unnormalized_survival_from_vector( pooled_G_scores) # Do same thing for multiplicity statistic pooled_multiplicities = numpy.array([ gene_parallelism_statistics[gene_name]['multiplicity'] for gene_name in gene_parallelism_statistics.keys() ]) pooled_multiplicities.sort() null_multiplicity_survival = mutation_spectrum_utils.NullGeneMultiplicitySurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics) observed_ms, observed_multiplicity_survival = stats_utils.calculate_unnormalized_survival_from_vector( pooled_multiplicities) # Do same thing for num hits
# Extends bootstrap (binomial-resampled) doubleton sharing fractions, then plots normalized
# survival curves of the real all-pairs vs closely-related (low dS) sharing fractions,
# restricted to pairs with more than min_opportunities doubleton opportunities.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the leading .extend() calls presumably sit inside a bootstrap loop outside
# this excerpt, and the final sharing_axis.step call is cut off at the chunk boundary;
# `binomial` is presumably numpy.random.binomial imported elsewhere.
bootstrapped_fake_low_ps.extend( binomial(sample_sizes, low_p) * 1.0 / sample_sizes) bootstrapped_fake_all_ps.extend( binomial(sample_sizes, all_p) * 1.0 / sample_sizes) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_low_ps, min_x=0,max_x=2) #sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (matched)',zorder=3) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_all_ps, min_x=0,max_x=2) #sharing_axis.step(xs,ns*1.0/ns[0],'k-',label='All (matched)',zorder=2) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_fake_low_ps, min_x=0,max_x=1) #sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (pooled)',zorder=1,alpha=0.5) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( real_all_ps[all_doubleton_opportunities > min_opportunities], min_x=0, max_x=2) sharing_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='Between hosts (all)', zorder=1) #,alpha=0.5) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( real_low_ps[low_doubleton_opportunities > min_opportunities], min_x=0, max_x=2) sharing_axis.step(xs, ns * 1.0 / ns[0], 'r-', label='Between hosts\n(closely related)',
# Draws early/late dG envelopes and curves vs t* on g_axis, then computes multiplicity
# survival curves for mutations before/after tstar (and all) and overlays the theoretical
# null survival on a log-log multiplicity axis.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): this excerpt begins mid-call (`color='0.7', linewidth=0)`) and ends
# mid-call (multiplicity_axis.step label='All') — the surrounding arguments are outside
# this view.
color='0.7', linewidth=0) g_axis.plot(tstars, upper_early_dGs, color='0.6', linewidth=0.25) g_axis.fill_between(tstars, lower_late_dGs, numpy.zeros_like(tstars), color='0.7', linewidth=0) g_axis.plot(tstars, lower_late_dGs, color='0.6', linewidth=0.25) g_axis.plot(tstars, early_dGs, '-', color=early_color, label='$\leq t^*$') g_axis.plot(tstars, late_dGs, '-', color=late_color, label='$> t^*$') g_axis.legend(loc='upper right', frameon=False) early_survival_ms, early_survivals = stats_utils.calculate_unnormalized_survival_from_vector( ms[ts <= tstar], min_x=0.1, max_x=100) late_survival_ms, late_survivals = stats_utils.calculate_unnormalized_survival_from_vector( ms[ts > tstar], min_x=0.1, max_x=100) all_survival_ms, all_survivals = stats_utils.calculate_unnormalized_survival_from_vector( ms, min_x=0.1, max_x=100) theory_ms = numpy.logspace(0, 2, 100) multiplicity_axis.loglog(theory_ms, null_survival_function(theory_ms), color='0.7', linewidth=0.5) multiplicity_axis.step(all_survival_ms, all_survivals * 1.0 / all_survivals[0], color=all_color, linewidth=0.5, label='All')
# Builds the s(t) ratio series from nearest-time X and M samples (guarding division by zero
# with `+(Ms[...]==0)`), computes the transit-time survival for the population, and plots
# fixation trajectories for nonmutator LTEE lines.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the leading statements presumably sit inside a loop over times t (and an
# enclosing per-population loop); `nonmutator_idx`, Ms/fixed_Ms, and the trajectory dicts
# are defined outside this excerpt — confirm context before reformatting.
x_t_idx = numpy.fabs(Xts-t).argmin() m_t_idx = numpy.fabs(Mts-t).argmin() sts.append(t) ss.append( Xs[x_t_idx]/(Ms[m_t_idx]+(Ms[m_t_idx]==0) ) ) #if population in parse_file.complete_nonmutator_lines: # print t, Xts[t_idx], Mts[t_idx], Xs[t_idx], Ms[t_idx], ss[-1] sts = numpy.array(sts) ss = numpy.array(ss) transit_times[population].sort() transit_times[population] = numpy.array(transit_times[population]) dts, dt_survival = stats_utils.calculate_unnormalized_survival_from_vector(transit_times[population], min_x=0) if population in parse_file.complete_nonmutator_lines: # We're dealing with a non-mutator population colorVal = parse_file.get_line_color(population) linestyle = 'o-' zorder = 12-nonmutator_idx nonmutator_idx += 1 fixation_axis.plot(Ms, fixed_Ms, linestyle, color=colorVal, alpha=1, markersize=1,linewidth=0.5,zorder=zorder, markeredgewidth=0) late_Xts, late_Xs, late_std_Xs = late_fitness_trajectories[population]
# Plots unnormalized survival curves of per-sample copy numbers for each desired gene, plus
# the element-wise minimum over the desired genes ("Both"), and saves a hard-coded PDF.
# NOTE(review): newlines appear stripped — statements fused onto one line; restore breaks.
# NOTE(review): the per-gene statements presumably sit inside the visible `for gene_name`
# loop while the 'Both' curve and savefig come after it — indentation must be reconstructed
# from the original file. The output path is hard-coded to a collaborator directory.
min_copynum_distribution = gene_copynum_matrix[desired_gene_idxs,:].min(axis=0) for gene_name in desired_gene_names: gene_idx = numpy.nonzero(gene_names==gene_name)[0][0] gene_copynum_distribution = gene_copynum_matrix[gene_idx,:] print gene_copynum_matrix.shape, gene_copynum_distribution.shape #print gene_copynum_distribution xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector(gene_copynum_distribution, min_x=0, max_x=gene_copynum_distribution.max()) pylab.step(xvalues,ns,label=gene_name) #pylab.semilogy([4],[4]) xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector(min_copynum_distribution, min_x=0, max_x=min_copynum_distribution.max()) pylab.step(xvalues,ns,label='Both') pylab.legend(loc='upper right',frameon=False) pylab.savefig('../morteza_collaboration/ben_figures/Alistipes_onderdonkii_gene_gain_HMP_prevalence.pdf',bbox_inches='tight') #pylab.show()