Ejemplo n.º 1
0
        # Accumulate per-trajectory timing estimates: HMM-inferred vs. ground truth.
        hmm_appearance_times.append(hmm_appearance_time)
        true_appearance_times.append(true_appearance_time)
        hmm_transit_times.append(hmm_transit_time)
        true_transit_times.append(true_transit_time)

    # Remember the empirical allele frequencies of the focal population.
    if population_idx == focal_population_idx:
        focal_population_empirical_freqs = empirical_freqs

# Convert the accumulators to numpy arrays for vectorized arithmetic below.
hmm_appearance_times = numpy.array(hmm_appearance_times)
true_appearance_times = numpy.array(true_appearance_times)
hmm_transit_times = numpy.array(hmm_transit_times)
true_transit_times = numpy.array(true_transit_times)

# Error = HMM estimate minus truth; survival curves summarize each error distribution.
transit_errors = hmm_transit_times - true_transit_times
transit_error_xs, transit_error_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    transit_errors)

appearance_errors = hmm_appearance_times - true_appearance_times
error_xs, error_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    appearance_errors)

# Bin edges spaced 500 apart, offset by +250 so bins are centered on
# multiples of 500 (units presumably generations -- TODO confirm).
bins = numpy.arange(-20, 21) * 500 + 250

print len(total_snp_trajectories), "populations"
print len(appearance_errors), "trajectories"

####
#
# Set up figure
#
####
    
    #x0 = marker_coverages[i]
    # Per-sample coverage CDF; CDFs[-1]-CDFs is the (unnormalized) count of
    # sites with coverage >= D, i.e. a complementary CDF.
    xs, CDFs = stats_utils.calculate_unnormalized_CDF_from_histogram(sample_coverage_histogram)
    pylab.plot(xs, CDFs[-1]-CDFs, '-')

# Dummy semilogx call just switches the x axis to log scale.
pylab.semilogx([1],[1])
pylab.xlabel('Coverage, D')
pylab.ylabel('Fraction sites with coverage >= D')
#pylab.xlim([1e-01,1e01])
pylab.savefig('%s/%s_genomic_coverage_distribution.pdf' % (parse_midas_data.analysis_directory,species_name),bbox_inches='tight',transparent=True)

pylab.figure(2)

# Survival curve of per-sample median coverages (sorted in place first).
median_coverages.sort()

median_coverage_xs, median_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_coverages, min_x=0.1, max_x=10000)


# Load gene coverage information for species_name
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name)
sys.stderr.write("Done!\n")

# Clip marker coverages into the survival-curve range before sorting/plotting.
marker_coverages = numpy.clip(marker_coverages, 2e-01,1e04)

marker_coverages.sort()

marker_coverage_xs, marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_coverages, min_x=0.1, max_x=10000)

# Same survival construction for the per-sample median and mean marker coverages.
median_marker_coverage_xs, median_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_marker_coverages, min_x=0.1, max_x=10000)
mean_marker_coverage_xs, mean_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(mean_marker_coverages, min_x=0.1, max_x=10000)
        print "d=", max_ds[i]
        print "Site", "Polymorphic", "Inconsistent"
        for variant_type in sorted(polymorphic_variant_types[i].keys()):
            variant_type, polymorphic_variant_types[i][
                variant_type], inconsistent_variant_types[i][variant_type]
        print ""

    # Figure 4: survival curves of minor-allele frequencies for polymorphic
    # SNPs (one curve per divergence threshold) and inconsistent SNPs.
    pylab.figure(4, figsize=(3.42, 2))
    pylab.suptitle(species_name)

    for i in xrange(0, len(polymorphic_freqs)):

        if len(polymorphic_freqs[i]) == 0:
            continue

        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
            polymorphic_freqs[i])
        # Normalize by ns[0] so the curve starts at 1 (fraction of SNPs >= f).
        pylab.step(xs,
                   ns * 1.0 / ns[0],
                   '-',
                   label='Polymorphic ($d=%g$)' % max_ds[i])
        print 'Polymorphic (d=%g), n=%g' % (max_ds[i], ns[0])

    # Inconsistent SNPs are only drawn for the second divergence threshold.
    if len(inconsistent_freqs[1]) > 0:
        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
            inconsistent_freqs[1])
        pylab.step(xs,
                   ns * 1.0 / ns[0],
                   'r-',
                   linewidth=2,
                   label=('Inconsistent ($d=%g$)' % max_ds[1]))
Ejemplo n.º 4
0
combination_type = "sample"

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Calculate num timepoints per sample: one entry per subject, equal to the
# number of samples recorded for that subject, in ascending order.
# Idiom cleanup: build the sorted numpy array directly instead of an
# append loop over keys followed by an in-place sort; iterating values
# avoids the redundant key lookup, and len(samples) == len(samples.keys()).
num_timepoints_per_subject = numpy.array(
    sorted(len(samples) for samples in subject_sample_map.values()))

# Survival curve: number of subjects with >= T timepoints.
num_timepoints, num_subjects = stats_utils.calculate_unnormalized_survival_from_vector(
    num_timepoints_per_subject)

pylab.figure(1, figsize=(5, 3))
# +0.25 nudges the step edges off the integer grid for readability.
pylab.step(num_timepoints + 0.25, num_subjects, where='pre')
pylab.semilogy([0], [1])
pylab.xlim([0.5, 9])
pylab.ylim([0.3, 300])
pylab.xlabel('Num timepoints, $T$')
pylab.ylabel('Num subjects with $\geq T$')
print len(num_timepoints_per_subject), max(num_timepoints_per_subject)
pylab.savefig('%s/num_timepoints_per_subject.pdf' %
              parse_midas_data.analysis_directory,
              bbox_inches='tight')

# Load marker gene coverages
species_coverage_matrix, sample_list, species_list = parse_midas_data.parse_global_marker_gene_coverages(
Ejemplo n.º 5
0
    # Accumulate per-singleton divergence (d) and variant-class value (v).
    ds.append(d)
    vs.append(v)

random_ds = numpy.array(random_ds)
ds = numpy.array(ds)
vs = numpy.array(vs)

sys.stderr.write("Done!\n")

# v encodes variant class: > 0.5 => 1D (nonsynonymous), < 0.5 => 4D (synonymous).
print len(ds), "total singletons"
print(vs > 0.5).sum(), "1D"
print(vs < 0.5).sum(), "4D"

# Now plot them.

# Null (randomized) singleton divergence distribution in grey;
# 1 - normalized survival turns the survival curve into a CDF.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(random_ds)
d_axis.step(xs, 1 - ns * 1.0 / ns[0], '-', color='0.7')

# Observed singleton divergence distribution in blue.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(ds)
d_axis.step(xs, 1 - ns * 1.0 / ns[0], 'b-')

d_axis.semilogx([1e-05, 2e-05], [1, 1])

# Sweep divergence thresholds d* and (below) compute the fraction of
# singletons at d <= d* that are nonsynonymous.
dstars = numpy.logspace(-4, -2, 20)
fraction_nonsynonymous = []

for dstar in dstars:

    less_idxs = (ds <= dstar)

    if less_idxs.sum() > 1:
Ejemplo n.º 6
0
# Strip the right spine, keep ticks on bottom/left, and set log-x limits.
gene_difference_axis.spines['right'].set_visible(False)
gene_difference_axis.get_xaxis().tick_bottom()
gene_difference_axis.get_yaxis().tick_left()

gene_difference_axis.semilogx([1, 1])
gene_difference_axis.set_xlim([1, 1e04])
gene_difference_axis.set_ylim([0, 1.174])

low_divergence_snp_differences = numpy.array(low_divergence_snp_differences)
low_divergence_gene_differences = numpy.array(low_divergence_gene_differences)
low_divergence_clock_null_gene_differences = numpy.array(
    low_divergence_clock_null_gene_differences)
normal_divergence_gene_differences = numpy.array(
    normal_divergence_gene_differences)

# CDF of SNP differences between closely related pairs.
# BUG FIX(review): the original fed low_divergence_gene_differences to the
# SNP-difference axis as well, leaving low_divergence_snp_differences unused;
# the SNP axis should plot the SNP differences. Confirm against the original
# analysis before relying on downstream figures.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    low_divergence_snp_differences, min_x=0.1, max_x=1e04)
snp_difference_axis.step(xs,
                         1 - ns * 1.0 / ns[0],
                         'r-',
                         label='Closely\nrelated',
                         zorder=1)

# CDF of gene-content differences between the same closely related pairs.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    low_divergence_gene_differences, min_x=0.1, max_x=1e04)
gene_difference_axis.step(xs,
                          1 - ns * 1.0 / ns[0],
                          'r-',
                          label='Closely\nrelated',
                          zorder=1)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
# Clip all SNP-change distributions into the plotted range.
pooled_snp_change_distribution = numpy.clip(pooled_snp_change_distribution,
                                            1e-01, 1e08)
pooled_between_snp_change_distribution = numpy.clip(
    pooled_between_snp_change_distribution, 1e-01, 1e08)
pooled_min_between_snp_change_distribution = numpy.clip(
    pooled_min_between_snp_change_distribution, 1e-01, 1e08)

# Grey band marking the region above the modification threshold.
pooled_snp_axis.fill_between([modification_difference_threshold, 1e05], [1, 1],
                             [1e03, 1e03],
                             color='0.8')

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_between_snp_change_distribution, min_x=1e-02, max_x=1e09)

#pooled_snp_axis.step(xs,ns,'-',color='r',linewidth=0.5, alpha=0.5, label='Between-host', where='mid')

# Between-host curve: minimum between-host SNP change per pair.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    pooled_min_between_snp_change_distribution, min_x=1e-02, max_x=1e09)

pooled_snp_axis.step(xs,
                     ns,
                     '-',
                     color='r',
                     linewidth=0.5,
                     alpha=0.5,
                     label='Between-host',
                     where='mid')

# Within-host SNP change distribution (the step call continues past this excerpt).
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    pooled_snp_change_distribution, min_x=1e-02, max_x=1e09)

pooled_snp_axis.step(xs,
                     ns,
Ejemplo n.º 8
0
# Scatter of haploid (QP) sample counts vs. high-coverage sample counts per
# species, with small Gaussian jitter so overlapping integer points are visible.
haploid_distribution_axis.plot(sample_highcoverage_counts+normal(0,0.1,size=sample_highcoverage_counts.shape), sample_haploid_counts+normal(0,0.1,size=sample_highcoverage_counts.shape),'.',color=haploid_color,alpha=0.5,markersize=2)    

# Pooled haploid fraction across all species (rate for the binomial null below).
pavg = sample_haploid_counts.sum()*1.0/sample_highcoverage_counts.sum()
# Null model: resample each species' haploid count from a binomial with the
# pooled rate pavg, num_bootstraps times.
num_bootstraps = 100
bootstrapped_haploid_countss = [
    binomial(sample_highcoverage_counts, pavg)
    for _ in xrange(0, num_bootstraps)
]

# Pool the bootstrapped haploid fractions over species with >= 1 high-coverage
# sample. The selection mask is loop-invariant, so it is computed once here
# (the original recomputed it three times per iteration); iterate the
# bootstrap results directly instead of by index.
has_highcoverage = sample_highcoverage_counts >= 1
pooled_bootstrapped_haploid_fractions = []
for bootstrapped_counts in bootstrapped_haploid_countss:
    pooled_bootstrapped_haploid_fractions.extend(
        bootstrapped_counts[has_highcoverage] * 1.0 /
        sample_highcoverage_counts[has_highcoverage])
pooled_bootstrapped_haploid_fractions = numpy.array(pooled_bootstrapped_haploid_fractions)

# Null CDF from the pooled bootstrap (grey) vs. observed haploid fractions.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_bootstrapped_haploid_fractions )
haploid_cdf_axis.step(xs,ns*1.0/ns[0],'-',color='0.7',label='Null')

# Observed fractions, restricted to species with >= 1 high-coverage sample.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(sample_haploid_fractions[sample_highcoverage_counts>=1])
haploid_cdf_axis.step(xs,ns*1.0/ns[0],'-',color=haploid_color,label='Obs')
haploid_cdf_axis.set_xlim([0,1])

haploid_cdf_axis.legend(loc='upper right',frameon=False,numpoints=1)


#########
#
# Haploid distribution
#
for species_idx in xrange(0,len(num_haploid_samples)):
        
Ejemplo n.º 9
0
# Debug output: marker gene identities and how many genes matched.
print marker_genes

print marker_gene_idxs.sum()

# Restrict to unique samples with sufficient marker coverage.
sample_idxs = (parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples))*(marker_coverages>=min_coverage)

prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(gene_depth_matrix[:,sample_idxs], marker_coverages[sample_idxs],min_copynum=0.3)

# Prevalence subsets for the three gene catalogs.
reference_prevalences = prevalences[reference_gene_idxs]
metaphlan2_prevalences = prevalences[metaphlan2_gene_idxs]
marker_prevalences = prevalences[marker_gene_idxs]

print marker_prevalences

# Survival curves of gene prevalence for each catalog (prevalence in [0, 1]).
pangenome_xs, pangenome_survivals = stats_utils.calculate_unnormalized_survival_from_vector(prevalences, min_x=0, max_x=1)

reference_xs, reference_survivals = stats_utils.calculate_unnormalized_survival_from_vector(reference_prevalences, min_x=0, max_x=1)

metaphlan2_xs, metaphlan2_survivals = stats_utils.calculate_unnormalized_survival_from_vector(metaphlan2_prevalences, min_x=0, max_x=1)

marker_xs, marker_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_prevalences, min_x=0, max_x=1)

pylab.figure(1,figsize=(3.42,4))
pylab.title(species_name)

#pylab.step(pangenome_xs, pangenome_survivals/pangenome_survivals[0],label='Pan-genome')
#pylab.step(reference_xs, reference_survivals/reference_survivals[0],label='Reference')
#pylab.step(metaphlan2_xs, metaphlan2_survivals/metaphlan2_survivals[0],label='Metaphlan2')
#pylab.step(marker_xs, marker_survivals/marker_survivals[0],label='MIDAS Marker')
#pylab.ylim([1e-02,1])
# Set up figure
prevalence_fig = plt.figure(figsize=(3.42, 2))

# Set up grids to hold figure panels
outer_grid = gridspec.GridSpec(1, 1)

prevalence_axis = plt.Subplot(prevalence_fig, outer_grid[0])
prevalence_fig.add_subplot(prevalence_axis)
prevalence_fig.suptitle(species_name, fontsize=7)

prevalence_axis.set_ylabel('Fraction genes $\geq p$')
prevalence_axis.set_xlabel('Prevalence of gene, $p$')
prevalence_axis.set_xlim([0, 1])
prevalence_axis.set_ylim([0, 1])

# Survival curve over the whole pan genome (black), normalized to start at 1.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    pangenome_prevalences)
prevalence_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='Total pan genome')

# Survival curve of prevalences for genes that differ between hosts (red).
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    between_host_gene_prevalences)
prevalence_axis.step(xs,
                     ns * 1.0 / ns[0],
                     'r-',
                     label='Between host differences')

# Same for closely related (low-divergence) host pairs, when any exist.
if len(low_divergence_between_host_gene_prevalences) > 0:
    print low_divergence_between_host_gene_prevalences
    print low_divergence_between_host_gene_prevalences.mean()
    print len(low_divergence_between_host_gene_prevalences), len(
        between_host_gene_prevalences)
    xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    bootstrapped_restricted_appearance_times[var_type] = numpy.array(bootstrapped_restricted_appearance_times[var_type])

    # KS distance between this variant type's observed appearance times and
    # the pooled restricted appearance times.
    D = stats_utils.calculate_ks_distance(observed_restricted_appearance_times[var_type], restricted_appearance_times)
    
    # Bootstrap p-value with a +1 pseudocount (avoids reporting p = 0).
    pvalue = ((bootstrapped_restricted_kss[var_type]>=D).sum()+1.0)/(len(bootstrapped_restricted_kss[var_type])+1.0)
        
    sys.stdout.write('%s: %g\n' % (var_type, pvalue))
    
######
#
# Now do plotting
#
######            

# Survival curve of appearance times pooled over all variant types.
all_ts, all_survivals = stats_utils.calculate_unnormalized_survival_from_vector(pooled_appearance_times, min_x=-1000,max_x=100000)

time_axis.step(all_ts, all_survivals/all_survivals[0], color='k', label='All types')
#missense_time_axis.step(all_ts, all_survivals/all_survivals[0], color='k', label='All')

restricted_ts, restricted_survivals = stats_utils.calculate_unnormalized_survival_from_vector(restricted_appearance_times, min_x=-1000,max_x=100000)
#missense_time_axis.step(all_ts, restricted_survivals/restricted_survivals[0], color='k', label='All (excluding sv)',alpha=0.5)



# One survival curve per variant type, in that type's standard color.
for var_type in parse_file.var_types:
 
    color = figure_utils.get_var_type_color(var_type)
    vartype_ts, vartype_survivals = stats_utils.calculate_unnormalized_survival_from_vector(observed_appearance_times[var_type], min_x=-1000, max_x=100000)
    time_axis.step(vartype_ts, vartype_survivals/vartype_survivals[0], color=color, alpha=0.7) #, label=var_type)
    # Folded SFS weighted by f(1-f), normalized to sum to 1 (4D = synonymous,
    # 1D = nonsynonymous sites).
    pylab.plot(mafs, synonymous_sfs*mafs*(1-mafs)/(synonymous_sfs*mafs*(1-mafs)).sum(), 'b.-',label='4D')
    pylab.plot(mafs, nonsynonymous_sfs*mafs*(1-mafs)/(nonsynonymous_sfs*mafs*(1-mafs)).sum(),'r.-',label='1D')
    
    pylab.xlim([0,0.5])
    pylab.legend(loc='upper right',frameon=False,fontsize=6)
    pylab.savefig('%s/%s_pooled_sfs.pdf' % (parse_midas_data.analysis_directory, species_name), bbox_inches='tight')
    pylab.savefig('%s/%s_pooled_sfs.png' % (parse_midas_data.analysis_directory, species_name), bbox_inches='tight', dpi=300)
 
    # Figure 2: SFS of phylogenetically inconsistent SNPs vs. unlinked null.
    pylab.figure(2,figsize=(3.42,2))
    pylab.suptitle(species_name)

    #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(polymorphic_freqs)
    #pylab.step(xs,ns*1.0/ns[0],'b-',label='All polymorphisms')
    
    if len(null_inconsistent_freqs)>0:
        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(null_inconsistent_freqs)
        pylab.step(xs,ns*1.0/ns[0],'-',color='0.7',linewidth=0.5, label=('Unlinked expectation'))
     
    
    if len(inconsistent_freqs)>0:
        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(inconsistent_freqs)
        pylab.step(xs,ns*1.0/ns[0],'r-',label=('Inconsistent ($d=%g$)' % max_clade_d))
       
    pylab.xlim([0,0.5])
    pylab.ylim([0,1])
    pylab.xlabel('Within-clade MAF, $f$')
    pylab.ylabel('SNPs $\geq f$')
    pylab.legend(loc='upper right', frameon=False,fontsize=6)
    
    pylab.savefig('%s/%s_phylogenetically_inconsistent_sfs.pdf' % (parse_midas_data.analysis_directory,species_name),bbox_inches='tight')
    pylab.savefig('%s/%s_phylogenetically_inconsistent_sfs.png' % (parse_midas_data.analysis_directory,species_name),bbox_inches='tight',dpi=300)
                        snp_samples[j]]:
                    pair_snp_substitution_rates.append(
                        snp_substitution_matrix[i, j])

                # Track the minimum (closest-relative) substitution rate.
                if snp_substitution_matrix[i, j] < min_substitution_rate:
                    min_substitution_rate = snp_substitution_matrix[i, j]

        closest_snp_substitution_rates.append(min_substitution_rate)

    all_closest_rates.extend(closest_snp_substitution_rates)
    all_pair_rates.extend(pair_snp_substitution_rates)

print numpy.sort(all_closest_rates)
print numpy.sort(all_pair_rates)

# Normalized survival curve of closest-relative substitution rates.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    all_closest_rates, min_x=1e-06, max_x=1e09)
pylab.step(xs,
           ns / ns[0],
           '-',
           color='r',
           linewidth=0.5,
           alpha=0.5,
           label='Between-host',
           where='mid',
           zorder=2)

# Survival curve of all pairwise rates (step call continues past this excerpt).
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    all_pair_rates, min_x=1e-06, max_x=1e09)
pylab.step(xs,
           ns / ns[0],
           '-',
###################################################

# Figure: CDFs of expected gene number p for within-host present genes vs.
# between-host gene changes (loss events).
pylab.figure(figsize=(6, 6))
prevalence_axis = pylab.subplot(111)

prevalence_axis.set_ylabel('Fraction genes $\leq p$', labelpad=2)
prevalence_axis.set_xlabel('Expected number, $p$', labelpad=2)
prevalence_axis.set_xlim([0, 20])
#prevalence_axis.set_ylim([0,1.1])

prevalence_axis.spines['top'].set_visible(False)
prevalence_axis.spines['right'].set_visible(False)
prevalence_axis.get_xaxis().tick_bottom()
prevalence_axis.get_yaxis().tick_left()

# 1 - normalized survival = CDF; blue curve for within-host present genes.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    numpy.asarray(p_val_arrays['losses']['present']))
prevalence_axis.step(xs,
                     1 - ns * 1.0 / ns[0],
                     'b-',
                     label='Within-host present genes',
                     zorder=2)

# Red curve for the pangenome (between-host) comparison set.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    numpy.asarray(p_val_arrays['losses']['pangenome']))
prevalence_axis.step(xs,
                     1 - ns * 1.0 / ns[0],
                     'r-',
                     label='Between-host gene changes',
                     zorder=1)
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
Ejemplo n.º 15
0
            # Collect log p-values for genes hit at least nmin times.
            if gene_parallelism_statistics_major[gene_name]['observed'] >= nmin:
                print(gene_parallelism_statistics_major[gene_name])
                pooled_pvalues_major.append(gene_logpvalues_major[gene_name])
        pooled_pvalues_major = numpy.asarray(pooled_pvalues_major)
        if len(pooled_pvalues_major) == 0:
            continue
        pooled_pvalues_major.sort()

        print(pooled_pvalues_major)

        # Null survival functions for gene log-p under the parallelism model.
        null_pvalue_survival_minor = mutation_spectrum_utils.NullGeneLogpSurvivalFunction.from_parallelism_statistics(
            gene_parallelism_statistics_minor, nmin=nmin)
        null_pvalue_survival_major = mutation_spectrum_utils.NullGeneLogpSurvivalFunction.from_parallelism_statistics(
            gene_parallelism_statistics_major, nmin=nmin)

        observed_ps_minor, observed_pvalue_survival_minor = stats_utils.calculate_unnormalized_survival_from_vector(
            pooled_pvalues_minor, min_x=-4)
        observed_ps_major, observed_pvalue_survival_major = stats_utils.calculate_unnormalized_survival_from_vector(
            pooled_pvalues_major, min_x=-8)

        # Pvalue version
        # First index where the null/observed survival ratio drops below FDR.
        threshold_idx_minor = numpy.nonzero(
            (null_pvalue_survival_minor(observed_ps_minor) * 1.0 /
             observed_pvalue_survival_minor) < FDR)[0][0]
        pstar_minor = observed_ps_minor[
            threshold_idx_minor]  # lowest value where this is true
        num_significant_minor = observed_pvalue_survival_minor[
            threshold_idx_minor]

        # Pvalue version for minor
        threshold_idx_major = numpy.nonzero(
            (null_pvalue_survival_major(observed_ps_major) * 1.0 /
Ejemplo n.º 16
0
# Calculate which pairs of idxs belong to the same sample, which to the same subject
# and which to different subjects
high_coverage_same_sample_idxs, high_coverage_same_subject_idxs, high_coverage_diff_subject_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, high_coverage_samples)

# Remap the pair indices from the high-coverage sample space to the full one.
same_sample_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_same_sample_idxs)
same_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_same_subject_idxs)
diff_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_diff_subject_idxs)

# Normalized survival of gene hamming distances within subjects over time...
hamming_timepoints = gene_hamming_matrix[same_subject_idxs]
hamming_timepoints.sort()
hamming_timepoints_dns, hamming_timepoints_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    hamming_timepoints, min_x=0.1, max_x=1e05)
hamming_timepoints_survivals /= hamming_timepoints_survivals[0]

# ...and between different subjects.
hamming_between = gene_hamming_matrix[diff_subject_idxs]
hamming_between.sort()
hamming_between_dns, hamming_between_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    hamming_between, min_x=0.1, max_x=1e05)
hamming_between_survivals /= hamming_between_survivals[0]

# Normalized survival of per-gene presence counts across samples.
gene_counts = gene_presence_matrix.sum(axis=0)
gene_counts.sort()
gene_count_ns, gene_count_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    gene_counts, min_x=0.1, max_x=1e05)
gene_count_survivals /= gene_count_survivals[0]

print "Median gene count=", numpy.median(gene_presence_matrix.sum(axis=0))
    print low_divergence_between_host_gene_prevalences.mean()
    print len(low_divergence_between_host_gene_prevalences), len(
        between_host_gene_prevalences)

    # Histogram of prevalences for closely-related between-host gene changes.
    h = numpy.histogram(low_divergence_between_host_gene_prevalences,
                        bins=prevalence_bins)[0]
    #prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'r.-',label=('d<%g' % modification_divergence_threshold), alpha=0.5,markersize=3)

h = numpy.histogram(within_host_gene_prevalences, bins=prevalence_bins)[0]
#prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'b.-',label='Within-host',markersize=3)

print len(within_host_gene_prevalences), "within-host changes"

# CDF version

# CDF (1 - normalized survival) of prevalences for within-host gene changes...
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    within_host_gene_prevalences)
prevalence_axis.step(xs,
                     1 - ns * 1.0 / ns[0],
                     'b-',
                     label='Within-host',
                     zorder=2)

# ...and for between-host gene changes.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    between_host_gene_prevalences)
prevalence_axis.step(xs,
                     1 - ns * 1.0 / ns[0],
                     'r-',
                     label='Between-host',
                     zorder=1)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
        # Calculate basic parallellism statistics
        gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations)

        # Calculate G score for entire gene (G=n*g)
        gene_G_scores = mutation_spectrum_utils.calculate_G_scores(
            gene_parallelism_statistics)
        pooled_G_scores = numpy.asarray(list(gene_G_scores.values()))

        pooled_G_scores.sort()

        # Null distribution of G scores under the parallelism model.
        null_G_survival = mutation_spectrum_utils.NullGeneGSurvivalFunction.from_parallelism_statistics(
            gene_parallelism_statistics)

        observed_Gs, observed_G_survival = stats_utils.calculate_unnormalized_survival_from_vector(
            pooled_G_scores)

        # Do same thing for multiplicity statistic
        pooled_multiplicities = numpy.array([
            gene_parallelism_statistics[gene_name]['multiplicity']
            for gene_name in gene_parallelism_statistics.keys()
        ])
        pooled_multiplicities.sort()

        null_multiplicity_survival = mutation_spectrum_utils.NullGeneMultiplicitySurvivalFunction.from_parallelism_statistics(
            gene_parallelism_statistics)

        observed_ms, observed_multiplicity_survival = stats_utils.calculate_unnormalized_survival_from_vector(
            pooled_multiplicities)

        # Do same thing for num hits
    # Binomial resamples of doubleton-sharing fractions under pooled rates.
    bootstrapped_fake_low_ps.extend(
        binomial(sample_sizes, low_p) * 1.0 / sample_sizes)
    bootstrapped_fake_all_ps.extend(
        binomial(sample_sizes, all_p) * 1.0 / sample_sizes)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_low_ps, min_x=0,max_x=2)
#sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (matched)',zorder=3)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_all_ps, min_x=0,max_x=2)
#sharing_axis.step(xs,ns*1.0/ns[0],'k-',label='All (matched)',zorder=2)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_fake_low_ps, min_x=0,max_x=1)
#sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (pooled)',zorder=1,alpha=0.5)

# Observed sharing fractions, restricted to pairs with enough opportunities.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    real_all_ps[all_doubleton_opportunities > min_opportunities],
    min_x=0,
    max_x=2)
sharing_axis.step(xs,
                  ns * 1.0 / ns[0],
                  'k-',
                  label='Between hosts (all)',
                  zorder=1)  #,alpha=0.5)

# Same curve for the low-divergence (closely related) pairs.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    real_low_ps[low_doubleton_opportunities > min_opportunities],
    min_x=0,
    max_x=2)
sharing_axis.step(xs,
                  ns * 1.0 / ns[0],
                  'r-',
                  label='Between hosts\n(closely related)',
                    color='0.7',
                    linewidth=0)
g_axis.plot(tstars, upper_early_dGs, color='0.6', linewidth=0.25)
# Shaded confidence band for the late-time dG values.
g_axis.fill_between(tstars,
                    lower_late_dGs,
                    numpy.zeros_like(tstars),
                    color='0.7',
                    linewidth=0)
g_axis.plot(tstars, lower_late_dGs, color='0.6', linewidth=0.25)

g_axis.plot(tstars, early_dGs, '-', color=early_color, label='$\leq t^*$')
g_axis.plot(tstars, late_dGs, '-', color=late_color, label='$> t^*$')

g_axis.legend(loc='upper right', frameon=False)

# Multiplicity survival curves split at t*: early (<= t*), late (> t*), and all.
early_survival_ms, early_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    ms[ts <= tstar], min_x=0.1, max_x=100)
late_survival_ms, late_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    ms[ts > tstar], min_x=0.1, max_x=100)
all_survival_ms, all_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    ms, min_x=0.1, max_x=100)

# Theoretical null survival in grey, observed (normalized) as a step curve.
theory_ms = numpy.logspace(0, 2, 100)
multiplicity_axis.loglog(theory_ms,
                         null_survival_function(theory_ms),
                         color='0.7',
                         linewidth=0.5)
multiplicity_axis.step(all_survival_ms,
                       all_survivals * 1.0 / all_survivals[0],
                       color=all_color,
                       linewidth=0.5,
                       label='All')
Ejemplo n.º 21
0
        # Nearest tabulated timepoints in the X and M trajectories.
        x_t_idx = numpy.fabs(Xts-t).argmin()
        m_t_idx = numpy.fabs(Mts-t).argmin()
        
        sts.append(t)
        # Ratio X/M; the (Ms[m_t_idx]==0) term guards against division by zero.
        ss.append( Xs[x_t_idx]/(Ms[m_t_idx]+(Ms[m_t_idx]==0) ) )
        
        #if population in parse_file.complete_nonmutator_lines:
        #    print t, Xts[t_idx], Mts[t_idx], Xs[t_idx], Ms[t_idx], ss[-1]
    
    sts = numpy.array(sts)
    ss = numpy.array(ss)
    
    # Survival curve of this population's transit times.
    transit_times[population].sort()
    transit_times[population] = numpy.array(transit_times[population])
    
    dts, dt_survival = stats_utils.calculate_unnormalized_survival_from_vector(transit_times[population], min_x=0)

    
    if population in parse_file.complete_nonmutator_lines:
        # We're dealing with a non-mutator population    
        
        colorVal = parse_file.get_line_color(population)
        linestyle = 'o-'
        # Later nonmutator lines draw underneath earlier ones.
        zorder = 12-nonmutator_idx
        
        nonmutator_idx += 1
        
        fixation_axis.plot(Ms, fixed_Ms, linestyle, color=colorVal, alpha=1, markersize=1,linewidth=0.5,zorder=zorder, markeredgewidth=0)    
        
        late_Xts, late_Xs, late_std_Xs = late_fitness_trajectories[population]
        
        
Ejemplo n.º 22
0
# Per-sample minimum copy number across all genes of interest.
min_copynum_distribution = gene_copynum_matrix[desired_gene_idxs,:].min(axis=0)

for gene_name in desired_gene_names:

    gene_idx = numpy.nonzero(gene_names==gene_name)[0][0]

    gene_copynum_distribution = gene_copynum_matrix[gene_idx,:]

    print gene_copynum_matrix.shape, gene_copynum_distribution.shape



    #print gene_copynum_distribution

    # One survival curve per gene of interest.
    xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector(gene_copynum_distribution, min_x=0, max_x=gene_copynum_distribution.max())

    pylab.step(xvalues,ns,label=gene_name)
    #pylab.semilogy([4],[4])
    
# Survival curve of the per-sample minimum across all desired genes.
xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector(min_copynum_distribution, min_x=0, max_x=min_copynum_distribution.max())

pylab.step(xvalues,ns,label='Both')

    
pylab.legend(loc='upper right',frameon=False)

pylab.savefig('../morteza_collaboration/ben_figures/Alistipes_onderdonkii_gene_gain_HMP_prevalence.pdf',bbox_inches='tight')

#pylab.show()