# ---- Esempio n. 1 (Example no. 1; score: 0) — scraped-snippet separator ----
    
    #x0 = marker_coverages[i]
    xs, CDFs = stats_utils.calculate_unnormalized_CDF_from_histogram(sample_coverage_histogram)
    pylab.plot(xs, CDFs[-1]-CDFs, '-')

# Finish the per-sample genomic coverage figure: log-scale the x axis, label,
# and save one PDF per species into the analysis directory.
pylab.semilogx([1],[1])
pylab.xlabel('Coverage, D')
pylab.ylabel('Fraction sites with coverage >= D')
#pylab.xlim([1e-01,1e01])
pylab.savefig('%s/%s_genomic_coverage_distribution.pdf' % (parse_midas_data.analysis_directory,species_name),bbox_inches='tight',transparent=True)

pylab.figure(2)

# In-place sort; median_coverages is presumably a numpy array built earlier
# in the (truncated) script -- confirm against the full file.
median_coverages.sort()

median_coverage_xs, median_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_coverages, min_x=0.1, max_x=10000)


# Load gene coverage information for species_name
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name)
sys.stderr.write("Done!\n")

# Clip marker coverages into [2e-01, 1e04] so downstream log-scale plots and
# survival curves see strictly positive, bounded values.
marker_coverages = numpy.clip(marker_coverages, 2e-01,1e04)

marker_coverages.sort()

# Unnormalized survival curves (counts of samples with coverage >= x) for the
# clipped marker coverages and for the median/mean marker coverage vectors.
marker_coverage_xs, marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_coverages, min_x=0.1, max_x=10000)

median_marker_coverage_xs, median_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_marker_coverages, min_x=0.1, max_x=10000)
mean_marker_coverage_xs, mean_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(mean_marker_coverages, min_x=0.1, max_x=10000)
                        snp_samples[j]]:
                    pair_snp_substitution_rates.append(
                        snp_substitution_matrix[i, j])

                if snp_substitution_matrix[i, j] < min_substitution_rate:
                    min_substitution_rate = snp_substitution_matrix[i, j]

        closest_snp_substitution_rates.append(min_substitution_rate)

    all_closest_rates.extend(closest_snp_substitution_rates)
    all_pair_rates.extend(pair_snp_substitution_rates)

# Debug output: sorted substitution-rate vectors accumulated above.
print numpy.sort(all_closest_rates)
print numpy.sort(all_pair_rates)

# Survival curve of each host's closest between-host substitution rate,
# normalized by the first count so the curve starts at 1.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    all_closest_rates, min_x=1e-06, max_x=1e09)
pylab.step(xs,
           ns / ns[0],  # normalize to a fraction
           '-',
           color='r',
           linewidth=0.5,
           alpha=0.5,
           label='Between-host',
           where='mid',
           zorder=2)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    all_pair_rates, min_x=1e-06, max_x=1e09)
pylab.step(xs,
           ns / ns[0],
           '-',
    print low_divergence_between_host_gene_prevalences.mean()
    print len(low_divergence_between_host_gene_prevalences), len(
        between_host_gene_prevalences)

    h = numpy.histogram(low_divergence_between_host_gene_prevalences,
                        bins=prevalence_bins)[0]
    #prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'r.-',label=('d<%g' % modification_divergence_threshold), alpha=0.5,markersize=3)

# Histogram of within-host gene prevalences (the matching plot is disabled).
h = numpy.histogram(within_host_gene_prevalences, bins=prevalence_bins)[0]
#prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'b.-',label='Within-host',markersize=3)

print len(within_host_gene_prevalences), "within-host changes"

# CDF version

# CDF = 1 - (normalized survival): curves rise from 0 to 1 with prevalence.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    within_host_gene_prevalences)
prevalence_axis.step(xs,
                     1 - ns * 1.0 / ns[0],
                     'b-',
                     label='Within-host',
                     zorder=2)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    between_host_gene_prevalences)
prevalence_axis.step(xs,
                     1 - ns * 1.0 / ns[0],
                     'r-',
                     label='Between-host',
                     zorder=1)
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
###################
#
# Prevalence
#
###################

# Panel: normalized survival curves of gene prevalence for the whole
# pan-genome versus genes that differ between / within hosts.
prevalence_axis = plt.Subplot(fig, outer_grid[0])
fig.add_subplot(prevalence_axis)
fig.suptitle(species_name)

# NOTE(review): '\g' in '$\geq p$' is an invalid escape sequence (left literal
# on py2, DeprecationWarning on py3) -- a raw string would be safer.
prevalence_axis.set_ylabel('Fraction genes $\geq p$')
prevalence_axis.set_xlabel('Prevalence of gene, $p$')
prevalence_axis.set_xlim([0, 1])
prevalence_axis.set_ylim([0, 1])

# Fraction of genes with prevalence >= p, normalized so curves start at 1.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    pangenome_prevalences)
prevalence_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='Total pan genome')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    between_host_gene_prevalences)
prevalence_axis.step(xs,
                     ns * 1.0 / ns[0],
                     'r-',
                     label='Between host differences')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    within_host_gene_prevalences)
prevalence_axis.step(xs,
                     ns * 1.0 / ns[0],
                     'g-',
                     label='Within host differences')
# ---- Esempio n. 5 (Example no. 5; score: 0) — scraped-snippet separator ----
combination_type = "sample"

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Calculate num timepoints per sample
# (one entry per subject: how many samples/timepoints that subject has).
num_timepoints_per_subject = []
for subject in subject_sample_map.keys():
    num_timepoints_per_subject.append(len(subject_sample_map[subject].keys()))

num_timepoints_per_subject.sort()
num_timepoints_per_subject = numpy.array(num_timepoints_per_subject)

# Survival curve: number of subjects with at least T timepoints.
num_timepoints, num_subjects = stats_utils.calculate_unnormalized_survival_from_vector(
    num_timepoints_per_subject)

pylab.figure(1, figsize=(5, 3))
# +0.25 nudges step edges off the integer grid -- presumably cosmetic;
# confirm against the rendered figure.
pylab.step(num_timepoints + 0.25, num_subjects, where='pre')
pylab.semilogy([0], [1])
pylab.xlim([0.5, 9])
pylab.ylim([0.3, 300])
pylab.xlabel('Num timepoints, $T$')
# NOTE(review): '\g' is an invalid escape sequence; a raw string would be safer.
pylab.ylabel('Num subjects with $\geq T$')
print len(num_timepoints_per_subject), max(num_timepoints_per_subject)
pylab.savefig('%s/num_timepoints_per_subject.pdf' %
              parse_midas_data.analysis_directory,
              bbox_inches='tight')

# Load marker gene coverages
species_coverage_matrix, sample_list, species_list = parse_midas_data.parse_global_marker_gene_coverages(
# Per-sample minimum copy number across the desired genes: a sample carries
# "both" genes only if this minimum is high.
min_copynum_distribution = gene_copynum_matrix[desired_gene_idxs, :].min(
    axis=0)

for gene_name in desired_gene_names:

    # Row index of this gene in the copynum matrix (assumes exactly one match).
    gene_idx = numpy.nonzero(gene_names == gene_name)[0][0]

    gene_copynum_distribution = gene_copynum_matrix[gene_idx, :]

    print gene_copynum_matrix.shape, gene_copynum_distribution.shape

    #print gene_copynum_distribution

    # Unnormalized survival: number of samples with copynum >= x.
    xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector(
        gene_copynum_distribution,
        min_x=0,
        max_x=gene_copynum_distribution.max())

    pylab.step(xvalues, ns, label=gene_name)
    #pylab.semilogy([4],[4])

# Survival curve for the joint (minimum) copynum distribution.
xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    min_copynum_distribution, min_x=0, max_x=min_copynum_distribution.max())

pylab.step(xvalues, ns, label='Both')

pylab.legend(loc='upper right', frameon=False)

pylab.savefig(
    '../morteza_collaboration/ben_figures/Alistipes_onderdonkii_gene_gain_HMP_prevalence.pdf',
    bbox_inches='tight')
# ---- Esempio n. 7 (Example no. 7; score: 0) — scraped-snippet separator ----
print "Mean within host snps =", pooled_snp_change_distribution.mean()
print "Median withon host snps =", numpy.median(pooled_snp_change_distribution)

pooled_snp_change_distribution = numpy.clip(pooled_snp_change_distribution, 1e-01,1e08)
pooled_twin_snp_change_distribution = numpy.clip(pooled_twin_snp_change_distribution, 1e-01,1e08)
pooled_between_snp_change_distribution = numpy.clip(pooled_between_snp_change_distribution, 1e-01,1e08)
pooled_min_between_snp_change_distribution = numpy.clip(pooled_min_between_snp_change_distribution, 1e-01,1e08)



#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_between_snp_change_distribution, min_x=1e-02, max_x=1e09)

#pooled_snp_axis.step(xs,ns,'-',color='r',linewidth=0.5, alpha=0.5, label='Between-host', where='mid')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_min_between_snp_change_distribution, min_x=1e-02, max_x=1e09)

ymin = 1.0/ns[0]
ymax = 1.3

pooled_snp_axis.loglog([1e-01,1e05],[ymin,ymin],'k:')

pooled_snp_axis.set_ylim([1.0/ns[0],1.3])

pooled_snp_axis.fill_between([1e-01,modification_difference_threshold],[ymin,ymin],[ymax,ymax],color='#deebf7')
pooled_snp_axis.fill_between([replacement_difference_threshold,1e05],[ymin,ymin],[ymax,ymax],color='#fee0d2')

pooled_snp_axis.text(exp((log(1e05)+log(replacement_difference_threshold))/2), ymax*1.2, 'putative\nreplacement',fontsize=6,fontstyle='italic',ha='center',color='#fc9272')
pooled_snp_axis.text(exp((log(1)+log(modification_difference_threshold))/2), ymax*1.2, 'putative\nmodification',fontsize=6,fontstyle='italic',ha='center',color='#9ecae1')
#pooled_snp_axis.text(exp((log(modification_difference_threshold)+log(replacement_difference_threshold))/2), ymax*1.2, 'unclassified',fontsize=6,fontstyle='italic',ha='center')
# ---- Esempio n. 8 (Example no. 8; score: 0) — scraped-snippet separator ----
    bootstrapped_fake_low_ps.extend(
        binomial(sample_sizes, low_p) * 1.0 / sample_sizes)
    bootstrapped_fake_all_ps.extend(
        binomial(sample_sizes, all_p) * 1.0 / sample_sizes)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_low_ps, min_x=0,max_x=2)
#sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (matched)',zorder=3)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_all_ps, min_x=0,max_x=2)
#sharing_axis.step(xs,ns*1.0/ns[0],'k-',label='All (matched)',zorder=2)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_fake_low_ps, min_x=0,max_x=1)
#sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (pooled)',zorder=1,alpha=0.5)

# Observed doubleton-sharing fractions, restricted to host pairs with more
# than min_opportunities, plotted as a normalized survival curve (all pairs).
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    real_all_ps[all_doubleton_opportunities > min_opportunities],
    min_x=0,
    max_x=2)
sharing_axis.step(xs,
                  ns * 1.0 / ns[0],
                  'k-',
                  label='Between hosts (all)',
                  zorder=1)  #,alpha=0.5)

# Same curve restricted to the low-divergence ("closely related") host pairs.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    real_low_ps[low_doubleton_opportunities > min_opportunities],
    min_x=0,
    max_x=2)
sharing_axis.step(xs,
                  ns * 1.0 / ns[0],
                  'r-',
                  label='Between hosts\n(closely related)',
# ---- Esempio n. 9 (Example no. 9; score: 0) — scraped-snippet separator ----
# Null model: binomially resample haploid counts per sample, assuming a
# common haploid probability pavg across samples.
bootstrapped_haploid_countss = []
for bootstrap_idx in xrange(0, num_bootstraps):

    bootstrapped_haploid_countss.append(
        binomial(sample_highcoverage_counts, pavg))

# Pool bootstrapped haploid fractions across replicates, restricted to
# samples with >= 1 high-coverage species (avoids division by zero).
pooled_bootstrapped_haploid_fractions = []
for bootstrap_idx in xrange(0, num_bootstraps):
    pooled_bootstrapped_haploid_fractions.extend(
        bootstrapped_haploid_countss[bootstrap_idx][
            sample_highcoverage_counts >= 1] * 1.0 /
        sample_highcoverage_counts[sample_highcoverage_counts >= 1])
pooled_bootstrapped_haploid_fractions = numpy.array(
    pooled_bootstrapped_haploid_fractions)

# Null curve (grey) vs observed curve of per-sample haploid fractions,
# each normalized so it starts at 1.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    pooled_bootstrapped_haploid_fractions)
haploid_cdf_axis.step(xs, ns * 1.0 / ns[0], '-', color='0.7', label='Null')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    sample_haploid_fractions[sample_highcoverage_counts >= 1])
haploid_cdf_axis.step(xs,
                      ns * 1.0 / ns[0],
                      '-',
                      color=haploid_color,
                      label='Obs')
haploid_cdf_axis.set_xlim([0, 1])

haploid_cdf_axis.legend(loc='upper right', frameon=False, numpoints=1)
#########
#
# Cosmetic axis setup for the gene-difference panel.
gene_difference_axis.spines['right'].set_visible(False)
gene_difference_axis.get_xaxis().tick_bottom()
gene_difference_axis.get_yaxis().tick_left()

gene_difference_axis.semilogx([1, 1])
gene_difference_axis.set_xlim([1, 1e04])
gene_difference_axis.set_ylim([0, 1.174])

low_divergence_snp_differences = numpy.array(low_divergence_snp_differences)
low_divergence_gene_differences = numpy.array(low_divergence_gene_differences)
low_divergence_clock_null_gene_differences = numpy.array(
    low_divergence_clock_null_gene_differences)
normal_divergence_gene_differences = numpy.array(
    normal_divergence_gene_differences)

# CDF (1 - normalized survival) of differences for closely related pairs.
# NOTE(review): this curve is drawn on snp_difference_axis but is computed
# from low_divergence_gene_differences, while low_divergence_snp_differences
# (built just above) is never used here -- looks like a copy-paste slip;
# confirm against the original analysis before changing.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    low_divergence_gene_differences, min_x=0.1, max_x=1e04)
snp_difference_axis.step(xs,
                         1 - ns * 1.0 / ns[0],
                         'r-',
                         label='Closely\nrelated',
                         zorder=1)

# Same distribution on the gene-difference panel.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    low_divergence_gene_differences, min_x=0.1, max_x=1e04)
gene_difference_axis.step(xs,
                          1 - ns * 1.0 / ns[0],
                          'r-',
                          label='Closely\nrelated',
                          zorder=1)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    pylab.savefig('%s/%s_pooled_sfs.pdf' %
                  (parse_midas_data.analysis_directory, species_name),
                  bbox_inches='tight')
    pylab.savefig('%s/%s_pooled_sfs.png' %
                  (parse_midas_data.analysis_directory, species_name),
                  bbox_inches='tight',
                  dpi=300)

    pylab.figure(2, figsize=(3.42, 2))
    pylab.suptitle(species_name)

    #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(polymorphic_freqs)
    #pylab.step(xs,ns*1.0/ns[0],'b-',label='All polymorphisms')

    if len(null_inconsistent_freqs) > 0:
        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
            null_inconsistent_freqs)
        pylab.step(xs,
                   ns * 1.0 / ns[0],
                   '-',
                   color='0.7',
                   linewidth=0.5,
                   label=('Unlinked expectation'))

    if len(inconsistent_freqs) > 0:
        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
            inconsistent_freqs)
        pylab.step(xs,
                   ns * 1.0 / ns[0],
                   'r-',
                   label=('Inconsistent ($d=%g$)' % max_clade_d))
    ds.append(d)
    vs.append(v)

# Finalize the singleton divergence (d) and variant-type (v) vectors.
random_ds = numpy.array(random_ds)
ds = numpy.array(ds)
vs = numpy.array(vs)

sys.stderr.write("Done!\n")

print len(ds), "total singletons"
# v > 0.5 appears to mark 1D (nonsynonymous) sites and v < 0.5 4D
# (synonymous) -- presumably v encodes variant type; confirm upstream.
print(vs > 0.5).sum(), "1D"
print(vs < 0.5).sum(), "4D"

# Now plot them.

# CDFs (1 - normalized survival) of divergence: random null vs observed.
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(random_ds)
d_axis.step(xs, 1 - ns * 1.0 / ns[0], '-', color='0.7')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(ds)
d_axis.step(xs, 1 - ns * 1.0 / ns[0], 'b-')

d_axis.semilogx([1e-05, 2e-05], [1, 1])

# Sweep divergence thresholds d* (log-spaced) to estimate the fraction of
# nonsynonymous singletons below each threshold (loop body truncated here).
dstars = numpy.logspace(-4, -2, 20)
fraction_nonsynonymous = []

for dstar in dstars:

    less_idxs = (ds <= dstar)

    if less_idxs.sum() > 1:
# ---- Esempio n. 13 (Example no. 13; score: 0) — scraped-snippet separator ----
    all_ngood = all_doubletons[idxs].astype(numpy.int32)
    all_nbad = (all_doubleton_opportunities[idxs] - all_ngood).astype(
        numpy.int32)
    all_p = all_doubletons.sum() * 1.0 / all_doubleton_opportunities.sum()

    bootstrapped_low_ps.extend(
        hypergeometric(low_ngood, low_nbad, sample_sizes) * 1.0 / sample_sizes)
    bootstrapped_all_ps.extend(
        hypergeometric(all_ngood, all_nbad, sample_sizes) * 1.0 / sample_sizes)
    bootstrapped_fake_low_ps.extend(
        binomial(sample_sizes, low_p) * 1.0 / sample_sizes)
    bootstrapped_fake_all_ps.extend(
        binomial(sample_sizes, all_p) * 1.0 / sample_sizes)

# Bootstrapped doubleton-sharing fractions as survival curves, normalized so
# each curve starts at 1: low-d_S pairs (red) vs all pairs (black).
xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    bootstrapped_low_ps, min_x=0, max_x=2)
sharing_axis.step(xs,
                  ns * 1.0 / ns[0],
                  'r-',
                  label='Low $d_S$ (matched)',
                  zorder=3)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
    bootstrapped_all_ps, min_x=0, max_x=2)
sharing_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='All (matched)', zorder=2)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_fake_low_ps, min_x=0,max_x=1)
#sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (pooled)',zorder=1,alpha=0.5)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(real_low_ps,
                                                                 min_x=0,
# ---- Esempio n. 14 (Example no. 14; score: 0) — scraped-snippet separator ----
# Debug output: the marker gene set and how many pan-genome genes matched it.
print marker_genes

print marker_gene_idxs.sum()

# Keep one sample per subject, and only those with sufficient marker coverage.
sample_idxs = (parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples)) * (marker_coverages >= min_coverage)

# Fractional prevalence of each gene across the retained samples
# (a gene counts as present at copynum >= 0.3).
prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(gene_depth_matrix[:, sample_idxs], marker_coverages[sample_idxs], min_copynum=0.3)

reference_prevalences = prevalences[reference_gene_idxs]
metaphlan2_prevalences = prevalences[metaphlan2_gene_idxs]
marker_prevalences = prevalences[marker_gene_idxs]

print marker_prevalences

# Unnormalized survival curves of prevalence for each gene category.
pangenome_xs, pangenome_survivals = stats_utils.calculate_unnormalized_survival_from_vector(prevalences, min_x=0, max_x=1)

reference_xs, reference_survivals = stats_utils.calculate_unnormalized_survival_from_vector(reference_prevalences, min_x=0, max_x=1)

metaphlan2_xs, metaphlan2_survivals = stats_utils.calculate_unnormalized_survival_from_vector(metaphlan2_prevalences, min_x=0, max_x=1)

marker_xs, marker_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_prevalences, min_x=0, max_x=1)

pylab.figure(1,figsize=(3.42,4))
pylab.title(species_name)

# Plotting of the curves themselves is currently disabled:
#pylab.step(pangenome_xs, pangenome_survivals/pangenome_survivals[0],label='Pan-genome')
#pylab.step(reference_xs, reference_survivals/reference_survivals[0],label='Reference')
#pylab.step(metaphlan2_xs, metaphlan2_survivals/metaphlan2_survivals[0],label='Metaphlan2')
#pylab.step(marker_xs, marker_survivals/marker_survivals[0],label='MIDAS Marker')
#pylab.ylim([1e-02,1])
# ---- Esempio n. 15 (Example no. 15; score: 0) — scraped-snippet separator ----
        print "d=", max_ds[i]
        print "Site", "Polymorphic", "Inconsistent"
        for variant_type in sorted(polymorphic_variant_types[i].keys()):
            variant_type, polymorphic_variant_types[i][
                variant_type], inconsistent_variant_types[i][variant_type]
        print ""

    pylab.figure(4, figsize=(3.42, 2))
    pylab.suptitle(species_name)

    for i in xrange(0, len(polymorphic_freqs)):

        if len(polymorphic_freqs[i]) == 0:
            continue

        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
            polymorphic_freqs[i])
        pylab.step(xs,
                   ns * 1.0 / ns[0],
                   '-',
                   label='Polymorphic ($d=%g$)' % max_ds[i])
        print 'Polymorphic (d=%g), n=%g' % (max_ds[i], ns[0])

    if len(inconsistent_freqs[1]) > 0:
        xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
            inconsistent_freqs[1])
        pylab.step(xs,
                   ns * 1.0 / ns[0],
                   'r-',
                   linewidth=2,
                   label=('Inconsistent ($d=%g$)' % max_ds[1]))