core_rev_difference_matrix = numpy.array([])  # all sites in core genes
        core_rev_opportunity_matrix = numpy.array([])

        # NOTE(review): scraped fragment -- the enclosing scope that explains
        # the 8-space indentation (and that defines species_name, debug,
        # snp_samples, chunk_size and core_genes) is not visible here, and the
        # first line above has lost its leading indentation in the scrape.

        # Accumulators for SNP difference / opportunity counts over all genes.
        # They start as empty arrays; presumably they are (re)sized from the
        # first chunk's matrices further down the loop -- TODO confirm against
        # the unseen remainder of the while-body.
        snp_mut_difference_matrix = numpy.array([])  # all sites in all genes
        snp_mut_opportunity_matrix = numpy.array([])
        snp_rev_difference_matrix = numpy.array([])  # all sites in all genes
        snp_rev_opportunity_matrix = numpy.array([])

        # Stream the SNP table in chunks: the loop runs until parse_snps
        # reports a negative final_line_number.
        final_line_number = 0
        while final_line_number >= 0:

            sys.stderr.write("Loading chunk starting @ %d...\n" %
                             final_line_number)
            # Load the next chunk of per-site allele counts, resuming from the
            # line where the previous chunk stopped.
            dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
                species_name,
                debug=debug,
                allowed_samples=snp_samples,
                chunk_size=chunk_size,
                initial_line_number=final_line_number)
            sys.stderr.write("Done! Loaded %d genes\n" %
                             len(allele_counts_map.keys()))

            # Calculate fixation matrix
            sys.stderr.write("Calculating matrix of snp differences...\n")
            # Synonymous (4D)

            # Per-chunk mutation/reversion difference and opportunity matrices,
            # restricted to core genes and fourfold-degenerate (4D) sites.
            chunk_syn_mut_difference_matrix, chunk_syn_rev_difference_matrix, chunk_syn_mut_opportunity_matrix, chunk_syn_rev_opportunity_matrix = diversity_utils.calculate_mutation_reversion_matrix(
                allele_counts_map,
                passed_sites_map,
                allowed_genes=core_genes,
                allowed_variant_types=set(['4D']))
# ======== Code example #2 (snippet boundary from scraped source; original marker: "コード例 #2" / "0") ========
    subject_sample_map, samples)
# NOTE(review): the line above is the orphaned tail of a call whose beginning
# was lost in the scrape -- by analogy with the identical call below it was
# presumably parse_midas_data.calculate_unique_samples(...); verify against
# the original script.

# Calculate the smaller and larger of the two pi estimates so we can look at correlation over time
lower_pis = numpy.fmin(clipped_pis[same_subject_idxs[0]],
                       clipped_pis[same_subject_idxs[1]])
upper_pis = numpy.fmax(clipped_pis[same_subject_idxs[0]],
                       clipped_pis[same_subject_idxs[1]])

# Only plot samples above a certain depth threshold that are "haploids"
# (boolean-mask multiplication acts as an elementwise AND of the two filters).
desired_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]

###

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
    species_name, debug=debug, allowed_samples=desired_samples)
sys.stderr.write("Done!\n")

# Pairwise synonymous (4D) diversity between the retained samples.
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')

# Diagonal of avg_pi_matrix_syn is the per-sample estimate; keep samples whose
# within-sample diversity is below 1e-03.
low_diversity_samples = (numpy.diag(avg_pi_matrix_syn) < 1e-03)

unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)

# Elementwise AND of the two boolean masks: unique AND low-diversity samples.
desired_samples = unique_samples * low_diversity_samples

# initialize distance bins for LD computations
# Fix: the original call was missing its closing parenthesis (truncated in the
# scrape). 20 logarithmically spaced bin edges from 10^0 = 1 to 10^4.
distance_bins = numpy.logspace(0, 4, 20)
# Load genomic coverage distributions
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name, combination_type="sample")
# Per-sample median coverage, computed from each sample's coverage histogram.
median_coverages = numpy.array([
    stats_utils.calculate_median_from_histogram(sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])

# Map sample id -> median coverage so coverages can be re-looked-up after the
# sample list is reordered/replaced by parse_snps below. (xrange: Python 2.)
sample_coverage_map = {
    samples[i]: median_coverages[i]
    for i in xrange(0, len(samples))
}

# Load SNP information for species_name
# NOTE(review): here parse_snps is unpacked into 3 values, whereas other
# snippets in this file unpack 4 (with final_line_number) -- these snippets
# come from different versions of the codebase; verify the signature.
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")

# Re-align median coverages with the (possibly reordered) sample list returned
# by parse_snps.
median_coverages = numpy.array(
    [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

# Calculate full matrix of synonymous pairwise differences
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    allowed_genes=metaphlan2_genes)

# Calculate fixation matrix
# NOTE(review): the call below is truncated mid-argument-list by the scrape;
# its arguments (and everything after it) are not visible here.
fixation_matrix_syn = diversity_utils.calculate_fixation_matrix(
# ======== Code example #4 (snippet boundary from scraped source; original marker: "コード例 #4" / "0") ========
# Accumulators for site-frequency spectra and pi-weighted counts, split by
# synonymous vs nonsynonymous sites. Filled further down the (truncated) loop.
synonymous_sfs = []
nonsynonymous_sfs = []

synonymous_count_sfs = []
nonsynonymous_count_sfs = []

synonymous_pi_weighted_counts = 0
nonsynonymous_pi_weighted_counts = 0


# Stream the SNP table in chunks; parse_snps reports a negative
# final_line_number once the last chunk has been consumed.
final_line_number = 0
while final_line_number >= 0:

    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    # species_name, debug, allowed_variant_types, largest_clade_samples,
    # core_genes and chunk_size are defined above the visible fragment.
    snp_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(species_name, debug=debug, allowed_variant_types=allowed_variant_types, allowed_samples=largest_clade_samples,allowed_genes=core_genes, chunk_size=chunk_size,initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map, allowed_genes=core_genes, min_change=min_change)
    sys.stderr.write("Done!\n")

    # First chunk: size the running totals to match the chunk matrices.
    # NOTE(review): snp_difference_matrix must be initialized (to an empty
    # array) above the visible fragment for this shape test to work.
    if snp_difference_matrix.shape[0]==0:
        # "*1.0" presumably promotes the zero matrices to float so later
        # accumulation/division is done in floating point -- TODO confirm the
        # chunk matrices are integer-typed.
        snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix)*1.0
        snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)*1.0
        synonymous_difference_matrix = numpy.zeros_like(snp_difference_matrix)
        synonymous_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)
        nonsynonymous_difference_matrix = numpy.zeros_like(snp_difference_matrix)
        nonsynonymous_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)
# ======== Code example #5 (snippet boundary from scraped source; original marker: "コード例 #5" / "0") ========
    # NOTE(review): this fragment is the interior of a per-species for-loop
    # that is not visible here ("continue" and species_idx belong to it);
    # median_coverages, min_coverage, pis, desired_samples, species_name,
    # debug and min_change are all defined above the visible fragment.
    desired_median_coverages = median_coverages[(median_coverages>=min_coverage)*(pis<=1e-03)]

    # Need at least two haploid samples to compute pairwise fixations.
    if len(desired_samples) < 2:
        sys.stderr.write("Too few haploid samples for %s.\n" % species_name)
        continue
    else:
        sys.stderr.write("Analyzing %d haploid samples...\n" % len(desired_samples))

    species_idx += 1




    # Load SNP information for species_name
    sys.stderr.write("Loading %s...\n" % species_name)
    dummy_samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(species_name, debug=debug, allowed_samples=desired_samples)
    sys.stderr.write("Done!\n")

    # Calculate fixation matrices
    sys.stderr.write("Calculating 4D fixation matrix...\n")
    fixation_matrix_syn, fixation_opportunities_syn = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map, allowed_variant_types=set(['4D']), min_change=min_change)
    sys.stderr.write("Calculating 1D fixation matrix...\n")
    fixation_matrix_non, fixation_opportunities_non = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map, allowed_variant_types=set(['1D']), min_change=min_change)
    sys.stderr.write("Calculating total fixation matrix...\n")
    fixation_matrix_all, fixation_opportunities_all = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map, min_change=min_change)

    sys.stderr.write("Done!\n")

    # Calculate fraction nonsynonymous
    # NOTE(review): no zero-guard on the opportunity matrices here (unlike the
    # "(x+(x==0))" idiom used elsewhere in this file) -- zero opportunities
    # would produce divide warnings / inf entries; confirm intended.
    dN = fixation_matrix_non/fixation_opportunities_non
    dS = fixation_matrix_syn/fixation_opportunities_syn
import parse_midas_data
import pylab
import sys
import numpy
from calculate_pi_matrix import calculate_self_pis
import os
# Driver: load per-site SNP data for one species and print each sample's
# synonymous diversity estimate (piS) to stdout.
# NOTE(review): Python 2 code (print statements); keep as-is unless the whole
# file is ported.
species=sys.argv[1]

data_directory = os.path.expanduser("~/ben_nandita_hmp_data/")
analysis_directory = os.path.expanduser("~/ben_nandita_hmp_analysis/")

default_directory_prefix =  data_directory

print species

sys.stderr.write("Loading %s...\n" % species)

# Minimum site depth of 15 reads for a site to count as "passed".
samples, allele_counts_syn, locations_syn, genes_syn, passed_sites_syn, allele_counts_non, locations_non, genes_non, passed_sites_non = parse_midas_data.parse_snps(species, site_depth_threshold=15, directory_prefix=default_directory_prefix)

sys.stderr.write("Done!\n")

sys.stderr.write("Calculating pis...\n")
piS = calculate_self_pis(allele_counts_syn)
# Normalize by the number of passed sites; "(x+(x==0))" adds 1 exactly where
# the count is 0 so the in-place division never divides by zero.
piS /= (passed_sites_syn+(passed_sites_syn==0))
sys.stderr.write("Done!\n")

# One "sample piS" line per sample on stdout.
for sample,pi in zip(samples,piS):
    print sample,pi
# ======== Code example #7 (snippet boundary from scraped source; original marker: "コード例 #7" / "0") ========
        debug = False
        species_name = sys.argv[1]
else:
    sys.stderr.write("Usage: python command.py [debug] species_name")
########################################################################################

# Minimum allele-frequency change for a SNP difference to count as a fixation.
min_change = 0.8

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load SNP information for species_name
# NOTE(review): debug is passed positionally here, by keyword elsewhere in
# this file -- confirm it maps to the same parameter.
sys.stderr.write("Loading %s...\n" % species_name)
snp_samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, debug)
sys.stderr.write("Done!\n")

# Calculate full matrix of synonymous pairwise differences
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')

# Clip to [1e-06, 1] -- presumably to bound values away from zero for
# log-scale plotting downstream; confirm against the plotting code.
pi_matrix_syn = numpy.clip(pi_matrix_syn, 1e-06, 1)
avg_pi_matrix_syn = numpy.clip(avg_pi_matrix_syn, 1e-06, 1)

# Load gene presence/absence information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name)
sys.stderr.write("Done!\n")
# ======== Code example #8 (snippet boundary from scraped source; original marker: "コード例 #8" / "0") ========
# Select the non-interactive Agg backend before pylab is imported.
# NOTE(review): "import matplotlib" itself must appear above the visible
# fragment -- it was lost in the scrape.
matplotlib.use('Agg')
import parse_midas_data
import pylab
import sys
import numpy
import diversity_utils
species = sys.argv[1]

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species)
samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
    species, debug=False)
sys.stderr.write("Done!\n")

sys.stderr.write("Calculating synonymous SFS...\n")
# calculate SFS
# NOTE(review): variant types are passed as a list here but as set(['4D'])
# elsewhere in this file -- confirm the helper accepts both.
pooled_freqs = diversity_utils.calculate_pooled_freqs(
    allele_counts_map, passed_sites_map, allowed_variant_types=['4D'])
# Fold each frequency onto the minor allele: min(f, 1 - f).
pooled_freqs = numpy.fmin(pooled_freqs, 1 - pooled_freqs)
# Histogram bins for the folded frequencies: 51 edges spanning [0, 0.5],
# then shifted left by half a bin so the bin centers (xs) land exactly on
# 0.00, 0.01, ..., 0.49.
edge_grid = numpy.linspace(0, 0.5, 51)
half_width = (edge_grid[1] - edge_grid[0]) / 2
bins = edge_grid - half_width
xs = bins[1:] - half_width

# Synonymous SFS: count of folded frequencies per bin (bin edges from above).
sfs_syn, dummy = numpy.histogram(pooled_freqs, bins=bins)

# NOTE(review): the fragment is truncated here -- the nonsynonymous SFS
# computation announced below is not visible.
sys.stderr.write("Calculating nonsynonymous SFS...\n")