core_rev_difference_matrix = numpy.array([]) # all sites in core genes core_rev_opportunity_matrix = numpy.array([]) snp_mut_difference_matrix = numpy.array([]) # all sites in all genes snp_mut_opportunity_matrix = numpy.array([]) snp_rev_difference_matrix = numpy.array([]) # all sites in all genes snp_rev_opportunity_matrix = numpy.array([]) final_line_number = 0 while final_line_number >= 0: sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number) dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps( species_name, debug=debug, allowed_samples=snp_samples, chunk_size=chunk_size, initial_line_number=final_line_number) sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys())) # Calculate fixation matrix sys.stderr.write("Calculating matrix of snp differences...\n") # Synonymous (4D) chunk_syn_mut_difference_matrix, chunk_syn_rev_difference_matrix, chunk_syn_mut_opportunity_matrix, chunk_syn_rev_opportunity_matrix = diversity_utils.calculate_mutation_reversion_matrix( allele_counts_map, passed_sites_map, allowed_genes=core_genes, allowed_variant_types=set(['4D']))
# NOTE(review): this excerpt begins mid-call — the line below is the tail of a
# call (presumably computing same-subject index pairs) opened before this chunk.
    subject_sample_map, samples)

# Calculate the smaller and larger of the two pi estimates so we can look at correlation over time
lower_pis = numpy.fmin(clipped_pis[same_subject_idxs[0]], clipped_pis[same_subject_idxs[1]])
upper_pis = numpy.fmax(clipped_pis[same_subject_idxs[0]], clipped_pis[same_subject_idxs[1]])

# Only plot samples above a certain depth threshold that are "haploids"
# (elementwise boolean product acts as logical AND on the two masks).
desired_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]

###
# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
    species_name, debug=debug, allowed_samples=desired_samples)
sys.stderr.write("Done!\n")

pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')

# Keep low-diversity samples (diagonal = within-sample pi), restricted to one
# sample per subject via calculate_unique_samples.
low_diversity_samples = (numpy.diag(avg_pi_matrix_syn) < 1e-03)

unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)

desired_samples = unique_samples * low_diversity_samples

# initialize distance bins for LD computations
distance_bins = numpy.logspace(
    0, 4, 20
# NOTE(review): call truncated here — the closing paren / remaining arguments
# lie outside this excerpt.
# Load genomic coverage distributions sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution( species_name, combination_type="sample") median_coverages = numpy.array([ stats_utils.calculate_median_from_histogram(sample_coverage_histogram) for sample_coverage_histogram in sample_coverage_histograms ]) sample_coverage_map = { samples[i]: median_coverages[i] for i in xrange(0, len(samples)) } # Load SNP information for species_name sys.stderr.write("Loading %s...\n" % species_name) samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps( species_name, combination_type="sample", debug=False) sys.stderr.write("Done!\n") median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))]) # Calculate full matrix of synonymous pairwise differences sys.stderr.write("Calculate synonymous pi matrix...\n") pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix( allele_counts_map, passed_sites_map, variant_type='4D', allowed_genes=metaphlan2_genes) # Calculate fixation matrix fixation_matrix_syn = diversity_utils.calculate_fixation_matrix(
# Per-variant-class site-frequency spectra and pi-weighted counts,
# accumulated across the chunk loop below.
synonymous_sfs = []
nonsynonymous_sfs = []
synonymous_count_sfs = []
nonsynonymous_count_sfs = []

synonymous_pi_weighted_counts = 0
nonsynonymous_pi_weighted_counts = 0

final_line_number = 0

# Stream the SNP table in fixed-size chunks to bound memory usage.
# NOTE(review): loop assumes parse_snps returns a negative final_line_number
# at end of file — confirm in parse_midas_data.
while final_line_number >= 0:

    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    snp_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name, debug=debug, allowed_variant_types=allowed_variant_types,
        allowed_samples=largest_clade_samples, allowed_genes=core_genes,
        chunk_size=chunk_size, initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(
        allele_counts_map, passed_sites_map, allowed_genes=core_genes,
        min_change=min_change)
    sys.stderr.write("Done!\n")

    # Lazily size the running totals from the first chunk's matrix shape;
    # the *1.0 promotes the integer zeros to float.
    if snp_difference_matrix.shape[0] == 0:
        snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix) * 1.0
        snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix) * 1.0

        synonymous_difference_matrix = numpy.zeros_like(snp_difference_matrix)
        synonymous_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)

        nonsynonymous_difference_matrix = numpy.zeros_like(snp_difference_matrix)
        nonsynonymous_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)
        # NOTE(review): loop body continues beyond this excerpt.
# Median coverages of the retained (deep, low-pi "haploid") samples; the mask
# mirrors the one used to build desired_samples.
desired_median_coverages = median_coverages[(median_coverages >= min_coverage) * (pis <= 1e-03)]

# Skip species with too few haploid samples.
# NOTE(review): `continue` targets a per-species loop enclosing this excerpt.
if len(desired_samples) < 2:
    sys.stderr.write("Too few haploid samples for %s.\n" % species_name)
    continue
else:
    sys.stderr.write("Analyzing %d haploid samples...\n" % len(desired_samples))

species_idx += 1

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
dummy_samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, debug=debug, allowed_samples=desired_samples)
sys.stderr.write("Done!\n")

# Calculate fixation matrices
sys.stderr.write("Calculating 4D fixation matrix...\n")
fixation_matrix_syn, fixation_opportunities_syn = diversity_utils.calculate_fixation_matrix(
    allele_counts_map, passed_sites_map, allowed_variant_types=set(['4D']),
    min_change=min_change)

sys.stderr.write("Calculating 1D fixation matrix...\n")
fixation_matrix_non, fixation_opportunities_non = diversity_utils.calculate_fixation_matrix(
    allele_counts_map, passed_sites_map, allowed_variant_types=set(['1D']),
    min_change=min_change)

sys.stderr.write("Calculating total fixation matrix...\n")
fixation_matrix_all, fixation_opportunities_all = diversity_utils.calculate_fixation_matrix(
    allele_counts_map, passed_sites_map, min_change=min_change)
sys.stderr.write("Done!\n")

# Calculate fraction nonsynonymous
# NOTE(review): elementwise division with no zero-opportunity guard — entries
# with zero opportunities produce warnings/NaNs.
dN = fixation_matrix_non / fixation_opportunities_non
dS = fixation_matrix_syn / fixation_opportunities_syn
import parse_midas_data import pylab import sys import numpy from calculate_pi_matrix import calculate_self_pis import os species=sys.argv[1] data_directory = os.path.expanduser("~/ben_nandita_hmp_data/") analysis_directory = os.path.expanduser("~/ben_nandita_hmp_analysis/") default_directory_prefix = data_directory print species sys.stderr.write("Loading %s...\n" % species) samples, allele_counts_syn, locations_syn, genes_syn, passed_sites_syn, allele_counts_non, locations_non, genes_non, passed_sites_non = parse_midas_data.parse_snps(species, site_depth_threshold=15, directory_prefix=default_directory_prefix) sys.stderr.write("Done!\n") sys.stderr.write("Calculating pis...\n") piS = calculate_self_pis(allele_counts_syn) piS /= (passed_sites_syn+(passed_sites_syn==0)) sys.stderr.write("Done!\n") for sample,pi in zip(samples,piS): print sample,pi
# NOTE(review): excerpt begins inside an argv-parsing if/else whose opening
# branch (and the enclosing condition) lies outside this excerpt.
    debug = False
    species_name = sys.argv[1]
else:
    sys.stderr.write("Usage: python command.py [debug] species_name")

########################################################################################

# Minimum allele-frequency change for a site to count as a fixation.
min_change = 0.8

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
snp_samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, debug)
sys.stderr.write("Done!\n")

# Calculate full matrix of synonymous pairwise differences
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')

# Clip pi into [1e-06, 1] — presumably to avoid zeros before later
# log-scale plotting / division; confirm against downstream use.
pi_matrix_syn = numpy.clip(pi_matrix_syn, 1e-06, 1)
avg_pi_matrix_syn = numpy.clip(avg_pi_matrix_syn, 1e-06, 1)

# Load gene presence/absence information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name)
sys.stderr.write("Done!\n")
# Non-interactive backend: render to files, no display needed.
# NOTE(review): `import matplotlib` must precede this excerpt.
matplotlib.use('Agg')
import parse_midas_data
import pylab
import sys
import numpy

import diversity_utils

species = sys.argv[1]

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species)
samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
    species, debug=False)
sys.stderr.write("Done!\n")

sys.stderr.write("Calculating synonymous SFS...\n")

# calculate SFS
# NOTE(review): allowed_variant_types is a list here but a set elsewhere in
# this codebase — confirm diversity_utils accepts either.
pooled_freqs = diversity_utils.calculate_pooled_freqs(
    allele_counts_map, passed_sites_map, allowed_variant_types=['4D'])
# Fold the spectrum: keep the minor-allele frequency (<= 0.5).
pooled_freqs = numpy.fmin(pooled_freqs, 1 - pooled_freqs)

# 50 bins over [0, 0.5], shifted half a bin-width so bin centers fall on
# multiples of 0.01; xs are the resulting bin midpoints.
bins = numpy.linspace(0, 0.5, 51)
bins -= (bins[1] - bins[0]) / 2
xs = bins[1:] - (bins[1] - bins[0]) / 2

sfs_syn, dummy = numpy.histogram(pooled_freqs, bins=bins)

sys.stderr.write("Calculating nonsynonymous SFS...\n")
# NOTE(review): the nonsynonymous computation continues beyond this excerpt.