parser.add_argument(
    "--species",
    help="Name of specific species to run code on",
    default="Bacteroides_vulgatus_57955"
)
args = parser.parse_args()

# Pull the command-line settings into module-level names.
debug = args.debug
chunk_size = args.chunk_size
species_name = args.species

# Load subject and sample metadata
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = parse_HMP_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Only plot samples above a certain depth threshold that are "haploids"
snp_samples = diversity_utils.calculate_haploid_samples(species_name, debug=debug)

# Only consider one sample per person
snp_samples = snp_samples[
    parse_midas_data.calculate_unique_samples(
        subject_sample_map, sample_list=snp_samples
    )
]

# Check the sample count BEFORE announcing that we are proceeding; the
# "Proceeding" message used to be written both before and after this check,
# so a failing run logged "Proceeding ..." immediately followed by
# "Not enough haploid samples!".
if len(snp_samples) < min_sample_size:
    sys.stderr.write("Not enough haploid samples!\n")
    sys.exit(1)

sys.stderr.write("Proceeding with %d haploid samples!\n" % len(snp_samples))

# Load the gene sets for this species: core genes, non-shared reference genes
# and shared pangenome genes.
sys.stderr.write("Loading core genes...\n")
core_genes = core_gene_utils.parse_core_genes(species_name)
non_shared_genes = core_gene_utils.parse_non_shared_reference_genes(species_name)
shared_pangenome_genes = core_gene_utils.parse_shared_genes(species_name)
sys.stderr.write("Done! Core genome consists of %d genes\n" % len(core_genes))
sys.stderr.write(
    "%d shared genes and %d non-shared genes\n"
    % (len(shared_pangenome_genes), len(non_shared_genes))
)
sys.stderr.write("Done!\n")
sys.stderr.write("Loaded gene info for %d samples\n" % len(gene_samples))

# Copy number = gene depth / marker coverage.  Adding the boolean mask
# (marker_coverages == 0) puts a 1 in the denominator wherever the coverage
# is zero, so the division never divides by zero.
nonzero_marker_coverages = marker_coverages + (marker_coverages == 0)
gene_copynum_matrix = gene_depth_matrix * 1.0 / nonzero_marker_coverages

# Same ratio, but with depths clipped to [0.1, 1e09] and zero coverages
# replaced by 0.1 instead of 1.
clipped_marker_coverages = marker_coverages + 0.1 * (marker_coverages == 0)
clipped_gene_copynum_matrix = (
    numpy.clip(gene_depth_matrix, 0.1, 1e09) / clipped_marker_coverages
)

# Masks on copy number: "low" is <= 3, "good" is within [0.5, 3].
low_copynum_matrix = (gene_copynum_matrix <= 3)
# why isn't this till 2? NRG
good_copynum_matrix = (gene_copynum_matrix >= 0.5) * low_copynum_matrix

# Prevalences are computed only over unique samples whose marker coverage
# reaches min_coverage.
unique_gene_sample_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, gene_samples
)
prevalence_idxs = (unique_gene_sample_idxs) * (marker_coverages >= min_coverage)
prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, prevalence_idxs],
    marker_coverages[prevalence_idxs]
)

# Sorted copy of the prevalences; `prevalences` itself is left untouched.
pangenome_prevalences = numpy.sort(prevalences)

# Calculate matrix of number of genes that differ
sys.stderr.write("Calculating matrix of gene differences...\n")
(
    gene_gain_matrix,
    gene_loss_matrix,
    gene_opportunity_matrix
) = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss(
    gene_reads_matrix, gene_depth_matrix, marker_coverages
)
# Total differences = gains + losses.
gene_difference_matrix = gene_gain_matrix + gene_loss_matrix

# Now need to make the gene samples and snp samples match up
# Keep samples whose median coverage reaches min_coverage and whose pi is
# at most 1e-03.
desired_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]

# Load SNP information for species_name, restricted to the desired samples.
# Note that this rebinds `samples` to whatever parse_snps returns.
sys.stderr.write("Loading %s...\n" % species_name)
(
    samples,
    allele_counts_map,
    passed_sites_map,
    final_line_number
) = parse_midas_data.parse_snps(
    species_name, debug=debug, allowed_samples=desired_samples
)
sys.stderr.write("Done!\n")

pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D'
)

# Mask of unique samples and mask of samples whose diagonal entry of the
# average pi matrix is below 1e-03.
unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples
)
low_diversity_samples = (numpy.diag(avg_pi_matrix_syn) < 1e-03)

# `desired_samples` now holds the product of the two masks (no longer the
# sample names assigned at the top of this section).
desired_samples = unique_samples * low_diversity_samples

# Initialize distance bins for LD computations: 20 log-spaced bin edges
# running from 10^0 = 1 to 10^4.
distance_bins = numpy.logspace(0, 4, 20)

# Plotting locations are the edges without the last one, copied before the
# first and last edges are overwritten below.
distance_bin_locations = distance_bins[:-1].copy()

distance_bins[0] = 0.5    # smallest edge lowered to 0.5 to avoid edge effects
distance_bins[-1] = 1e09  # largest edge made very large to catch anything >10^4

binned_rsquared_numerators = numpy.zeros_like(distance_bin_locations)
# Coverage looked up in sample_coverage_map for every entry of `samples`,
# in the same order as `samples`.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples]
)

###############################################################
# Indexes for SNP samples that have high coverage             #
###############################################################

# Only keep samples above the coverage threshold, split by pi into a
# low-pi set (pi <= 1e-03) and a high-pi set (pi > 1e-03).
sufficient_coverage = (median_coverages >= min_coverage)
low_pi_snp_samples = samples[sufficient_coverage * (pis <= 1e-03)]
high_pi_snp_samples = samples[sufficient_coverage * (pis > 1e-03)]

# Calculate which idxs belong to unique samples. Remove any samples that are
# duplicates (i.e. multiple time pts).
unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, low_pi_snp_samples
)
low_pi_snp_samples = low_pi_snp_samples[unique_idxs]

unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, high_pi_snp_samples
)
# The mask was computed from high_pi_snp_samples, so it must index that same
# array. This line used to index low_pi_snp_samples, which replaced the
# high-pi set with low-pi samples.
high_pi_snp_samples = high_pi_snp_samples[unique_idxs]

####################################################
# Load gene coverage information for species_name
####################################################
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)

(
    low_pi_gene_samples,
    low_pi_gene_names,
    low_pi_gene_presence_matrix,
    low_pi_gene_depth_matrix,
    low_pi_marker_coverages,
    low_pi_gene_reads_matrix
) = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=low_pi_snp_samples
)

(
    high_pi_gene_samples,
    high_pi_gene_names,
    high_pi_gene_presence_matrix,
    high_pi_gene_depth_matrix,
    high_pi_marker_coverages,
    high_pi_gene_reads_matrix
) = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=high_pi_snp_samples
)