core_rev_difference_matrix += chunk_core_rev_difference_matrix
            core_rev_opportunity_matrix += chunk_core_rev_opportunity_matrix

            # Add all
            snp_mut_difference_matrix += chunk_snp_mut_difference_matrix
            snp_mut_opportunity_matrix += chunk_snp_mut_opportunity_matrix
            snp_rev_difference_matrix += chunk_snp_rev_difference_matrix
            snp_rev_opportunity_matrix += chunk_snp_rev_opportunity_matrix

            snp_samples = dummy_samples

        # Now calculate gene differences
        # Load gene coverage information for species_name
        sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
        gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(
            species_name,
            allowed_samples=snp_samples,
            disallowed_genes=shared_pangenome_genes)
        sys.stderr.write("Done! Loaded %d genes\n" % len(gene_names))

        gene_sample_list = list(gene_samples)
        gene_sample_set = set(gene_samples)

        # Calculate matrix of number of genes that differ
        sys.stderr.write("Calculating matrix of gene differences...\n")

        gene_gain_matrix, gene_loss_matrix, gene_opportunity_matrix = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss(
            gene_reads_matrix, gene_depth_matrix, marker_coverages)

        good_marker_coverages = (marker_coverages >= min_coverage)

        gene_gain_matrix = gene_gain_matrix * good_marker_coverages[:,
# Keep only samples whose median coverage reaches min_coverage, and split them
# at pi = 1e-03: the low-pi group (pis <= 1e-03; presumably the "haploids" the
# plots focus on -- confirm) and the high-pi group (pis > 1e-03).
low_pi_snp_samples = samples[(pis <= 1e-03) & (median_coverages >= min_coverage)]
high_pi_snp_samples = samples[(pis > 1e-03) & (median_coverages >= min_coverage)]

# Calculate which pairs of indices belong to the same sample, which to the
# same subject, and which to different subjects (this step is currently commented out):
#snp_same_sample_idxs, snp_same_subject_idxs, snp_diff_subject_idxs = parse_midas_data.calculate_subject_pairs(subject_sample_map, snp_samples)

####################################################
# Load pangenome (gene coverage) data for species_name,
# once per sample group: low-pi first, then high-pi
####################################################
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)

# parse_pangenome_data is unpacked into six values per call; the allowed_samples
# argument restricts the load to the given sample group.
(low_pi_gene_samples,
 low_pi_gene_names,
 low_pi_gene_presence_matrix,
 low_pi_gene_depth_matrix,
 low_pi_marker_coverages,
 low_pi_gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
    species_name,
    allowed_samples=low_pi_snp_samples)

(high_pi_gene_samples,
 high_pi_gene_names,
 high_pi_gene_presence_matrix,
 high_pi_gene_depth_matrix,
 high_pi_marker_coverages,
 high_pi_gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
    species_name,
    allowed_samples=high_pi_snp_samples)

sys.stderr.write("Done!\n")

# Every pangenome gene name for species_name, regardless of prevalence.
# load_pangenome_genes is unpacked into two values; the second one is kept
# as new_species_names.
gene_names, new_species_names = parse_midas_data.load_pangenome_genes(species_name)

# Turn gene_names into a list (per the original note it arrives as a set).
gene_names = list(gene_names)

###############################################
# Load KEGG information
###############################################
print len(metaphlan2_genes), "metaphlan2 genes"

metaphlan2_gene_coverages = []
for gene in metaphlan2_genes:
    if gene in gene_coverages:
        metaphlan2_gene_coverages.append( gene_coverages[gene] )
metaphlan2_gene_coverages = numpy.array(metaphlan2_gene_coverages)

median_metaphlan2_coverages = numpy.median(metaphlan2_gene_coverages,axis=0)
#mean_metaphlan2_coverages = metaphlan2_gene_coverages.mean(axis=0)
mean_metaphlan2_coverages = (metaphlan2_gene_coverages*(metaphlan2_gene_coverages>=1)).sum(axis=0)/((metaphlan2_gene_coverages>=1).sum(axis=0))


# Load the pangenome (gene coverage) data for species_name; no sample or gene
# filters are passed to the loader here.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
(gene_samples,
 gene_names,
 gene_presence_matrix,
 gene_depth_matrix,
 marker_coverages,
 gene_reads_matrix) = parse_midas_data.parse_pangenome_data(species_name)
sys.stderr.write("Done!\n")

# Look-up table: gene sample -> the marker coverage stored at the same index.
midas_marker_coverage_map = {}
for i in xrange(len(gene_samples)):
    midas_marker_coverage_map[gene_samples[i]] = marker_coverages[i]
    
# Figure 1 is created as a wide, short canvas (22 x 2).
pylab.figure(1, figsize=(22, 2))

# Marker gene coverage distribution for species_name; it is indexed by two
# levels of keys below.
marker_gene_coverages = parse_midas_data.parse_marker_gene_coverage_distribution(
    species_name)

# Accumulators, still empty at this point.
max_coverages = []
median_coverages = []

# Sorted inner keys of whichever outer entry comes first; these are used as
# the marker gene names.
marker_genes = sorted(marker_gene_coverages[list(marker_gene_coverages)[0]])
    sys.stderr.write("Done!\n")
#
    if snp_difference_matrix.shape[0]==0:
        snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix)*1.0
        snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)*1.0
#    
    snp_difference_matrix += chunk_snp_difference_matrix
    snp_opportunity_matrix += chunk_snp_opportunity_matrix


# Substitution rate = differences / opportunities, computed in floating point.
# Wherever the opportunity count is 0 the boolean mask adds 1 to the
# denominator, so those entries divide by 1 instead of by zero.
snp_substitution_rate = (snp_difference_matrix * 1.0) / (
    (snp_opportunity_matrix == 0) + snp_opportunity_matrix)
sys.stderr.write("Done!\n")

# Load the pangenome (gene coverage) data for species_name, restricted to
# snp_samples and with convert_centroid_names switched off.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
(gene_samples,
 gene_names,
 gene_presence_matrix,
 gene_depth_matrix,
 marker_coverages,
 gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
    species_name,
    allowed_samples=snp_samples,
    convert_centroid_names=False)
sys.stderr.write("Done!\n")

# Samples that feed the prevalence estimate: the result of
# calculate_unique_samples (presumably a per-sample mask -- confirm) multiplied
# by the "marker coverage >= min_coverage" mask.
prevalence_idxs = (
    parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples)
    * (marker_coverages >= min_coverage))

# Fractional gene prevalences computed on the selected sample columns only.
prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, prevalence_idxs],
    marker_coverages[prevalence_idxs])

# Sorted copy of the prevalences; `prevalences` itself keeps its order.
pangenome_prevalences = numpy.array(prevalences, copy=True)
pangenome_prevalences.sort()
        
# Gene-content differences between samples: the helper is unpacked into a
# difference matrix and its matching opportunity matrix, and is called with
# min_log2_fold_change=4.
sys.stderr.write("Calculating matrix of gene differences...\n")
(gene_difference_matrix,
 gene_opportunity_matrix) = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
    gene_depth_matrix,
    marker_coverages,
    min_log2_fold_change=4)

##############################################################
# Now need to make the gene samples and snp samples match up #