snp_substitution_rate = snp_difference_matrix * 1.0 / (
    snp_opportunity_matrix + (snp_opportunity_matrix == 0))
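# (Adding the boolean mask "(snp_opportunity_matrix == 0)" to the denominator
# guards against division by zero: pairs with no opportunities divide by 1
# instead, and their difference count is necessarily 0, so the rate stays 0
# rather than becoming NaN.)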
sys.stderr.write("Done!\n")

# Load gene coverage information for species_name
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=snp_samples)
sys.stderr.write("Done!\n")

sys.stderr.write("Loaded gene info for %d samples\n" % len(gene_samples))

gene_copynum_matrix = gene_depth_matrix * 1.0 / (marker_coverages +
                                                 (marker_coverages == 0))
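# Copy number near 1 indicates a single-copy gene. An illustrative presence
# call from copy number (the 0.3 threshold is an assumption, not from this
# script; parse_pangenome_data already returns gene_presence_matrix):
example_gene_presence = (gene_copynum_matrix > 0.3)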

prevalence_idxs = (parse_midas_data.calculate_unique_samples(
    subject_sample_map, gene_samples)) * (marker_coverages >= min_coverage)

prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, prevalence_idxs], marker_coverages[prevalence_idxs])

pangenome_prevalences = numpy.array(prevalences, copy=True)
pangenome_prevalences.sort()
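# The sorted copy gives an empirical prevalence distribution; e.g., the
# fraction of pangenome genes found in at least half of the unique hosts
# (illustrative only, not part of the original lines):
fraction_common_genes = (pangenome_prevalences >= 0.5).mean()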

# Calculate matrix of number of genes that differ
sys.stderr.write("Calculating matrix of gene differences...\n")
gene_gain_matrix, gene_loss_matrix, gene_opportunity_matrix = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss(
    gene_depth_matrix,
    marker_coverages,
    min_log2_fold_change=4,
    include_high_copynum=include_high_copynum)
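# Total gene-content differences are gains plus losses; normalizing by the
# opportunity matrix mirrors the SNP substitution rate above (assumed
# convention, not part of the original lines):
gene_difference_matrix = gene_gain_matrix + gene_loss_matrix
gene_difference_rate = gene_difference_matrix * 1.0 / (
    gene_opportunity_matrix + (gene_opportunity_matrix == 0))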
#
# Only plot samples above a certain depth threshold that are "haploids"
snp_samples = diversity_utils.calculate_haploid_samples(species_name,
                                                        debug=debug)

# Only consider samples from isolates
# ('isolates' is assumed to be defined earlier in the script)
snp_samples_isolates = [sample for sample in snp_samples if sample in isolates]

snp_samples = numpy.asarray(snp_samples_isolates)

# Only consider one sample per person
snp_samples = snp_samples[parse_midas_data.calculate_unique_samples(
    subject_sample_map, sample_list=snp_samples)]
sys.stderr.write("Proceeding with %d haploid samples!\n" % len(snp_samples))
#
sys.stderr.write("Loading pre-computed substitution rates for %s...\n" %
                 species_name)
substitution_rate_map = calculate_substitution_rates.load_substitution_rate_map(
    species_name)
sys.stderr.write("Calculating matrix...\n")
dummy_samples, snp_difference_matrix, snp_opportunity_matrix = calculate_substitution_rates.calculate_matrices_from_substitution_rate_map(
    substitution_rate_map, 'core', allowed_samples=snp_samples)
snp_samples = dummy_samples
sys.stderr.write("Done!\n")
#
# Same zero-opportunity guard as above
snp_substitution_rate = snp_difference_matrix * 1.0 / (
    snp_opportunity_matrix + (snp_opportunity_matrix == 0))
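# The rate matrix is symmetric with zeros on the diagonal; downstream plots
# typically use only the upper-triangle sample pairs. A minimal sketch (not
# part of the original script):
pair_idxs = numpy.triu_indices(snp_substitution_rate.shape[0], 1)
pairwise_snp_substitution_rates = snp_substitution_rate[pair_idxs]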

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, debug)
sys.stderr.write("Done!\n")
# Force float division and guard against zero opportunities, as above
pis = total_pis * 1.0 / (total_pi_opportunities + (total_pi_opportunities == 0))

median_coverages = numpy.array(
    [sample_coverage_map[sample] for sample in samples])

# Only plot samples above a certain depth threshold that are "haploids"
snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]
# Restrict to a single timepoint per person
unique_subject_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, snp_samples)
snp_samples = snp_samples[unique_subject_idxs]

# Analyze SNPs, looping over chunks of the SNP file.
# Clunky, but necessary to limit memory usage on the cluster

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)

genotype_matrix = numpy.array([])
passed_sites_matrix = numpy.array([])
snp_difference_matrix = numpy.array([])
snp_opportunity_matrix = numpy.array([])
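# These accumulators start empty and are sized/filled chunk by chunk in the
# loop below.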

final_line_number = 0
while final_line_number >= 0:
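    # NOTE: the loop body was missing here. What follows is a sketch of the
    # chunked-loading pattern used by sibling scripts in this repo; it assumes
    # parse_midas_data.parse_snps returns -1 as the final line number once the
    # file is exhausted (terminating the loop), that
    # diversity_utils.calculate_fixation_matrix behaves as it does elsewhere,
    # and that chunk_size and min_change are defined earlier in the script.
    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name, debug=debug, allowed_samples=snp_samples,
        chunk_size=chunk_size, initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Calculate SNP differences/opportunities for this chunk and accumulate
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(
        allele_counts_map, passed_sites_map, min_change=min_change)
    sys.stderr.write("Done!\n")

    # Size the accumulators from the first chunk
    if snp_difference_matrix.shape[0] == 0:
        snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix) * 1.0
        snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix) * 1.0

    snp_difference_matrix += chunk_snp_difference_matrix
    snp_opportunity_matrix += chunk_snp_opportunity_matrix
    snp_samples = dummy_samples
    # (genotype_matrix / passed_sites_matrix would be stacked per-chunk here
    # in the variants of this script that need them.)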
median_coverages = numpy.array(
    [sample_coverage_map[sample] for sample in samples])

###############################################################
# Indexes for SNP samples that have high coverage #
###############################################################

# Only plot samples above a certain depth threshold that are "haploids"
low_pi_snp_samples = samples[(median_coverages >= min_coverage) *
                             (pis <= 1e-03)]
high_pi_snp_samples = samples[(median_coverages >= min_coverage) *
                              (pis > 1e-03)]
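# pi <= 1e-03 is used throughout this script as the operational cutoff for
# "haploid" (low within-sample diversity) hosts; the high-pi group captures
# putatively mixed-strain hosts.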

# Calculate which idxs correspond to unique subjects; remove any samples that are duplicates (i.e., multiple timepoints from the same person)
unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, low_pi_snp_samples)
low_pi_snp_samples = low_pi_snp_samples[unique_idxs]

unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, high_pi_snp_samples)
high_pi_snp_samples = high_pi_snp_samples[unique_idxs]

####################################################
# Load gene coverage information for species_name
####################################################
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
low_pi_gene_samples, low_pi_gene_names, low_pi_gene_presence_matrix, low_pi_gene_depth_matrix, low_pi_marker_coverages, low_pi_gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=low_pi_snp_samples)

high_pi_gene_samples, high_pi_gene_names, high_pi_gene_presence_matrix, high_pi_gene_depth_matrix, high_pi_marker_coverages, high_pi_gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=high_pi_snp_samples)
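
# A natural next step mirroring the earlier copy-number computation (assumed,
# not part of the original lines): normalize each group's gene depths by its
# marker coverage.
low_pi_gene_copynum_matrix = low_pi_gene_depth_matrix * 1.0 / (
    low_pi_marker_coverages + (low_pi_marker_coverages == 0))
high_pi_gene_copynum_matrix = high_pi_gene_depth_matrix * 1.0 / (
    high_pi_marker_coverages + (high_pi_marker_coverages == 0))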