# Read the pangenome tables for species_name through
# parse_midas_data.parse_pangenome_data, passing snp_samples as the
# allowed_samples filter.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
(gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix,
 marker_coverages, gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
     species_name, allowed_samples=snp_samples)
sys.stderr.write("Done!\n")

sys.stderr.write("Loaded gene info for %d samples\n" % len(gene_samples))

# Gene depth divided by marker coverage. Wherever marker_coverages is zero the
# comparison adds 1 to the denominator, so the division never hits zero.
# NOTE(review): presumably numpy arrays with samples along the last axis of
# gene_depth_matrix -- confirm against parse_pangenome_data.
gene_copynum_matrix = (
    gene_depth_matrix * 1.0 / (marker_coverages + (marker_coverages == 0)))

# Sample selector: the calculate_unique_samples result multiplied by the
# min_coverage threshold test. It is used below to index the second axis of
# gene_depth_matrix and the entries of marker_coverages.
prevalence_idxs = (
    parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples)
) * (marker_coverages >= min_coverage)

prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, prevalence_idxs],
    marker_coverages[prevalence_idxs])

# Independent, ascending-sorted copy; prevalences itself keeps its order.
pangenome_prevalences = numpy.sort(numpy.asarray(prevalences))

# Gene gain / loss / opportunity matrices from the coverage-based comparison;
# the difference matrix is the elementwise sum of gains and losses.
sys.stderr.write("Calculating matrix of gene differences...\n")
(gene_gain_matrix,
 gene_loss_matrix,
 gene_opportunity_matrix) = (
    gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss(
        gene_depth_matrix,
        marker_coverages,
        min_log2_fold_change=4,
        include_high_copynum=include_high_copynum))

gene_difference_matrix = gene_gain_matrix + gene_loss_matrix

# Now need to make the gene samples and snp samples match up
# Example #2
# 0
# Set of all gene names, used for the membership report below.
pangenome_genes = set(gene_names)

# Debug output: one line per marker gene, followed by whether that gene is
# among gene_names. The line is pre-formatted into a single string so it
# prints identically under the Python 2 print statement and the Python 3
# print function (print(a, b) would print a tuple on Python 2).
for marker_gene in marker_genes:
    print("%s %s" % (marker_gene, marker_gene in pangenome_genes))

# Boolean masks with one entry per element of gene_names: True where the gene
# belongs to the corresponding gene collection.
reference_gene_idxs = numpy.array(
    [gene_name in reference_genes for gene_name in gene_names])
metaphlan2_gene_idxs = numpy.array(
    [gene_name in metaphlan2_genes for gene_name in gene_names])
marker_gene_idxs = numpy.array(
    [gene_name in marker_genes for gene_name in gene_names])

# Debug output: the marker gene collection and how many gene_names matched it.
print(marker_genes)

print(marker_gene_idxs.sum())

# Sample selector: the calculate_unique_samples result multiplied by the
# min_coverage threshold test; indexes the second axis of gene_depth_matrix
# and the entries of marker_coverages.
sample_idxs = (
    parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples)
) * (marker_coverages >= min_coverage)

# NOTE(review): min_copynum=0.3 is presumably the copy-number cutoff for
# calling a gene present -- confirm in gene_diversity_utils.
prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, sample_idxs],
    marker_coverages[sample_idxs],
    min_copynum=0.3)

# Restrict the per-gene prevalences to each gene collection via the masks
# above (assumes prevalences has one entry per element of gene_names).
reference_prevalences = prevalences[reference_gene_idxs]
metaphlan2_prevalences = prevalences[metaphlan2_gene_idxs]
marker_prevalences = prevalences[marker_gene_idxs]

# Debug output: prevalences of the marker genes.
print(marker_prevalences)

# (xs, survivals) pairs from calculate_unnormalized_survival_from_vector over
# the range [0, 1], for all genes and for each gene collection.
pangenome_xs, pangenome_survivals = (
    stats_utils.calculate_unnormalized_survival_from_vector(
        prevalences, min_x=0, max_x=1))

reference_xs, reference_survivals = (
    stats_utils.calculate_unnormalized_survival_from_vector(
        reference_prevalences, min_x=0, max_x=1))

metaphlan2_xs, metaphlan2_survivals = (
    stats_utils.calculate_unnormalized_survival_from_vector(
        metaphlan2_prevalences, min_x=0, max_x=1))

marker_xs, marker_survivals = (
    stats_utils.calculate_unnormalized_survival_from_vector(
        marker_prevalences, min_x=0, max_x=1))