# Convert raw gene depths into copy numbers relative to marker coverage.
# Wherever marker_coverages equals 0 the boolean term adds 1 to the
# denominator, so the division never divides by zero; multiplying the
# numerator by 1.0 promotes it to floating point before dividing.
# NOTE(review): presumably marker_coverages is per-sample and broadcasts
# against the sample axis of gene_depth_matrix -- confirm shapes upstream.
marker_coverage_denominators = marker_coverages + (marker_coverages == 0)
gene_copynum_matrix = gene_depth_matrix * 1.0 / marker_coverage_denominators

# Sample names are handled as a plain list from here on.
gene_samples = gene_samples.tolist()

# Gene names, by contrast, are wrapped in a numpy array.
gene_names = numpy.array(gene_names)

# Indexes for the different kinds of subject pairs.  The pairs are computed
# over desired_samples, which here is simply the gene sample list itself.
desired_samples = gene_samples

(desired_same_sample_idxs,
 desired_same_subject_idxs,
 desired_diff_subject_idxs) = parse_midas_data.calculate_ordered_subject_pairs(
    sample_order_map, desired_samples)

# Index maps relating desired_samples to the SNP and the gene sample lists.
# NOTE(review): the exact map semantics are defined in parse_midas_data and
# are not visible from this fragment.
snp_sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    desired_samples, snp_samples)
gene_sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    desired_samples, gene_samples)

# Pass the same-subject pair indexes through each index map.
same_subject_snp_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    snp_sample_idx_map, desired_same_subject_idxs)
same_subject_gene_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    gene_sample_idx_map, desired_same_subject_idxs)

# Do the same for the different-subject pair indexes.
diff_subject_snp_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    snp_sample_idx_map, desired_diff_subject_idxs)
diff_subject_gene_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    gene_sample_idx_map, desired_diff_subject_idxs)

# Accumulator for the idxs of genes that change between hosts.
between_host_gene_idxs = []
# Report how many reference genes are available (progress goes to stderr).
sys.stderr.write("Done! (%d genes)\n" % len(reference_genes))

# Debug output: the first ten reference genes and the first ten gene names.
# A single parenthesised argument prints the same under Python 2's print
# statement and under the print() function.
print(reference_genes[:10])
print(gene_names[:10])

# Calculate matrix of number of genes that differ
sys.stderr.write("Calculate gene hamming matrix...\n")

# Either: for all genes in pan-genome
(gene_hamming_matrix,
 num_opportunities) = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
    gene_depth_matrix, marker_coverages, min_log2_fold_change=4)

# Or: just the subset from the MIDAS reference genome
#gene_hamming_matrix = diversity_utils.calculate_coverage_based_gene_hamming_matrix(gene_depth_matrix[reference_gene_idxs,:], marker_coverages, min_log2_fold_change=4)

# Index map relating high_coverage_samples to the full samples list.
# NOTE(review): the exact map semantics are defined in parse_midas_data and
# are not visible from this fragment.
sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    high_coverage_samples, samples)

# Calculate which pairs of idxs belong to the same sample, which to the
# same subject, and which to different subjects.
(high_coverage_same_sample_idxs,
 high_coverage_same_subject_idxs,
 high_coverage_diff_subject_idxs) = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, high_coverage_samples)

# Pass each group of pair indexes (computed over high_coverage_samples)
# through sample_idx_map.
same_sample_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_same_sample_idxs)
same_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_same_subject_idxs)
diff_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_diff_subject_idxs)

# Pull out the hamming values for same-subject pairs, then sort them in
# place (ascending).
hamming_timepoints = gene_hamming_matrix[same_subject_idxs]
hamming_timepoints.sort()