def build_pairwise_matrix(self, strain_names, elem_intervals):
    """
    Accumulate pairwise source counts over elementary intervals.

    Builds a 3-D matrix: the first axis is the ordinal of a combined source
    pair; each remaining 2-D slice holds counts for pairwise elementary
    interval combinations (upper triangle only).

    :param strain_names: strain names to tally (keys of self.sample_dict)
    :param elem_intervals: sorted elementary interval boundaries
    :return: int16 array of shape
        ((NUM_SUBSPECIES + 1)**2, len(elem_intervals), len(elem_intervals))
    """
    n_elem = len(elem_intervals)
    source_counts = np.zeros(
        [(subspecies.NUM_SUBSPECIES + 1) ** 2, n_elem, n_elem],
        dtype=np.int16)
    for name in strain_names:
        intervals, sources = self.sample_dict[name]
        # Map this strain's interval boundaries onto the elementary
        # intervals. The leading -1 sentinel makes breaks[k] + 1 the first
        # elementary index covered by the strain's k-th interval.
        breaks = np.insert(np.searchsorted(elem_intervals, intervals), 0, -1)
        n = len(intervals)
        for row in xrange(n):
            # hoisted: the row slice is invariant over the inner loop
            row_slice = slice(breaks[row] + 1, breaks[row + 1] + 1)
            for col in xrange(row, n):  # only upper triangle
                combined = subspecies.combine(sources[row], sources[col])
                ordinal = subspecies.to_ordinal(combined)
                source_counts[ordinal, row_slice,
                              breaks[col] + 1:breaks[col + 1] + 1] += 1
    return source_counts
def interlocus_dependence(self, strain_names):
    """
    Performs a chi square test to find interval pairs whose origins are
    interdependent.

    :param strain_names: list of strain names to analyze
    :return: list of rows, one per upper-triangle interval pair:
        [chi_sq, p_value,
         proximal interval start, proximal interval end,
         distal interval end, distal interval start]
    """
    # NOTE(review): pairwise_frequencies as defined in this file returns
    # per-combo coordinate lists keyed by ordinal plus a color list, not a
    # dict of count matrices keyed by combined species as consumed below —
    # confirm which version of pairwise_frequencies this method expects.
    combo_count_dict, intervals = self.pairwise_frequencies(strain_names)
    # convert source_counts to matrix combo_counts
    combo_counts = np.empty(
        [len(intervals), len(intervals), subspecies.NUM_SUBSPECIES ** 2],
        dtype=np.uint16)
    species_counts = np.zeros([len(intervals), subspecies.NUM_SUBSPECIES])
    for i, prox_species in enumerate(subspecies.iter_subspecies()):
        for j, dist_species in enumerate(subspecies.iter_subspecies()):
            counts = combo_count_dict[
                subspecies.combine(prox_species, dist_species)]
            # diagonal = a locus paired with itself, i.e. plain per-species
            # counts for the proximal locus
            species_counts[:, i] += np.diag(counts)
            combo_counts[:, :, i * subspecies.NUM_SUBSPECIES + j] = counts
    # compute expected combo frequencies from source frequencies
    combo_expectations = np.zeros(
        [len(intervals), len(intervals), subspecies.NUM_SUBSPECIES ** 2])
    for i in xrange(subspecies.NUM_SUBSPECIES):
        for j in xrange(subspecies.NUM_SUBSPECIES):
            combo_expectations[:, :, i * subspecies.NUM_SUBSPECIES + j] += \
                np.outer(species_counts[:, i], species_counts[:, j])
    # normalize expectations using the actual total frequency
    sums = np.sum(combo_counts, axis=2)
    # errstate restores numpy's error settings even if the division raises,
    # unlike the seterr/seterr pattern it replaces; invalid results come
    # from intervals with no assigned origin (0 / 0)
    with np.errstate(invalid='ignore'):
        for i in xrange(subspecies.NUM_SUBSPECIES ** 2):
            combo_expectations[:, :, i] = np.true_divide(
                combo_expectations[:, :, i], sums)
    combo_expectations = np.nan_to_num(combo_expectations)
    # do chi-square test
    output = []
    for i in xrange(len(intervals)):
        # only upper triangle is meaningful
        for j in xrange(i + 1, len(intervals)):
            # restrict to cells with nonzero expectation so chisquare does
            # not divide by zero
            nonzero_expectations = np.where(combo_expectations[i, j])
            chi_sq, p_value = stats.chisquare(
                combo_counts[i, j][nonzero_expectations],
                combo_expectations[i, j][nonzero_expectations])
            # NOTE(review): intervals[i-1] wraps to the last interval when
            # i == 0 — confirm the proximal-start lookup is intended
            output.append([
                chi_sq, p_value,
                # proximal interval
                intervals[i - 1], intervals[i],
                # distal interval
                intervals[j], intervals[j - 1]
            ])
    return output
def pairwise_frequencies(self, strain_names):
    """
    For every locus pair and every label pair, count the number of strains
    which have those labels at those pairs of loci.

    :param strain_names: list of strain names to analyze (must be a subset
        of the output from preprocess())
    :return: tuple of (per-combo coordinate lists — one
        [prox_starts, prox_ends, dist_starts, dist_ends] quadruple per
        label-pair ordinal — and the list of combo colors)
    """
    num_combos = subspecies.NUM_SUBSPECIES ** 2
    output = [[[], [], [], []] for _ in xrange(num_combos)]
    for name in strain_names:
        intervals, sources = self.sample_dict[name]
        count = len(intervals)
        for i in xrange(count):
            # skip loci whose origin was never assigned
            if not subspecies.is_known(sources[i]):
                continue
            # only upper triangle is meaningful
            for j in xrange(i, count):
                if not subspecies.is_known(sources[j]):
                    continue
                ordinal = subspecies.to_ordinal(
                    subspecies.combine(sources[i], sources[j]))
                bucket = output[ordinal]
                # NOTE(review): for i == 0 (or j == 0) the [k-1] lookup
                # wraps to the last interval — confirm this is intended
                bucket[0].append(intervals[i - 1])
                bucket[1].append(intervals[i])
                bucket[2].append(intervals[j - 1])
                bucket[3].append(intervals[j])
    colors = [subspecies.to_color(k, True) for k in xrange(num_combos)]
    return output, colors