def interlocus_dependence(self, strain_names):
    """
    Performs a chi square test to find interval pairs whose origins are interdependent

    For every pair of distinct elementary intervals (i, j) with i < j, the observed
    counts of each (proximal origin, distal origin) combination are tested against
    the counts expected if the origins at the two intervals were independent.

    :param strain_names: list of strain names to analyze
    :return: list with one row per interval pair, laid out as
        [chi_sq, p_value, proximal start, proximal end, distal end, distal start]
    """
    combo_count_dict, intervals = self.pairwise_frequencies(strain_names)
    num_intervals = len(intervals)
    num_species = subspecies.NUM_SUBSPECIES
    num_combos = num_species ** 2

    # Stack the per-combination count matrices along a third axis; the combination
    # (proximal species i, distal species j) is stored at index i * num_species + j.
    # NOTE: uint16 storage wraps silently for any count above 65535.
    combo_counts = np.empty([num_intervals, num_intervals, num_combos], dtype=np.uint16)
    species_counts = np.zeros([num_intervals, num_species])
    for i, prox_species in enumerate(subspecies.iter_subspecies()):
        for j, dist_species in enumerate(subspecies.iter_subspecies()):
            counts = combo_count_dict[subspecies.combine(prox_species, dist_species)]
            # entry [a, a] pairs interval a with itself; accumulating the diagonal
            # over all distal species gives the per-interval tally for species i
            species_counts[:, i] += np.diag(counts)
            combo_counts[:, :, i * num_species + j] = counts

    # Expected combo frequencies from the per-interval species tallies:
    # element [a, b, i * num_species + j] = species_counts[a, i] * species_counts[b, j]
    combo_expectations = (
        species_counts[:, np.newaxis, :, np.newaxis] *
        species_counts[np.newaxis, :, np.newaxis, :]
    ).reshape(num_intervals, num_intervals, num_combos)

    # normalize expectations using the actual total frequency
    sums = np.sum(combo_counts, axis=2)
    # 0 / 0 occurs for interval pairs with no assigned origin; silence only that
    # warning, and let the context manager restore the error state even on failure
    with np.errstate(invalid='ignore'):
        combo_expectations = np.true_divide(combo_expectations, sums[:, :, np.newaxis])
    combo_expectations = np.nan_to_num(combo_expectations)

    # do chi-square test
    output = []
    for i in range(num_intervals):
        # intervals holds end coordinates, so interval i starts where interval i - 1
        # ends. The first interval starts at 0 (same convention as
        # sources_at_point_pair); indexing intervals[-1] would wrap to the last end.
        prox_start = intervals[i - 1] if i > 0 else 0
        # only upper triangle is meaningful
        for j in range(i + 1, num_intervals):
            expected = combo_expectations[i, j]
            # combos with zero expectation cannot be tested, so drop them
            testable = expected != 0
            chi_sq, p_value = stats.chisquare(combo_counts[i, j][testable], expected[testable])
            output.append([
                chi_sq,
                p_value,
                # proximal interval
                prox_start,
                intervals[i],
                # distal interval
                # NOTE(review): emitted as (end, start), the reverse of the proximal
                # pair. Looks transposed, but kept as-is so existing consumers of
                # the row layout are unaffected -- confirm against callers.
                intervals[j],
                intervals[j - 1]
            ])
    return output
def sources_at_point_pair(self, chrom1, pos1, chrom2, pos2, strain_names):
    """
    Returns the range of the 2D interval and the strains carrying each subspecific combo
    at 2 loci in the genome

    The two loci are ordered by their genome index before processing, so the first
    entry of every result refers to the locus with the smaller index, which is not
    necessarily (chrom1, pos1).

    :param chrom1: chromosome of one locus
    :param pos1: position of one locus
    :param chrom2: chromosome of another locus
    :param pos2: position of another locus
    :param strain_names: list of strain names to analyze
    :return: dict with 'Key' (subspecies labels), 'Samples' (nested lists of strain
        names indexed by the ordinals of the origins at the first and second locus)
        and 'Intervals' (for each locus, the range shared by every strain's
        containing interval, passed through self.chrom_and_pos)
    """
    loci = sorted([self.genome_index(chrom1, pos1), self.genome_index(chrom2, pos2)])

    # start from the whole genome and shrink towards the loci strain by strain
    genome_end = np.sum(self.sizes)
    lower_bounds = [0, 0]
    upper_bounds = [genome_end, genome_end]

    # one bucket of strain names per (first locus origin, second locus origin) pair
    strains_by_combo = []
    for _ in subspecies.iter_subspecies(True):
        strains_by_combo.append([[] for _ in subspecies.iter_subspecies(True)])
    labels = list(map(subspecies.to_string, subspecies.iter_subspecies(True)))

    for strain_name in strain_names:
        record = self.sample_dict[strain_name]
        ends = record[0]
        origins = record[1]

        # The loci are sorted, so the scan for the second locus resumes where the
        # scan for the first one stopped.
        cursor = 0
        hits = []
        for which, target in enumerate(loci):
            # advance to the first interval whose end reaches the locus
            while ends[cursor] < target:
                cursor += 1
            if cursor > 0:
                lower_bounds[which] = max(lower_bounds[which], ends[cursor - 1])
            upper_bounds[which] = min(upper_bounds[which], ends[cursor])
            hits.append(cursor)

        first_hit, second_hit = hits
        row = subspecies.to_ordinal(origins[first_hit])
        col = subspecies.to_ordinal(origins[second_hit])
        strains_by_combo[row][col].append(strain_name)

    return {
        'Key': labels,
        'Samples': strains_by_combo,
        'Intervals': [
            self.chrom_and_pos(low, high)
            for low, high in zip(lower_bounds, upper_bounds)
        ],
    }