def build_pairwise_matrix(self, strain_names, elem_intervals):
     """ Tally, per pair of elementary intervals, how many strains carry each combination of labels
     :param strain_names: strains to tally; each name is looked up in self.sample_dict
     :param elem_intervals: elementary interval boundaries; must be sorted, since each strain's
         boundaries are located in it with np.searchsorted
     :return: int16 array of shape ((NUM_SUBSPECIES + 1) ** 2, len(elem_intervals), len(elem_intervals)).
         The first axis is the ordinal of the combined label pair; the other two axes index
         elementary intervals. Only pairs with row interval <= column interval are counted.
     """
     num_elem = len(elem_intervals)
     num_combos = (subspecies.NUM_SUBSPECIES + 1) ** 2
     source_counts = np.zeros([num_combos, num_elem, num_elem], dtype=np.int16)
     for strain_name in strain_names:
         intervals, sources = self.sample_dict[strain_name]
         # breaks[k + 1] is the elementary index where this strain's k-th interval ends;
         # the leading -1 makes the first interval start at elementary index 0
         breaks = np.insert(np.searchsorted(elem_intervals, intervals), 0, -1)
         # run of elementary intervals covered by each of the strain's intervals
         spans = [slice(breaks[k] + 1, breaks[k + 1] + 1) for k in range(len(intervals))]
         for row, row_span in enumerate(spans):
             # start at `row` so that only the upper triangle is filled
             for col in range(row, len(spans)):
                 pair = subspecies.combine(sources[row], sources[col])
                 source_counts[subspecies.to_ordinal(pair), row_span, spans[col]] += 1
     return source_counts
 def interlocus_dependence(self, strain_names):
     """ Performs a chi square test to find interval pairs whose origins are interdependent
     :param strain_names: list of strain names to analyze
     :return: list with one row per interval pair (i, j) with i < j, laid out as
         [chi_sq, p_value, proximal start, proximal end, distal end, distal start]
     """
     # NOTE(review): the result is unpacked as (mapping from combined label to a 2-D count
     # matrix, list of interval boundaries); confirm self.pairwise_frequencies returns that.
     combo_count_dict, intervals = self.pairwise_frequencies(strain_names)
     num_intervals = len(intervals)
     num_species = subspecies.NUM_SUBSPECIES
     num_combos = num_species ** 2
     # convert source_counts to matrix combo_counts
     combo_counts = np.empty([num_intervals, num_intervals, num_combos], dtype=np.uint16)
     species_counts = np.zeros([num_intervals, num_species])
     for i, prox_species in enumerate(subspecies.iter_subspecies()):
         for j, dist_species in enumerate(subspecies.iter_subspecies()):
             counts = combo_count_dict[subspecies.combine(prox_species, dist_species)]
             # the diagonal pairs an interval with itself; accumulate it as the
             # per-interval tally for the proximal label
             species_counts[:, i] += np.diag(counts)
             combo_counts[:, :, i * num_species + j] = counts
     # compute expected combo frequencies from source frequencies
     combo_expectations = np.zeros([num_intervals, num_intervals, num_combos])
     for i in range(num_species):
         for j in range(num_species):
             combo_expectations[:, :, i * num_species + j] = \
                 np.outer(species_counts[:, i], species_counts[:, j])
     # normalize expectations using the actual total frequency
     sums = np.sum(combo_counts, axis=2)
     # Only 0/0 ("invalid") is silenced, for interval pairs with no assigned origin. The context
     # manager restores numpy's global error state even if the division raises.
     with np.errstate(invalid='ignore'):
         combo_expectations = np.true_divide(combo_expectations, sums[:, :, np.newaxis])
     combo_expectations = np.nan_to_num(combo_expectations)
     # do chi-square test
     output = []
     for i in range(num_intervals):
         # intervals[i - 1] would wrap around to the last boundary for the first interval.
         # NOTE(review): assumes coordinates start at 0 -- confirm
         prox_start = intervals[i - 1] if i > 0 else 0
         # only upper triangle is meaningful
         for j in range(i + 1, num_intervals):
             expected = combo_expectations[i, j]
             # categories with zero expectation are left out of the test
             nonzero_expectations = np.nonzero(expected)
             chi_sq, p_value = stats.chisquare(
                 combo_counts[i, j][nonzero_expectations], expected[nonzero_expectations])
             output.append([
                     chi_sq,
                     p_value,
                     # proximal interval
                     prox_start,
                     intervals[i],
                     # distal interval
                     # NOTE(review): emitted as (end, start), the reverse of the proximal pair;
                     # order kept as-is because consumers may depend on it -- confirm intent
                     intervals[j],
                     intervals[j-1]
                     ])
     return output
 def pairwise_frequencies(self, strain_names):
     """ For every strain and every pair of its intervals whose labels are both known, record the
     coordinates of the interval pair under the combination of the two labels.
     :param strain_names: list of strain names to analyze (must be a subset of the output from preprocess())
     :return: tuple (output, colors). output has NUM_SUBSPECIES**2 entries, indexed by the ordinal of
         the combined label pair; each entry is four parallel lists
         [proximal starts, proximal ends, distal starts, distal ends].
         colors is [subspecies.to_color(k, True) for k in range(NUM_SUBSPECIES**2)].
     """
     num_combos = subspecies.NUM_SUBSPECIES ** 2
     output = [[[], [], [], []] for _ in range(num_combos)]
     for strain_name in strain_names:
         intervals, sources = self.sample_dict[strain_name]
         for i in range(len(intervals)):
             if not subspecies.is_known(sources[i]):
                 continue
             # intervals[i - 1] would wrap around to the last boundary for the first interval.
             # NOTE(review): assumes coordinates start at 0 -- confirm
             prox_start = intervals[i - 1] if i > 0 else 0
             # only upper triangle is meaningful
             for j in range(i, len(intervals)):
                 if not subspecies.is_known(sources[j]):
                     continue
                 dist_start = intervals[j - 1] if j > 0 else 0
                 # NOTE(review): presumes the ordinal of two known labels is < NUM_SUBSPECIES**2
                 combo_output = output[subspecies.to_ordinal(subspecies.combine(sources[i], sources[j]))]
                 combo_output[0].append(prox_start)
                 combo_output[1].append(intervals[i])
                 combo_output[2].append(dist_start)
                 combo_output[3].append(intervals[j])
     return output, [subspecies.to_color(k, True) for k in range(num_combos)]