def main():
    """ Ad-hoc driver used while developing TwoLocus; this is scratch code, not a test suite.

    As written, only the first stanza executes: a TwoLocus is built from the hard-coded ``in_path``, the number
    of available strains is printed, and the process exits.  Everything below that first exit is unreachable
    and is kept only as a record of earlier experiments (a contingency table between two strain lists, then two
    runs against the small ``test2.csv`` / ``test.csv`` files with chromosome sizes overridden to 20 Mb each).
    """
    tl = TwoLocus(in_path='/csbiodata/public/www.csbio.unc.edu/htdocs/sgreens/pairwise_origins/')
    # tl = TwoLocus()
    # tl.preprocess(glob.glob('OR_ss_origins/*.hap'))
    print(len(tl.list_available_strains()))
    # sys.exit() instead of the bare exit(): the latter is injected by the `site` module for interactive use and
    # is not guaranteed to exist; the rest of this function already uses sys.exit.
    sys.exit()

    # ---- unreachable from here on (see the exit above) ----
    # print len(tl.list_available_strains())
    # tl.preprocess(['cc_origins.csv'])
    # tl.preprocess(['ccv_origins.csv'])
    # NOTE(review): most strain names in the two lists below are commented out.  The physical line breaks of
    # these literals were reconstructed, so confirm which entries are meant to be active before relying on them.
    classical = [s for s in
                 ["129P1/ReJ",  # "129P3/J", "129S1SvlmJ", "129S6", "129T2/SvEmsJ", "129X1/SvJ", "A/J", "A/WySnJ",
                  "AEJ/GnLeJ", "AEJ/GnRk", "AKR/J", "ALR/LtJ", "ALS/LtJ", "BALB/cByJ", "BALB/cJ", "BDP/J", "BPH/2J",
                  # "BPL/1J", "BPN/3J", "BTBR T<+>tf/J", "BUB/BnJ", "BXSB/MpJ", "C3H/HeJ", "C3HeB/FeJ", "C57BL/10J",
                  # "C57BL/10ScNJ", "C57BL/10SAAAJ", "C57BL/6CR", "C57BL/6J", "C57BL/6NCI", "C57BL/6Tc", "C57BLKS/J",
                  # "C57BR/cdJ", "C57L/J", "C58/J", "CBA/CaJ", "CBA/J", "CE/J", "CHMU/LeJ", "DBA/1J", "DBA/1LacJ",
                  # "DBA/2DeJ", "DBA/2HaSmnJ", "DBA/2J", "DDK/Pas", "DDY/JclSidSeyFrkJ", "DLS/LeJ", "EL/SuzSeyFrkJ",
                  # "FVB/NJ", "HPG/BmJ", "I/LnJ", "IBWSP2", "IBWSR2", "ICOLD2", "IHOT1", "IHOT2", "ILS", "ISS", "JE/LeJ",
                  # "KK/HlJ", "LG/J", "LP/J", "LT/SvEiJ", "MRL/MpJ", "NOD/ShiLtJ", "NON/ShiLtJ", "NONcNZO10/LtJ",
                  # "NONcNZO5/LtJ", "NOR/LtJ", "NU/J", "NZB/BlNJ", "NZL/LtJ", "NZM2410/J", "NZO/HlLtJ", "NZW/LacJ", "P/J",
                  # "PL/J", "PN/nBSwUmabJ", "RF/J", "RHJ/LeJ", "RIIIS/J", "RSV/LeJ", "SB/LeJ", "SEA/GnJ", "SEC/1GnLeJ",
                  # "SEC/1ReJ", "SH1/LeJ", "SI/Col Tyrp1 Dnahc11/J", "SJL/Bm", "SJL/J", "SM/J", "SSL/LeJ", "ST/bJ", "STX/Le",
                  ]
                 # "SWR/J", "TALLYHO/JngJ", "TKDU/DnJ", "TSJ/LeJ", "YBR/EiJ", "ZRDCT Rax<+>ChUmdJ"]
                 if tl.is_available(s)]
    wild_derived = [s for s in
                    ['22MO',  # 'BIK/g', 'BULS', 'BUSNA', 'BZO', 'CALB/RkJ', 'CASA/RkJ', 'CAST/EiJ', 'CIM', 'CKN', 'CKS',
                     'CZECHI/EiJ', 'CZECHII/EiJ', 'DCA', 'DCP', 'DDO', 'DEB', 'DGA', 'DIK', 'DJO', 'DKN', 'DMZ', 'DOT',
                     # 'IS/CamRkJ', 'JF1/Ms', 'LEWES/EiJ', 'MBK', 'MBS', 'MCZ', 'MDG', 'MDGI', 'MDH', 'MGA', 'MH',
                     # 'MOLD/RkJ', 'MOLF/EiJ', 'MOLG/DnJ', 'MOR/RkJ', 'MPB', 'MSM/Ms', 'PERA/EiJ', 'PERC/EiJ', 'POHN/Deh',
                     # 'PWD/PhJ', 'PWK/PhJ', 'RBA/DnJ', 'RBB/DnJ', 'RBF/DnJ', 'SF/CamEiJ', 'SKIVE/EiJ', 'SOD1/EiJ',
                     # 'STLT', 'STRA', 'STRB', 'STUF', 'STUP', 'STUS', 'TIRANO/EiJ', 'WLA', 'WMP', 'WSB/EiJ', 'ZALENDE/EiJ'
                     # The closing bracket must stay outside the commented-out rows, otherwise the list is never
                     # closed and the module fails to compile.
                     ]
                    if tl.is_available(s)]
    tl.contingency_table(classical, wild_derived, '/csbiohome01/sgreens/Projects/intervals/contingency.csv')
    sys.exit()

    # Small synthetic data set: two chromosomes of 20 Mb each.
    x = TwoLocus(chrom_sizes=[20e6, 20e6])
    x.preprocess(["test2.csv"])
    x.unique_combos(['A', 'B', 'D'], ['C', 'E'])
    x.sources_at_point_pair('1', 1, '1', 10000000, ['A'])
    # x.interlocus_dependence([chr(c) for c in xrange(ord('A'), ord('J')+1)])
    # exit()

    x = TwoLocus(chrom_sizes=[20 * 10 ** 6, 20 * 10 ** 6])
    x.preprocess(["test.csv"])
    rez = x.pairwise_frequencies(["A"])
    areas = x.calculate_genomic_area(rez[0], rez[1])
    total = 0.0
    # Print one row per subspecies combination, then the sum of all rows.
    for combo in subspecies.iter_combos():
        print("\t{:15s}({:4d}):{:1.5f}".format(subspecies.to_string(combo), combo,
                                                areas[str(subspecies.to_string(combo))]))
        total += areas[str(subspecies.to_string(combo))]
    print("\t{:21s}:{:1.5f}".format("Total", total))
    sys.exit(1)
def calculate_genomic_area(self, counts, intervals):
    """ Compute the share of the total genomic 'area' occupied by each combination of subspecies.

    The genome is treated as a square grid whose rows and columns are the elementary intervals.  The area of a
    cell is the product of its two interval widths, with coordinates divided by 1e6 (presumably bp -> Mb).  For
    each entry of ``counts``, the areas of all cells holding a positive count are summed and divided by the
    squared sum of ``self.sizes`` (scaled by the same 1e6).

    :param counts: iterable of incidence matrices, one per subspecies combo; the i-th matrix is reported under
        the label ``subspecies.to_string(i, True)``.  Each matrix must support ``> 0`` and multiply element-wise
        with the (n x n) area grid, where n = len(intervals).
    :param intervals: boundary coordinates of the 'elementary intervals' over which the counts were computed
        (any sequence or array; presumably ascending).  The first interval is taken to start at 0.
    :return: OrderedDict mapping ``str(label)`` to the fraction of genomic area for that combo, in the order
        the matrices appear in ``counts``
    """
    # Difference the boundaries in float64 *before* narrowing to float32.  Casting the absolute coordinates to
    # float32 first (24-bit mantissa) quantises positions beyond ~16.7e6, so narrow intervals far along the
    # genome would get a wrong, possibly zero, width.
    # np.concatenate also accepts arrays and tuples, where `[0] + intervals` would add element-wise or raise.
    bounds = np.concatenate(([0.0], np.asarray(intervals, dtype=np.float64)))
    widths = (np.diff(bounds) / 1.0e6).astype(np.float32)

    # area of each cell in the interval grid: areas[r, c] = widths[r] * widths[c] (symmetric by construction)
    areas = np.outer(widths, widths)

    denom = np.sum(np.array(self.sizes) / 1.0e6) ** 2
    areas_masked = OrderedDict()
    for combo, vals in enumerate(counts):
        # (vals > 0) masks the grid down to the cells where this combo was observed at least once
        areas_masked[str(subspecies.to_string(combo, True))] = np.sum((vals > 0) * areas) / denom
    return areas_masked
def sources_at_point_pair(self, chrom1, pos1, chrom2, pos2, strain_names):
    """ Report which strains carry which pair of subspecific origins at two loci, and the shared 2D interval.

    Both loci are converted with ``self.genome_index`` and then sorted, so "first" and "second" below refer to
    the smaller and larger converted coordinate, not necessarily to (chrom1, pos1) and (chrom2, pos2).

    :param chrom1: chromosome of one locus
    :param pos1: position of one locus
    :param chrom2: chromosome of another locus
    :param pos2: position of another locus
    :param strain_names: list of strain names to analyze (each must be a key of ``self.sample_dict``)
    :return: dict with three entries:
        'Key' - labels from ``subspecies.to_string`` for each item of ``subspecies.iter_subspecies(True)``;
        'Samples' - square nested list; ``Samples[a][b]`` holds the names of strains whose origin ordinal is
        ``a`` at the first locus and ``b`` at the second;
        'Intervals' - two ``self.chrom_and_pos(lo, hi)`` results, the tightest range around each locus that is
        common to all analyzed strains
    """
    loci = sorted([self.genome_index(chrom1, pos1), self.genome_index(chrom2, pos2)])

    # Running intersection of the per-strain intervals around each locus; starts as the whole genome.
    lower = [0, 0]
    genome_end = np.sum(self.sizes)
    upper = [genome_end, genome_end]

    # One (initially empty) bucket of strain names per ordered pair of subspecies.
    bins = []
    for _ in subspecies.iter_subspecies(True):
        bins.append([[] for _ in subspecies.iter_subspecies(True)])
    labels = [subspecies.to_string(s) for s in subspecies.iter_subspecies(True)]

    for strain_name in strain_names:
        boundaries = self.sample_dict[strain_name][0]
        origins = self.sample_dict[strain_name][1]

        # Linear scan for the first boundary that is >= each locus.  The cursor is deliberately not reset
        # between the two loci: they are sorted, so the second search resumes where the first stopped.
        # A locus beyond the last boundary runs off the end of `boundaries`.
        cursor = 0
        hits = []
        for which, target in enumerate(loci):
            while boundaries[cursor] < target:
                cursor += 1
            if cursor > 0:
                lower[which] = max(lower[which], boundaries[cursor - 1])
            upper[which] = min(upper[which], boundaries[cursor])
            hits.append(cursor)

        row = bins[subspecies.to_ordinal(origins[hits[0]])]
        row[subspecies.to_ordinal(origins[hits[1]])].append(strain_name)

    return {
        'Key': labels,
        'Samples': bins,
        'Intervals': [
            self.chrom_and_pos(lower[0], upper[0]),
            self.chrom_and_pos(lower[1], upper[1]),
        ],
    }