def setUp(self):
    """Populate ``self.epo_records`` with (target, query) EPOitem pairs.

    For every ``(t, q)`` entry of ``cigar_pairs`` a "homo_sapiens" line and
    a "mus_musculus" line are produced with ``toCigar`` (both tagged with
    the entry's index) and parsed with ``EPOitem._strfactory``.  A pair is
    stored only when both parsed items are truthy.
    """
    self.epo_records = []
    for idx, (t_part, q_part) in enumerate(cigar_pairs):
        # Build both text lines first, then parse them, so the helper
        # calls happen in the same order for every pair.
        target_line = toCigar("homo_sapiens", idx, t_part)
        query_line = toCigar("mus_musculus", idx, q_part)
        target_item = EPOitem._strfactory(target_line)
        query_item = EPOitem._strfactory(query_line)
        if not (target_item and query_item):
            continue
        self.epo_records.append((target_item, query_item))
def test_rem_dash(self):
    """Check how ``Chain._make_from_epo`` handles columns that are gaps in both rows.

    Two randomized scenarios are run 100 times each.  The first places a
    shared run of dash columns in the middle of the alignment, the second
    at its start.  Failures are reported through assertion messages; the
    test never drops into an interactive debugger.
    """
    # ****--****-------****  4M2D4M7D4M
    # *******-------*******  7M7D7M
    # has 4 dash columns and should become
    # ****--****---****      4M2D4M3D4M
    # *******---*******      7M3D7M
    for _ in range(100):
        dash_cols = random.randint(0, 10)
        tStart = random.randint(0, 1000)
        qStart = random.randint(0, 1000)
        epo_pair = (
            EPOitem._strfactory(
                "homo_sapiens\t0\t1\t%d\t%d\t1\t%s"
                % (tStart, tStart+12-1, "4M2D4M%dD4M" % (dash_cols+3))),
            EPOitem._strfactory(
                "mus_musculus\t0\t1\t%d\t%d\t1\t%s"
                % (qStart, qStart+14-1, "7M%dD7M" % (dash_cols+3))))
        chain = Chain._make_from_epo(
            epo_pair[0], epo_pair[1], {"chr1": 500}, {"chr1": 800})
        ti = epo_pair[0].intervals(False)
        qi = epo_pair[1].intervals(False)
        # Both rows must agree with the chain once the shared dash columns
        # have been discounted from the raw interval gaps.
        assert ti[2][0] - ti[1][1] - dash_cols == chain[2][1]
        assert qi[1][0] - qi[0][1] - dash_cols == chain[2][1]

    # ----*****
    # *-------*
    # has 3 dash cols and should become
    # *
    # *
    # with the qStart += 1 and tStart += 4
    for _ in range(100):
        dash_cols = random.randint(0, 10)
        tm = random.randint(6, 10)
        qm = random.randint(1, 5)
        tStart = random.randint(0, 1000)
        qStart = random.randint(0, 1000)
        epo_pair = (
            EPOitem._strfactory(
                "homo_sapiens\t0\t1\t%d\t%d\t1\t%s"
                % (tStart, tStart+tm-1, "%dD%dM" % (dash_cols+1, tm))),
            EPOitem._strfactory(
                "mus_musculus\t0\t1\t%d\t%d\t1\t%s"
                % (qStart, qStart+qm+1-1, "M%dD%dM" % (dash_cols+tm-qm, qm))))
        chain = Chain._make_from_epo(
            epo_pair[0], epo_pair[1], {"chr1": 500}, {"chr1": 800})
        # A failure message replaces the former pdb breakpoint so the test
        # fails cleanly in non-interactive runs.
        assert chain[1][-1] == qm, "%r != %r" % (chain[1][-1], qm)
        # correct also for coordinate interpretation differences between UCSC and EPO
        expected_q_start = (qStart + 1) - 1
        # Report the value that is actually compared.
        assert expected_q_start == chain[0].qStart, \
            "%d != %d" % (expected_q_start, chain[0].qStart)
# Command-line driver: parse the options, load the chromosome sizes and the
# EPO alignments, then hand the target/query components of every parsed
# entry to convert_action().
parser = argparse.ArgumentParser(
    description="""EPO alignments (.out) to .chain converter.""",
    epilog="Olgert Denas (Taylor Lab)",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("input", help="File to process.")
parser.add_argument(
    "--species", nargs=2, default=["homo_sapiens", "mus_musculus"],
    help="Names of target and query species (respectively) in the alignment.")
parser.add_argument(
    "--chrsizes", nargs=2, required=True,
    help="Chromosome sizes for the given species.")
parser.add_argument(
    "-o", '--output', metavar="FILE", default='stdout', type=outFile,
    help="Output file")
opt = parser.parse_args()

log.info("loading sizes ...")
tsizes = loadChrSizes(opt.chrsizes[0])
qsizes = loadChrSizes(opt.chrsizes[1])

log.info("loading alignments ...")
data = EPOitem._parse_epo(opt.input)

log.info("dumping ...")
# The species names do not change while iterating, so look them up once.
trg_species, qr_species = opt.species
for k in data:
    components = data[k]
    # Build real lists rather than filter() objects: on Python 3 filter()
    # returns a one-shot iterator, which cannot be measured, indexed or
    # traversed a second time by the consumer.
    trg_comp = [c for c in components if c.species == trg_species]
    qr_comp = [c for c in components if c.species == qr_species]
    convert_action(trg_comp, qr_comp, tsizes, qsizes, opt)
# NOTE(review): this fragment starts inside a call whose opening lies outside
# the visible text; the keyword arguments below close that call.
default=["homo_sapiens", "mus_musculus"], help= "Names of target and query species (respectively) in the alignment.")
# Two values are required for --chrsizes; index 0 is used for the target
# sizes and index 1 for the query sizes further down.
parser.add_argument("--chrsizes", nargs=2, required=True, help="Chromosome sizes for the given species.")
parser.add_argument("-o", '--output', metavar="FILE", default='stdout', type=outFile, help="Output file")
opt = parser.parse_args()

log.info("loading sizes ...")
tsizes = loadChrSizes(opt.chrsizes[0])
qsizes = loadChrSizes(opt.chrsizes[1])

log.info("loading alignments ...")
data = EPOitem._parse_epo(opt.input)

log.info("dumping ...")
# For every parsed entry, split its components by species name and pass the
# target and query groups to convert_action together with both size tables.
for k in data:
    components = data[k]
    # NOTE(review): on Python 3 filter() returns a lazy, single-pass
    # iterator rather than a list -- confirm convert_action only iterates
    # each argument once.
    trg_comp = filter(lambda c: c.species == opt.species[0], components)
    qr_comp = filter(lambda c: c.species == opt.species[1], components)
    convert_action(trg_comp, qr_comp, tsizes, qsizes, opt)
# NOTE(review): this fragment starts inside a call whose opening lies outside
# the visible text; the keyword arguments below close that call.
default=["homo_sapiens", "mus_musculus"], help= "Names of target and query species (respectively) in the alignment.")
# Two values are required for --chrsizes; index 0 is used for the target
# sizes and index 1 for the query sizes further down.
parser.add_argument("--chrsizes", nargs=2, required=True, help="Chromosome sizes for the given species.")
parser.add_argument("-o", '--output', metavar="FILE", default='stdout', type=outFile, help="Output file")
opt = parser.parse_args()

log.info("loading sizes ...")
tsizes = loadChrSizes(opt.chrsizes[0])
qsizes = loadChrSizes(opt.chrsizes[1])

log.info("loading alignments ...")
# Sorting the parsed items before wrapping them in an OrderedDict makes the
# loop below visit the entries in sorted key order.
data = OrderedDict(sorted(EPOitem._parse_epo(opt.input).items()))

log.info("dumping ...")
# For every parsed entry, split its components by species name and pass the
# target and query groups to convert_action together with both size tables.
for k in data:
    components = data[k]
    # Concrete lists (not lazy iterators) are handed to convert_action.
    trg_comp = [c for c in components if c.species == opt.species[0]]
    qr_comp = [c for c in components if c.species == opt.species[1]]
    convert_action(trg_comp, qr_comp, tsizes, qsizes, opt)
# Closing statement of the preceding definition (its beginning is outside the
# visible text); reproduced unchanged.
log.warning("skipping chromosome/contig (%s, %s)" % (a.chrom, b.chrom))


if __name__ == '__main__':
    # Script entry point: read the command line, load both chromosome-size
    # tables and the EPO alignments, then convert every parsed entry.
    parser = argparse.ArgumentParser(
        description="EPO alignments (.out) to .chain converter.",
        epilog="Olgert Denas (Taylor Lab)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("input", help="File to process.")
    parser.add_argument(
        "--species",
        nargs=2,
        default=["homo_sapiens", "mus_musculus"],
        help="Names of target and query species (respectively) in the alignment.",
    )
    parser.add_argument(
        "--chrsizes",
        nargs=2,
        required=True,
        help="Chromosome sizes for the given species.",
    )
    parser.add_argument(
        "-o", "--output",
        metavar="FILE",
        default="stdout",
        type=outFile,
        help="Output file",
    )
    opt = parser.parse_args()

    log.info("loading sizes ...")
    tsizes = loadChrSizes(opt.chrsizes[0])
    qsizes = loadChrSizes(opt.chrsizes[1])

    log.info("loading alignments ...")
    parsed_items = EPOitem._parse_epo(opt.input).items()
    # Sorted items wrapped in an OrderedDict give a sorted visiting order.
    data = OrderedDict(sorted(parsed_items))

    log.info("dumping ...")
    for components in data.values():
        trg_comp = [comp for comp in components
                    if comp.species == opt.species[0]]
        qr_comp = [comp for comp in components
                   if comp.species == opt.species[1]]
        convert_action(trg_comp, qr_comp, tsizes, qsizes, opt)