def test_compare_trinity(self): # Create the list of files files = [ "trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3", "trinity.match_matchpart.gff3" ] files = [ pkg_resources.resource_filename("Mikado.tests", filename) for filename in files ] namespace = Namespace(default=False) namespace.distance = 2000 namespace.no_save_index = True for ref, pred in itertools.permutations(files, 2): with self.subTest(ref=ref, pred=pred): namespace.reference = to_gff(ref) namespace.prediction = to_gff(pred) namespace.log = os.path.join( tempfile.gettempdir(), "compare_{}_{}.log".format(files.index(ref), files.index(pred))) namespace.out = os.path.join( tempfile.gettempdir(), "compare_{}_{}".format(files.index(ref), files.index(pred))) compare(namespace) refmap = "{}.refmap".format(namespace.out) tmap = "{}.tmap".format(namespace.out) stats = "{}.stats".format(namespace.out) self.assertTrue(os.path.exists(namespace.log)) # with open(log) as log_handle: # log = [_.rstrip() for _ in log_handle] for fname in [refmap, stats, tmap]: self.assertTrue(os.path.exists(fname)) self.assertGreater(os.stat(fname).st_size, 0) with open(refmap) as _: reader = csv.DictReader(_, delimiter="\t") counter = 0 for counter, line in enumerate(reader, start=1): ccode = line["ccode"] self.assertIn(ccode, ("_", "=", "f,_", "f,="), (ref, pred, line)) self.assertEqual(counter, 38) for suff in ["log", "refmap", "tmap", "stats"]: [ os.remove(_) for _ in glob.glob( os.path.join(tempfile.gettempdir(), "compare_*.{}".format( suff))) ]
def test_compare_trinity(self): # Create the list of files files = ["trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3", "trinity.match_matchpart.gff3"] files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files] namespace = Namespace(default=False) namespace.distance = 2000 namespace.no_save_index = True for ref, pred in itertools.permutations(files, 2): with self.subTest(ref=ref, pred=pred): namespace.reference = to_gff(ref) namespace.prediction = to_gff(pred) namespace.log = os.path.join(tempfile.gettempdir(), "compare_{}_{}.log".format( files.index(ref), files.index(pred))) namespace.out = os.path.join(tempfile.gettempdir(), "compare_{}_{}".format( files.index(ref), files.index(pred))) compare(namespace) refmap = "{}.refmap".format(namespace.out) tmap = "{}.tmap".format(namespace.out) stats = "{}.stats".format(namespace.out) self.assertTrue(os.path.exists(namespace.log)) # with open(log) as log_handle: # log = [_.rstrip() for _ in log_handle] for fname in [refmap, stats, tmap]: self.assertTrue(os.path.exists(fname)) self.assertGreater(os.stat(fname).st_size, 0) with open(refmap) as _: reader = csv.DictReader(_, delimiter="\t") counter = 0 for counter, line in enumerate(reader, start=1): ccode = line["ccode"] self.assertIn(ccode, ("_", "=", "f,_", "f,="), (ref, pred, line)) self.assertEqual(counter, 38) for suff in ["log", "refmap", "tmap", "stats"]: [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), "compare_*.{}".format(suff)))]
def test_multi_proc(self):
    """Run the picker with two processes and check that the loci GFF3 contains transcripts, mRNAs and CDS features."""
    json_conf = configurator.to_json(None)
    json_conf["pick"]["run_options"]["procs"] = 2
    json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename(
        "Mikado.tests", "mikado_prepared.gtf")
    json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir()
    json_conf["pick"]["files"]["loci_out"] = "mikado.multiproc.loci.gff3"
    json_conf["pick"]["files"]["subloci_out"] = "mikado.multiproc.subloci.gff3"
    json_conf["pick"]["files"]["monoloci_out"] = "mikado.multiproc.monoloci.gff3"
    json_conf["pick"]["files"]["log"] = "mikado.multiproc.log"
    json_conf["db_settings"]["db"] = pkg_resources.resource_filename("Mikado.tests", "mikado.db")
    json_conf["log_settings"]["log_level"] = "WARNING"

    pick_caller = picker.Picker(json_conf=json_conf)
    with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
        pick_caller()
    self.assertTrue(os.path.exists(
        os.path.join(tempfile.gettempdir(), "mikado.multiproc.loci.gff3")))
    with to_gff(os.path.join(tempfile.gettempdir(), "mikado.multiproc.loci.gff3")) as inp_gff:
        lines = [_ for _ in inp_gff if _.header is not True]
        self.assertGreater(len(lines), 0)
        self.assertGreater(len([_ for _ in lines if _.is_transcript is True]), 0)
        self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0)
        self.assertGreater(len([_ for _ in lines if _.feature == "CDS"]), 0)

    [os.remove(_) for _ in glob.glob(
        os.path.join(tempfile.gettempdir(), "mikado.multiproc.") + "*")]
def test_stat(self): files = ["trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3", "trinity.match_matchpart.gff3"] files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files] std_lines = [] with pkg_resources.resource_stream("Mikado.tests", "trinity_stats.txt") as t_stats: for line in t_stats: std_lines.append(line.decode().rstrip()) namespace = Namespace(default=False) namespace.tab_stats = None for filename in files: with self.subTest(filename=filename): namespace.gff = to_gff(filename) with open(os.path.join(tempfile.gettempdir(), "{}.txt".format(os.path.basename(filename))), "w") as out: namespace.out = out Calculator(namespace)() self.assertGreater(os.stat(out.name).st_size, 0) with open(out.name) as out_handle: lines = [_.rstrip() for _ in out_handle] self.assertEqual(std_lines, lines) os.remove(out.name)
def test_index(self): # Create the list of files files = ["trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3", "trinity.match_matchpart.gff3"] # files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files] namespace = Namespace(default=False) namespace.distance = 2000 namespace.index = True namespace.prediction = None namespace.log = os.path.join(tempfile.gettempdir(), "index.log") logger = create_null_logger("null") for ref in files: with self.subTest(ref=ref): temp_ref = os.path.join(tempfile.gettempdir(), ref) with pkg_resources.resource_stream("Mikado.tests", ref) as ref_handle,\ open(temp_ref, "wb") as out_handle: out_handle.write(ref_handle.read()) namespace.reference = to_gff(temp_ref) compare(namespace) self.assertTrue(os.path.exists(namespace.log)) self.assertTrue(os.path.exists("{}.midx".format(namespace.reference.name))) self.assertGreater(os.stat("{}.midx".format(namespace.reference.name)).st_size, 0) genes, positions = load_index(namespace, logger) self.assertIsInstance(genes, dict) self.assertIsInstance(positions, dict) self.assertEqual(len(genes), 38) os.remove(namespace.reference.name) os.remove(namespace.log) os.remove("{}.midx".format(namespace.reference.name))
def test_stat(self): files = [ "trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3", "trinity.match_matchpart.gff3" ] files = [ pkg_resources.resource_filename("Mikado.tests", filename) for filename in files ] std_lines = [] with pkg_resources.resource_stream("Mikado.tests", "trinity_stats.txt") as t_stats: for line in t_stats: std_lines.append(line.decode().rstrip()) namespace = Namespace(default=False) namespace.tab_stats = None for filename in files: with self.subTest(filename=filename): namespace.gff = to_gff(filename) with open( os.path.join( tempfile.gettempdir(), "{}.txt".format(os.path.basename(filename))), "w") as out: namespace.out = out Calculator(namespace)() self.assertGreater(os.stat(out.name).st_size, 0) with open(out.name) as out_handle: lines = [_.rstrip() for _ in out_handle] self.assertEqual(std_lines, lines) os.remove(out.name)
def test_subprocess(self):
    """Invoke mikado pick through its console entry point, with one and two processes, and check the loci output."""
    json_conf = configurator.to_json(None)
    json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename(
        "Mikado.tests", "mikado_prepared.gtf")
    json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir()
    json_conf["pick"]["files"]["loci_out"] = "mikado.subproc.loci.gff3"
    json_conf["pick"]["files"]["subloci_out"] = "mikado.subproc.subloci.gff3"
    json_conf["pick"]["files"]["monoloci_out"] = "mikado.subproc.monoloci.gff3"
    json_conf["pick"]["files"]["log"] = "mikado.subproc.log"
    json_conf["db_settings"]["db"] = pkg_resources.resource_filename("Mikado.tests", "mikado.db")
    json_conf["log_settings"]["log_level"] = "WARNING"

    for num in (1, 2):
        with self.subTest(num=num):
            json_conf["pick"]["run_options"]["procs"] = num
            json_conf["pick"]["run_options"]["single_thread"] = (num == 1)
            json_file = os.path.join(tempfile.gettempdir(), "mikado.yaml")
            with open(json_file, "wt") as json_handle:
                Mikado.subprograms.configure.print_config(
                    yaml.dump(json_conf, default_flow_style=False), json_handle)

            sys.argv = ["mikado", "pick", "--json-conf", json_file]
            with self.assertRaises(SystemExit):
                pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")()

            self.assertTrue(os.path.exists(
                os.path.join(tempfile.gettempdir(), "mikado.subproc.loci.gff3")))
            with to_gff(os.path.join(tempfile.gettempdir(),
                                     "mikado.subproc.loci.gff3")) as inp_gff:
                lines = [_ for _ in inp_gff if _.header is not True]
                self.assertGreater(len(lines), 0)
                self.assertGreater(len([_ for _ in lines if _.is_transcript is True]), 0)
                self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0)
                self.assertGreater(len([_ for _ in lines if _.feature == "CDS"]), 0)

            [os.remove(_) for _ in glob.glob(
                os.path.join(tempfile.gettempdir(), "mikado.subproc.") + "*")]
def test_single_proc(self):
    """Run the picker with a single process and check that the loci GFF3 contains transcripts, mRNAs and CDS features."""
    json_conf = configurator.to_json(None)
    json_conf["pick"]["run_options"]["procs"] = 1
    json_conf["db_settings"]["db"] = pkg_resources.resource_filename("Mikado.tests", "mikado.db")
    json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename(
        "Mikado.tests", "mikado_prepared.gtf")
    json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir()
    json_conf["pick"]["files"]["loci_out"] = "mikado.monoproc.loci.gff3"
    json_conf["pick"]["files"]["subloci_out"] = "mikado.monoproc.subloci.gff3"
    json_conf["pick"]["files"]["monoloci_out"] = "mikado.monoproc.monoloci.gff3"
    json_conf["pick"]["files"]["log"] = "mikado.monoproc.log"
    json_conf["log_settings"]["log_level"] = "WARNING"

    pick_caller = picker.Picker(json_conf=json_conf)
    with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
        pick_caller()
    self.assertTrue(os.path.exists(
        os.path.join(tempfile.gettempdir(), "mikado.monoproc.loci.gff3")))
    with to_gff(os.path.join(tempfile.gettempdir(), "mikado.monoproc.loci.gff3")) as inp_gff:
        lines = [_ for _ in inp_gff if _.header is not True]
        self.assertGreater(len(lines), 0)
        self.assertGreater(len([_ for _ in lines if _.is_transcript is True]), 0)
        self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0)
        self.assertGreater(len([_ for _ in lines if _.feature == "CDS"]), 0)

    [os.remove(_) for _ in glob.glob(
        os.path.join(tempfile.gettempdir(), "mikado.monoproc.") + "*")]
def test_index(self): # Create the list of files files = [ "trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3", "trinity.match_matchpart.gff3" ] # files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files] namespace = Namespace(default=False) namespace.distance = 2000 namespace.index = True namespace.prediction = None namespace.log = os.path.join(tempfile.gettempdir(), "index.log") logger = create_null_logger("null") for ref in files: with self.subTest(ref=ref): temp_ref = os.path.join(tempfile.gettempdir(), ref) with pkg_resources.resource_stream("Mikado.tests", ref) as ref_handle,\ open(temp_ref, "wb") as out_handle: out_handle.write(ref_handle.read()) namespace.reference = to_gff(temp_ref) compare(namespace) self.assertTrue(os.path.exists(namespace.log)) self.assertTrue( os.path.exists("{}.midx".format(namespace.reference.name))) self.assertGreater( os.stat("{}.midx".format( namespace.reference.name)).st_size, 0) genes, positions = load_index(namespace, logger) self.assertIsInstance(genes, dict) self.assertIsInstance(positions, dict) self.assertEqual(len(genes), 38) os.remove(namespace.reference.name) os.remove(namespace.log) os.remove("{}.midx".format(namespace.reference.name))
def test_purging(self): gtf = """Chr1 foo transcript 100 1000 . + . gene_id "foo1"; transcript_id "foo1.1" Chr1 foo exon 100 1000 . + . gene_id "foo1"; transcript_id "foo1.1" Chr1 foo transcript 100 2000 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo exon 100 800 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo exon 1900 2000 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo transcript 10000 20000 . + . gene_id "foo2"; transcript_id "foo2.1" Chr1 foo exon 10000 13000 . + . gene_id "foo2; transcript_id "foo2.1" Chr1 foo exon 19000 20000 . + . gene_id "foo"; transcript_id "foo2.1""" temp_gtf = tempfile.NamedTemporaryFile(mode="wt", suffix=".gtf", delete=True) temp_gtf.write(gtf) temp_gtf.flush() json_conf = configurator.to_json(None) json_conf["pick"]["files"]["input"] = temp_gtf.name json_conf["db_settings"]["db"] = os.path.join(tempfile.gettempdir(), "mikado.db") json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir() json_conf["log_settings"]["log_level"] = "WARNING" # Now the scoring scoring = dict() scoring["requirements"] = dict() scoring["requirements"]["expression"] = ["exon_num"] scoring["requirements"]["parameters"] = dict() scoring["requirements"]["parameters"]["exon_num"] = dict() scoring["requirements"]["parameters"]["exon_num"]["name"] = "exon_num" scoring["requirements"]["parameters"]["exon_num"]["operator"] = "gt" scoring["requirements"]["parameters"]["exon_num"]["value"] = 1 import copy scoring["as_requirements"] = copy.deepcopy(scoring["requirements"]) scoring["not_fragmentary"] = copy.deepcopy(scoring["requirements"].copy()) scoring["scoring"] = dict() scoring["scoring"]["cdna_length"] = dict() scoring["scoring"]["cdna_length"]["rescaling"] = "max" scoring["scoring"]["cdna_length"]["filter"] = dict() scoring["scoring"]["cdna_length"]["filter"]["operator"] = "gt" scoring["scoring"]["cdna_length"]["filter"]["value"] = 2000 scoring_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=True, mode="wt") yaml.dump(scoring, scoring_file) scoring_file.flush() json_conf["pick"]["scoring_file"] = scoring_file.name del json_conf["scoring"] del json_conf["requirements"] del json_conf["as_requirements"] del json_conf["not_fragmentary"] for purging in (False, True): with self.subTest(purging=purging): json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging) json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 1, json_conf["scoring"].keys()) pick_caller = picker.Picker(json_conf=json_conf) with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"): pick_caller() with to_gff(os.path.join(tempfile.gettempdir(), json_conf["pick"]["files"]["loci_out"])) as gff: lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"])) if purging is True: self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")])) else: found_line = [_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")] self.assertTrue(any(found_line)) self.assertTrue(any([_ for _ in found_line if _.score == 0])) # Clean up for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]: [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), fname))] 
scoring_file.close() # Now let us test with a scoring which will create transcripts with negative scores scoring["scoring"] = dict() scoring["scoring"]["cdna_length"] = dict() scoring["scoring"]["cdna_length"]["rescaling"] = "min" scoring["scoring"]["cdna_length"]["multiplier"] = -10 scoring["scoring"]["cdna_length"]["filter"] = dict() scoring["scoring"]["cdna_length"]["filter"]["operator"] = "lt" scoring["scoring"]["cdna_length"]["filter"]["value"] = 1000 scoring["scoring"]["exon_num"] = dict() scoring["scoring"]["exon_num"]["rescaling"] = "max" scoring_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=True, mode="wt") yaml.dump(scoring, scoring_file) scoring_file.flush() json_conf["pick"]["scoring_file"] = scoring_file.name for purging in (False, True): with self.subTest(purging=purging): json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"]["subloci_out"] = "mikado.purging_{}.subloci.gff3".format(purging) json_conf["pick"]["files"]["log"] = os.path.join( tempfile.gettempdir(), "mikado.purging_{}.log".format(purging)) json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys()) pick_caller = picker.Picker(json_conf=json_conf) with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"): pick_caller() with to_gff(os.path.join(tempfile.gettempdir(), json_conf["pick"]["files"]["loci_out"])) as gff: lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"])) if purging is True: self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")])) else: found_line = [_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")] self.assertTrue(any(found_line)) self.assertTrue(any([_ for _ in found_line if _.score <= 0])) # Clean up for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]: [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), fname))] temp_gtf.close() temp_gtf = tempfile.NamedTemporaryFile(mode="wt", suffix=".gtf", delete=True) gtf = "\n".join([_ for _ in gtf.split("\n") if "foo1.1" not in _]) temp_gtf.write(gtf) temp_gtf.flush() json_conf["pick"]["files"]["input"] = temp_gtf.name for purging in (False, True): with self.subTest(purging=purging): json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"]["subloci_out"] = "mikado.purging_{}.subloci.gff3".format(purging) json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging) json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys()) pick_caller = picker.Picker(json_conf=json_conf) with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"): pick_caller() with to_gff(os.path.join(tempfile.gettempdir(), json_conf["pick"]["files"]["loci_out"])) as gff: lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"])) if purging is True: self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") == "foo1.2"])) else: 
found_line = [_ for _ in lines if _.attributes.get("alias", "") == "foo1.2"] self.assertTrue(any(found_line)) self.assertTrue(any([_ for _ in found_line if _.score <= 0]), "\n".join([str(_) for _ in found_line])) # Clean up for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]: [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), fname))]
def main():
    """Transfer a reference annotation onto its (presumed GMAP) alignments on a target, farming the work out to Transferer sub-processes."""
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--bed12", nargs=2, required=True,
                        help="Transcriptomic cDNAs BED12s")
    parser.add_argument("--cdnas", nargs=2, required=True)
    parser.add_argument("-gf", help="GFF3/BED12 of the transferred annotation.", required=True)
    parser.add_argument("--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("-ob", "--out-bed", dest="out_bed", required=False,
                        default=None, type=argparse.FileType("wt"))
    log = parser.add_mutually_exclusive_group()
    log.add_argument("-q", "--quiet", default=False, action="store_true")
    log.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument("-p", "--processes", type=int, default=mp.cpu_count())
    args = parser.parse_args()

    logger = create_default_logger("master")
    verbosity = "INFO"
    if args.verbose is True:
        verbosity = "DEBUG"
    elif args.quiet is True:
        verbosity = "WARNING"

    # logging_queue is presumed to be defined at module level.
    listener = logging.handlers.QueueListener(logging_queue, logger)
    listener.propagate = False
    listener.start()
    logger.setLevel(verbosity)

    cdnas = dict()
    beds = dict()
    beds["ref"] = dict()
    beds["target"] = dict()

    gmap_pat = re.compile(r"\.mrna[0-9]*$")

    logger.info("Loading reference cDNAS")
    cdnas["ref"] = pyfaidx.Fasta(args.cdnas[0])
    logger.info("Loading target cDNAS")
    cdnas["target"] = pyfaidx.Fasta(args.cdnas[1])
    logger.info("Loaded cDNAs")

    logger.info("Loading reference BED12")
    for entry in Bed12Parser(args.bed12[0], transcriptomic=True):
        if entry.header:
            continue
        name = entry.chrom
        if name in beds["ref"]:
            raise KeyError("Duplicated ID for the reference: {}".format(name))
        if name not in cdnas["ref"]:
            raise KeyError("Reference {} not found in the cDNAs!".format(name))
        beds["ref"][name] = entry

    logger.info("Loading target BED12")
    beds["target"] = defaultdict(dict)
    for entry in Bed12Parser(args.bed12[1], transcriptomic=True):
        # Now, here we have to account for the fact that there *might* be multiple alignments
        name = re.sub(gmap_pat, "", entry.chrom)
        if entry.chrom not in cdnas["target"]:
            raise KeyError("Target {} not found in the cDNAs!".format(entry.chrom))
        beds["target"][name][entry.chrom] = entry
    logger.info("Loaded BED12s")

    # Now let us start parsing the GFF3, which we presume being a GMAP GFF3
    transcript = None

    logger.info("Launching sub-processes")
    procs = []
    queue = mp.Queue(-1)
    for proc in range(args.processes):
        sq = tempfile.NamedTemporaryFile(mode="wb")
        sq.close()
        sq = sq.name
        _proc = Transferer(sq, queue, verbosity=verbosity)
        _proc.start()
        procs.append(_proc)
    logger.info("Launched sub-processes, starting parsing annotation")

    # pool = mp.Pool(processes=args.processes)

    tnum = -1
    if args.gf.endswith(("bed12", "bed")):
        parser = Bed12Parser(args.gf, transcriptomic=False)
        for line in parser:
            if line.header:
                continue
            else:
                transcript = Transcript(line)
                tid = re.sub(gmap_pat, "", transcript.id)
                logger.debug("Found %s", tid)
                ref_cdna = str(cdnas["ref"][tid])
                ref_bed = beds["ref"][tid]
                target_cdna = str(cdnas["target"][transcript.id])
                target_bed = beds["target"][tid][transcript.id]
                tnum += 1
                logger.debug("Submitting %s", tid)
                queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna, target_bed)))
            if tnum >= 10**4 and tnum % 10**4 == 0:
                logger.info("Parsed %d transcripts", tnum)
        logger.info("Finished parsing input genomic BED file")
    else:
        parser = to_gff(args.gf)
        for pos, line in enumerate(parser):
            if line.header is True:  # or (not isinstance(line, BED12) and line.is_gene is True):
                if str(line) == "###":
                    continue
                try:
                    print(line, file=args.out)
                except IndexError:
                    raise IndexError(line._line)
                continue
            elif not isinstance(line, BED12) and line.is_gene is True:
                continue
            elif line.is_transcript is True:
                if transcript:
                    if transcript.alias is None:
                        tid = re.sub(gmap_pat, "", transcript.id)
                    else:
                        tid = re.sub(gmap_pat, "", transcript.alias)
                    ref_cdna = str(cdnas["ref"][tid])
                    ref_bed = beds["ref"][tid]
                    target_cdna = str(cdnas["target"][transcript.id])
                    store = beds["target"].get(tid, None)
                    if store is None:
                        raise KeyError((tid, beds["target"].keys()))
                    target_bed = store.get(transcript.id, None)
                    if target_bed is None:
                        raise KeyError((tid, store.keys()))
                    tnum += 1
                    queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna, target_bed)))
                try:
                    transcript = Transcript(line)
                except (ValueError, TypeError):
                    raise ValueError((pos, line))
            elif line.is_exon is True:
                transcript.add_exon(line)
            if tnum >= 10**4 and tnum % 10**4 == 0:
                logger.info("Parsed %d transcripts", tnum)

        if transcript:
            tnum += 1
            tid = re.sub(gmap_pat, "", transcript.id)
            ref_cdna = str(cdnas["ref"][tid])
            ref_bed = beds["ref"][tid]
            target_cdna = str(cdnas["target"][transcript.id])
            target_bed = beds["target"][tid][transcript.id]
            queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna, target_bed)))
        logger.info("Finished parsing input genomic GF file")

    queue.put("EXIT")
    logger.info("Waiting for subprocesses to finish")
    [_proc.join() for _proc in procs]

    # Now the printing ...
    # results = dict()
    logger.info("Subprocesses finished, printing")
    for proc in procs:
        sq = sqlalchemy.create_engine("sqlite:///{}".format(proc.out_sq))
        for res in sq.execute("select * from storer"):
            num, bed12, gff3 = res
            if args.out_bed is not None:
                print(bed12.decode(), file=args.out_bed)
            print(*gff3.decode().split("\n"), file=args.out, sep="\n")
        os.remove(proc.out_sq)
    logger.info("Finished!")
    return
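# A minimal sketch of how main() above would be wired up and invoked. The entry-point
# guard is not part of the excerpt, and the script/file names in the example command
# are hypothetical; the options simply mirror the argparse definitions in main().
#
#   python transfer_annotation.py \
#       --bed12 ref_cdnas.bed12 target_cdnas.bed12 \
#       --cdnas ref_cdnas.fasta target_cdnas.fasta \
#       -gf gmap_alignments.gff3 --out transferred.gff3 -p 4
if __name__ == "__main__":
    main()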
def test_purging(self): gtf = """Chr1 foo transcript 100 1000 . + . gene_id "foo1"; transcript_id "foo1.1" Chr1 foo exon 100 1000 . + . gene_id "foo1"; transcript_id "foo1.1" Chr1 foo transcript 100 2000 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo exon 100 800 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo exon 1900 2000 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo transcript 10000 20000 . + . gene_id "foo2"; transcript_id "foo2.1" Chr1 foo exon 10000 13000 . + . gene_id "foo2; transcript_id "foo2.1" Chr1 foo exon 19000 20000 . + . gene_id "foo"; transcript_id "foo2.1""" temp_gtf = tempfile.NamedTemporaryFile(mode="wt", suffix=".gtf", delete=True) temp_gtf.write(gtf) temp_gtf.flush() json_conf = configurator.to_json(None) json_conf["pick"]["files"]["input"] = temp_gtf.name json_conf["db_settings"]["db"] = os.path.join(tempfile.gettempdir(), "mikado.db") json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir() json_conf["log_settings"]["log_level"] = "WARNING" # Now the scoring scoring = dict() scoring["requirements"] = dict() scoring["requirements"]["expression"] = ["exon_num"] scoring["requirements"]["parameters"] = dict() scoring["requirements"]["parameters"]["exon_num"] = dict() scoring["requirements"]["parameters"]["exon_num"]["name"] = "exon_num" scoring["requirements"]["parameters"]["exon_num"]["operator"] = "gt" scoring["requirements"]["parameters"]["exon_num"]["value"] = 1 import copy scoring["as_requirements"] = copy.deepcopy(scoring["requirements"]) scoring["not_fragmentary"] = copy.deepcopy( scoring["requirements"].copy()) scoring["scoring"] = dict() scoring["scoring"]["cdna_length"] = dict() scoring["scoring"]["cdna_length"]["rescaling"] = "max" scoring["scoring"]["cdna_length"]["filter"] = dict() scoring["scoring"]["cdna_length"]["filter"]["operator"] = "gt" scoring["scoring"]["cdna_length"]["filter"]["value"] = 2000 scoring_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=True, mode="wt") yaml.dump(scoring, scoring_file) scoring_file.flush() json_conf["pick"]["scoring_file"] = scoring_file.name del json_conf["scoring"] del json_conf["requirements"] del json_conf["as_requirements"] del json_conf["not_fragmentary"] for purging in (False, True): with self.subTest(purging=purging): json_conf["pick"]["files"][ "loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"][ "log"] = "mikado.purging_{}.log".format(purging) json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 1, json_conf["scoring"].keys()) pick_caller = picker.Picker(json_conf=json_conf) with self.assertRaises(SystemExit), self.assertLogs( "main_logger", "INFO"): pick_caller() with to_gff( os.path.join( tempfile.gettempdir(), json_conf["pick"]["files"]["loci_out"])) as gff: lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) self.assertTrue( any([ _ for _ in lines if _.attributes.get("alias", "") == "foo2.1" ])) if purging is True: self.assertFalse( any([ _ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1") ])) else: found_line = [ _ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1") ] self.assertTrue(any(found_line)) self.assertTrue( any([_ for _ in found_line if _.score == 0])) # Clean up for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]: [ os.remove(_) for _ in glob.glob( os.path.join(tempfile.gettempdir(), 
fname)) ] scoring_file.close() # Now let us test with a scoring which will create transcripts with negative scores scoring["scoring"] = dict() scoring["scoring"]["cdna_length"] = dict() scoring["scoring"]["cdna_length"]["rescaling"] = "min" scoring["scoring"]["cdna_length"]["multiplier"] = -10 scoring["scoring"]["cdna_length"]["filter"] = dict() scoring["scoring"]["cdna_length"]["filter"]["operator"] = "lt" scoring["scoring"]["cdna_length"]["filter"]["value"] = 1000 scoring["scoring"]["exon_num"] = dict() scoring["scoring"]["exon_num"]["rescaling"] = "max" scoring_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=True, mode="wt") yaml.dump(scoring, scoring_file) scoring_file.flush() json_conf["pick"]["scoring_file"] = scoring_file.name for purging in (False, True): with self.subTest(purging=purging): json_conf["pick"]["files"][ "loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"][ "subloci_out"] = "mikado.purging_{}.subloci.gff3".format( purging) json_conf["pick"]["files"]["log"] = os.path.join( tempfile.gettempdir(), "mikado.purging_{}.log".format(purging)) json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys()) pick_caller = picker.Picker(json_conf=json_conf) with self.assertRaises(SystemExit), self.assertLogs( "main_logger", "INFO"): pick_caller() with to_gff( os.path.join( tempfile.gettempdir(), json_conf["pick"]["files"]["loci_out"])) as gff: lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) self.assertTrue( any([ _ for _ in lines if _.attributes.get("alias", "") == "foo2.1" ])) if purging is True: self.assertFalse( any([ _ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1") ])) else: found_line = [ _ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1") ] self.assertTrue(any(found_line)) self.assertTrue( any([_ for _ in found_line if _.score <= 0])) # Clean up for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]: [ os.remove(_) for _ in glob.glob( os.path.join(tempfile.gettempdir(), fname)) ] temp_gtf.close() temp_gtf = tempfile.NamedTemporaryFile(mode="wt", suffix=".gtf", delete=True) gtf = "\n".join([_ for _ in gtf.split("\n") if "foo1.1" not in _]) temp_gtf.write(gtf) temp_gtf.flush() json_conf["pick"]["files"]["input"] = temp_gtf.name for purging in (False, True): with self.subTest(purging=purging): json_conf["pick"]["files"][ "loci_out"] = "mikado.purging_{}.loci.gff3".format(purging) json_conf["pick"]["files"][ "subloci_out"] = "mikado.purging_{}.subloci.gff3".format( purging) json_conf["pick"]["files"][ "log"] = "mikado.purging_{}.log".format(purging) json_conf["pick"]["clustering"]["purge"] = purging json_conf["pick"]["scoring_file"] = scoring_file.name json_conf = configurator.check_json(json_conf) self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys()) pick_caller = picker.Picker(json_conf=json_conf) with self.assertRaises(SystemExit), self.assertLogs( "main_logger", "INFO"): pick_caller() with to_gff( os.path.join( tempfile.gettempdir(), json_conf["pick"]["files"]["loci_out"])) as gff: lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) self.assertTrue( any([ _ for _ in lines if _.attributes.get("alias", "") == "foo2.1" ])) if purging is True: self.assertFalse( any([ _ for _ in lines if 
_.attributes.get("alias", "") == "foo1.2" ])) else: found_line = [ _ for _ in lines if _.attributes.get("alias", "") == "foo1.2" ] self.assertTrue(any(found_line)) self.assertTrue( any([_ for _ in found_line if _.score <= 0]), "\n".join([str(_) for _ in found_line])) # Clean up for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]: [ os.remove(_) for _ in glob.glob( os.path.join(tempfile.gettempdir(), fname)) ]