    def test_compare_trinity(self):
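        # Compare every ordered (reference, prediction) pair of four
        # equivalent encodings of the same Trinity annotation. Each refmap
        # entry must carry a full match class code ("=" or "_", possibly
        # fusion-flagged), and all 38 reference transcripts must be present.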

        # Create the list of files
        files = [
            "trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3",
            "trinity.match_matchpart.gff3"
        ]
        files = [
            pkg_resources.resource_filename("Mikado.tests", filename)
            for filename in files
        ]

        namespace = Namespace(default=False)
        namespace.distance = 2000
        namespace.no_save_index = True

        for ref, pred in itertools.permutations(files, 2):

            with self.subTest(ref=ref, pred=pred):
                namespace.reference = to_gff(ref)
                namespace.prediction = to_gff(pred)
                namespace.log = os.path.join(
                    tempfile.gettempdir(),
                    "compare_{}_{}.log".format(files.index(ref),
                                               files.index(pred)))
                namespace.out = os.path.join(
                    tempfile.gettempdir(),
                    "compare_{}_{}".format(files.index(ref),
                                           files.index(pred)))
                compare(namespace)
                refmap = "{}.refmap".format(namespace.out)
                tmap = "{}.tmap".format(namespace.out)
                stats = "{}.stats".format(namespace.out)

                self.assertTrue(os.path.exists(namespace.log))
                # with open(log) as log_handle:
                #     log = [_.rstrip() for _ in log_handle]
                for fname in [refmap, stats, tmap]:
                    self.assertTrue(os.path.exists(fname))
                    self.assertGreater(os.stat(fname).st_size, 0)

                with open(refmap) as _:
                    reader = csv.DictReader(_, delimiter="\t")
                    counter = 0
                    for counter, line in enumerate(reader, start=1):
                        ccode = line["ccode"]
                        self.assertIn(ccode, ("_", "=", "f,_", "f,="),
                                      (ref, pred, line))

                    self.assertEqual(counter, 38)

        for suff in ["log", "refmap", "tmap", "stats"]:
            [
                os.remove(_) for _ in glob.glob(
                    os.path.join(tempfile.gettempdir(), "compare_*.{}".format(
                        suff)))
            ]
    def test_multi_proc(self):
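        # Run the picker with two processes on mikado_prepared.gtf and check
        # that the resulting loci GFF3 contains transcript, mRNA and CDS lines.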
        json_conf = configurator.to_json(None)
        json_conf["pick"]["run_options"]["procs"] = 2
        json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename("Mikado.tests",
                                                                              "mikado_prepared.gtf")
        json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir()
        json_conf["pick"]["files"]["loci_out"] = "mikado.multiproc.loci.gff3"
        json_conf["pick"]["files"]["subloci_out"] = "mikado.multiproc.subloci.gff3"
        json_conf["pick"]["files"]["monoloci_out"] = "mikado.multiproc.monoloci.gff3"
        json_conf["pick"]["files"]["log"] = "mikado.multiproc.log"
        json_conf["db_settings"]["db"] = pkg_resources.resource_filename("Mikado.tests", "mikado.db")
        json_conf["log_settings"]["log_level"] = "WARNING"

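        # A successful pick run terminates via sys.exit(), hence SystemExit
        # is the expected exit path here.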
        pick_caller = picker.Picker(json_conf=json_conf)
        with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
            pick_caller()
        self.assertTrue(os.path.exists(os.path.join(tempfile.gettempdir(), "mikado.multiproc.loci.gff3")))
        with to_gff(os.path.join(tempfile.gettempdir(), "mikado.multiproc.loci.gff3")) as inp_gff:
            lines = [_ for _ in inp_gff if _.header is not True]
            self.assertGreater(len(lines), 0)
            self.assertGreater(len([_ for _ in lines if _.is_transcript is True]), 0)
            self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]), 0)
            self.assertGreater(len([_ for _ in lines if _.feature == "CDS"]), 0)

        [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), "mikado.multiproc.") + "*")]
    def test_stat(self):
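        # Compute the statistics report for each of the four equivalent
        # Trinity files and compare it, line by line, against the bundled
        # trinity_stats.txt reference output.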

        files = ["trinity.gtf",
                 "trinity.gff3",
                 "trinity.cDNA_match.gff3",
                 "trinity.match_matchpart.gff3"]
        files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]

        std_lines = []
        with pkg_resources.resource_stream("Mikado.tests", "trinity_stats.txt") as t_stats:
            for line in t_stats:
                std_lines.append(line.decode().rstrip())

        namespace = Namespace(default=False)
        namespace.tab_stats = None
        for filename in files:
            with self.subTest(filename=filename):
                namespace.gff = to_gff(filename)
                with open(os.path.join(tempfile.gettempdir(),
                                       "{}.txt".format(os.path.basename(filename))), "w") as out:
                    namespace.out = out
                    Calculator(namespace)()
                self.assertGreater(os.stat(out.name).st_size, 0)
                with open(out.name) as out_handle:
                    lines = [_.rstrip() for _ in out_handle]
                self.assertEqual(std_lines, lines)
                os.remove(out.name)
    def test_index(self):
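        # Run `mikado compare` in index-only mode on each reference file: a
        # non-empty .midx file must be produced, and loading it back must
        # yield gene and position dictionaries covering all 38 genes.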

        # Create the list of files
        files = ["trinity.gtf",
                 "trinity.gff3",
                 "trinity.cDNA_match.gff3",
                 "trinity.match_matchpart.gff3"]
        # files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]

        namespace = Namespace(default=False)
        namespace.distance = 2000
        namespace.index = True
        namespace.prediction = None
        namespace.log = os.path.join(tempfile.gettempdir(), "index.log")
        logger = create_null_logger("null")

        for ref in files:
            with self.subTest(ref=ref):
                temp_ref = os.path.join(tempfile.gettempdir(), ref)
                with pkg_resources.resource_stream("Mikado.tests", ref) as ref_handle,\
                        open(temp_ref, "wb") as out_handle:
                    out_handle.write(ref_handle.read())
                namespace.reference = to_gff(temp_ref)
                compare(namespace)

                self.assertTrue(os.path.exists(namespace.log))
                self.assertTrue(os.path.exists("{}.midx".format(namespace.reference.name)))
                self.assertGreater(os.stat("{}.midx".format(namespace.reference.name)).st_size, 0)
                genes, positions = load_index(namespace, logger)
                self.assertIsInstance(genes, dict)
                self.assertIsInstance(positions, dict)
                self.assertEqual(len(genes), 38)
                os.remove(namespace.reference.name)
                os.remove(namespace.log)
                os.remove("{}.midx".format(namespace.reference.name))
    def test_subprocess(self):
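        # Drive `mikado pick` through its console-script entry point, both
        # single-threaded and with two processes, feeding it a configuration
        # dumped to YAML; the loci output must contain transcript, mRNA and
        # CDS lines.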

        json_conf = configurator.to_json(None)

        json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename(
            "Mikado.tests", "mikado_prepared.gtf")
        json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir()
        json_conf["pick"]["files"]["loci_out"] = "mikado.subproc.loci.gff3"
        json_conf["pick"]["files"][
            "subloci_out"] = "mikado.subproc.subloci.gff3"
        json_conf["pick"]["files"][
            "monoloci_out"] = "mikado.subproc.monoloci.gff3"
        json_conf["pick"]["files"]["log"] = "mikado.subproc.log"
        json_conf["db_settings"]["db"] = pkg_resources.resource_filename(
            "Mikado.tests", "mikado.db")
        json_conf["log_settings"]["log_level"] = "WARNING"

        for num in (1, 2):
            with self.subTest(num=num):

                json_conf["pick"]["run_options"]["procs"] = num
                json_conf["pick"]["run_options"]["single_thread"] = (num == 1)
                json_file = os.path.join(tempfile.gettempdir(), "mikado.yaml")

                with open(json_file, "wt") as json_handle:
                    Mikado.subprograms.configure.print_config(
                        yaml.dump(json_conf, default_flow_style=False),
                        json_handle)

                sys.argv = ["mikado", "pick", "--json-conf", json_file]
                with self.assertRaises(SystemExit):
                    pkg_resources.load_entry_point("Mikado", "console_scripts",
                                                   "mikado")()
                self.assertTrue(
                    os.path.exists(
                        os.path.join(tempfile.gettempdir(),
                                     "mikado.subproc.loci.gff3")))
                with to_gff(
                        os.path.join(tempfile.gettempdir(),
                                     "mikado.subproc.loci.gff3")) as inp_gff:
                    lines = [_ for _ in inp_gff if _.header is not True]
                    self.assertGreater(len(lines), 0)
                    self.assertGreater(
                        len([_ for _ in lines if _.is_transcript is True]), 0)
                    self.assertGreater(
                        len([_ for _ in lines if _.feature == "mRNA"]), 0)
                    self.assertGreater(
                        len([_ for _ in lines if _.feature == "CDS"]), 0)

                [
                    os.remove(_) for _ in glob.glob(
                        os.path.join(tempfile.gettempdir(), "mikado.subproc.")
                        + "*")
                ]
    def test_single_proc(self):
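        # Same checks as the multi-process picker test, but with a single
        # process.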

        json_conf = configurator.to_json(None)
        json_conf["pick"]["run_options"]["procs"] = 1
        json_conf["db_settings"]["db"] = pkg_resources.resource_filename(
            "Mikado.tests", "mikado.db")

        json_conf["pick"]["files"]["input"] = pkg_resources.resource_filename(
            "Mikado.tests", "mikado_prepared.gtf")
        json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir()
        json_conf["pick"]["files"]["loci_out"] = "mikado.monoproc.loci.gff3"
        json_conf["pick"]["files"][
            "subloci_out"] = "mikado.monoproc.subloci.gff3"
        json_conf["pick"]["files"][
            "monoloci_out"] = "mikado.monoproc.monoloci.gff3"
        json_conf["pick"]["files"]["log"] = "mikado.monoproc.log"
        json_conf["log_settings"]["log_level"] = "WARNING"

        pick_caller = picker.Picker(json_conf=json_conf)
        with self.assertRaises(SystemExit), self.assertLogs(
                "main_logger", "INFO"):
            pick_caller()
        self.assertTrue(
            os.path.exists(
                os.path.join(tempfile.gettempdir(),
                             "mikado.monoproc.loci.gff3")))
        with to_gff(
                os.path.join(tempfile.gettempdir(),
                             "mikado.monoproc.loci.gff3")) as inp_gff:
            lines = [_ for _ in inp_gff if _.header is not True]
            self.assertGreater(len(lines), 0)
            self.assertGreater(
                len([_ for _ in lines if _.is_transcript is True]), 0)
            self.assertGreater(len([_ for _ in lines if _.feature == "mRNA"]),
                               0)
            self.assertGreater(len([_ for _ in lines if _.feature == "CDS"]),
                               0)

        [
            os.remove(_) for _ in glob.glob(
                os.path.join(tempfile.gettempdir(), "mikado.monoproc.") + "*")
        ]
    def test_purging(self):
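        # Verify the picker's `purge` switch with a hand-written GTF and ad
        # hoc scoring configurations: with purging enabled, transcripts that
        # fail the requirements must vanish from the loci output; with
        # purging disabled they are kept, but with a score of zero or below.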

        gtf = """Chr1	foo	transcript	100	1000	.	+	.	gene_id "foo1"; transcript_id "foo1.1"
Chr1	foo	exon	100	1000	.	+	.	gene_id "foo1"; transcript_id "foo1.1"
Chr1	foo	transcript	100	2000	.	+	.	gene_id "foo1"; transcript_id "foo1.2"
Chr1	foo	exon	100	800	.	+	.	gene_id "foo1"; transcript_id "foo1.2"
Chr1	foo	exon	1900	2000	.	+	.	gene_id "foo1"; transcript_id "foo1.2"
Chr1	foo	transcript	10000	20000	.	+	.	gene_id "foo2"; transcript_id "foo2.1"
Chr1	foo	exon	10000	13000	.	+	.	gene_id "foo2"; transcript_id "foo2.1"
Chr1	foo	exon	19000	20000	.	+	.	gene_id "foo2"; transcript_id "foo2.1\""""

        temp_gtf = tempfile.NamedTemporaryFile(mode="wt", suffix=".gtf", delete=True)

        temp_gtf.write(gtf)
        temp_gtf.flush()

        json_conf = configurator.to_json(None)

        json_conf["pick"]["files"]["input"] = temp_gtf.name
        json_conf["db_settings"]["db"] = os.path.join(tempfile.gettempdir(), "mikado.db")
        json_conf["pick"]["files"]["output_dir"] = tempfile.gettempdir()
        json_conf["log_settings"]["log_level"] = "WARNING"

        # Build a minimal scoring configuration: transcripts are required
        # to have more than one exon, and are scored on cDNA length.
        scoring = dict()
        
        scoring["requirements"] = dict()
        scoring["requirements"]["expression"] = ["exon_num"]
        scoring["requirements"]["parameters"] = dict()
        scoring["requirements"]["parameters"]["exon_num"] = dict()
        scoring["requirements"]["parameters"]["exon_num"]["name"] = "exon_num"
        scoring["requirements"]["parameters"]["exon_num"]["operator"] = "gt"
        scoring["requirements"]["parameters"]["exon_num"]["value"] = 1

        import copy
        scoring["as_requirements"] = copy.deepcopy(scoring["requirements"])
        scoring["not_fragmentary"] = copy.deepcopy(scoring["requirements"].copy())

        scoring["scoring"] = dict()
        scoring["scoring"]["cdna_length"] = dict()
        scoring["scoring"]["cdna_length"]["rescaling"] = "max"
        scoring["scoring"]["cdna_length"]["filter"] = dict()
        scoring["scoring"]["cdna_length"]["filter"]["operator"] = "gt"
        scoring["scoring"]["cdna_length"]["filter"]["value"] = 2000

        scoring_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=True, mode="wt")
        yaml.dump(scoring, scoring_file)
        scoring_file.flush()
        json_conf["pick"]["scoring_file"] = scoring_file.name
        del json_conf["scoring"]
        del json_conf["requirements"]
        del json_conf["as_requirements"]
        del json_conf["not_fragmentary"]

        for purging in (False, True):
            with self.subTest(purging=purging):
                json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging)
                json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging)
                json_conf["pick"]["clustering"]["purge"] = purging
                json_conf["pick"]["scoring_file"] = scoring_file.name
                json_conf = configurator.check_json(json_conf)
                self.assertEqual(len(json_conf["scoring"].keys()), 1, json_conf["scoring"].keys())

                pick_caller = picker.Picker(json_conf=json_conf)
                with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
                    pick_caller()

                with to_gff(os.path.join(tempfile.gettempdir(),
                                         json_conf["pick"]["files"]["loci_out"])) as gff:

                    lines = [line for line in gff if line.header is False]
                self.assertGreater(len(lines), 0)
                self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"]))
                if purging is True:
                    self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")]))
                else:
                    found_line = [_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")]
                    self.assertTrue(any(found_line))
                    self.assertTrue(any([_ for _ in found_line if _.score == 0]))

            # Clean up
            for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]:
                [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), fname))]

        scoring_file.close()
        # Now let us test with a scoring which will create transcripts with negative scores
        scoring["scoring"] = dict()
        scoring["scoring"]["cdna_length"] = dict()
        scoring["scoring"]["cdna_length"]["rescaling"] = "min"
        scoring["scoring"]["cdna_length"]["multiplier"] = -10
        scoring["scoring"]["cdna_length"]["filter"] = dict()
        scoring["scoring"]["cdna_length"]["filter"]["operator"] = "lt"
        scoring["scoring"]["cdna_length"]["filter"]["value"] = 1000

        scoring["scoring"]["exon_num"] = dict()
        scoring["scoring"]["exon_num"]["rescaling"] = "max"

        scoring_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=True, mode="wt")
        yaml.dump(scoring, scoring_file)
        scoring_file.flush()
        json_conf["pick"]["scoring_file"] = scoring_file.name

        for purging in (False, True):
            with self.subTest(purging=purging):
                json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging)
                json_conf["pick"]["files"]["subloci_out"] = "mikado.purging_{}.subloci.gff3".format(purging)
                json_conf["pick"]["files"]["log"] = os.path.join(
                    tempfile.gettempdir(),
                    "mikado.purging_{}.log".format(purging))
                json_conf["pick"]["clustering"]["purge"] = purging
                json_conf["pick"]["scoring_file"] = scoring_file.name
                json_conf = configurator.check_json(json_conf)
                self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys())

                pick_caller = picker.Picker(json_conf=json_conf)
                with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
                    pick_caller()

                with to_gff(os.path.join(tempfile.gettempdir(),
                                         json_conf["pick"]["files"]["loci_out"])) as gff:
                    lines = [line for line in gff if line.header is False]
                self.assertGreater(len(lines), 0)
                self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"]))
                if purging is True:
                    self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")]))
                else:
                    found_line = [_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")]
                    self.assertTrue(any(found_line))
                    self.assertTrue(any([_ for _ in found_line if _.score <= 0]))

            # Clean up
            for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]:
                [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), fname))]

        temp_gtf.close()

        temp_gtf = tempfile.NamedTemporaryFile(mode="wt", suffix=".gtf", delete=True)

        gtf = "\n".join([_ for _ in gtf.split("\n") if "foo1.1" not in _])

        temp_gtf.write(gtf)
        temp_gtf.flush()
        json_conf["pick"]["files"]["input"] = temp_gtf.name

        for purging in (False, True):
            with self.subTest(purging=purging):
                json_conf["pick"]["files"]["loci_out"] = "mikado.purging_{}.loci.gff3".format(purging)
                json_conf["pick"]["files"]["subloci_out"] = "mikado.purging_{}.subloci.gff3".format(purging)
                json_conf["pick"]["files"]["log"] = "mikado.purging_{}.log".format(purging)
                json_conf["pick"]["clustering"]["purge"] = purging
                json_conf["pick"]["scoring_file"] = scoring_file.name
                json_conf = configurator.check_json(json_conf)
                self.assertEqual(len(json_conf["scoring"].keys()), 2, json_conf["scoring"].keys())

                pick_caller = picker.Picker(json_conf=json_conf)
                with self.assertRaises(SystemExit), self.assertLogs("main_logger", "INFO"):
                    pick_caller()

                with to_gff(os.path.join(tempfile.gettempdir(),
                                         json_conf["pick"]["files"]["loci_out"])) as gff:
                    lines = [line for line in gff if line.header is False]
                self.assertGreater(len(lines), 0)
                self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"]))
                if purging is True:
                    self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") == "foo1.2"]))
                else:
                    found_line = [_ for _ in lines if _.attributes.get("alias", "") == "foo1.2"]
                    self.assertTrue(any(found_line))
                    self.assertTrue(any([_ for _ in found_line if _.score <= 0]),
                                    "\n".join([str(_) for _ in found_line]))

            # Clean up
            for fname in ["mikado.db", "mikado.purging_{}.*".format(purging)]:
                [os.remove(_) for _ in glob.glob(os.path.join(tempfile.gettempdir(), fname))]
def main():
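    # Transfer an annotation from a reference onto a target set of cDNAs:
    # load the transcriptomic BED12s and FASTA sequences for both sides,
    # stream the (presumed GMAP) genomic BED12/GFF3 alignments, and hand
    # each transcript to a pool of Transferer worker processes.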

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--bed12",
                        nargs=2,
                        required=True,
                        help="Transcriptomic cDNAs BED12s")
    parser.add_argument("--cdnas", nargs=2, required=True)
    parser.add_argument("-gf",
                        help="GFF3/BED12 of the transferred annotation.",
                        required=True)
    parser.add_argument("--out",
                        default=sys.stdout,
                        type=argparse.FileType("wt"))
    parser.add_argument("-ob",
                        "--out-bed",
                        dest="out_bed",
                        required=False,
                        default=None,
                        type=argparse.FileType("wt"))
    log = parser.add_mutually_exclusive_group()
    log.add_argument("-q", "--quiet", default=False, action="store_true")
    log.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument("-p", "--processes", type=int, default=mp.cpu_count())
    args = parser.parse_args()

    logger = create_default_logger("master")
    verbosity = "INFO"
    if args.verbose is True:
        verbosity = "DEBUG"
    elif args.quiet is True:
        verbosity = "WARNING"

    listener = logging.handlers.QueueListener(logging_queue, logger)
    listener.propagate = False
    listener.start()
    logger.setLevel(verbosity)

    cdnas = dict()
    beds = dict()
    beds["ref"] = dict()
    beds["target"] = dict()

    gmap_pat = re.compile(r"\.mrna[0-9]*$")

    logger.info("Loading reference cDNAS")
    cdnas["ref"] = pyfaidx.Fasta(args.cdnas[0])
    logger.info("Loading target cDNAS")
    cdnas["target"] = pyfaidx.Fasta(args.cdnas[1])
    logger.info("Loaded cDNAs")
    logger.info("Loading reference BED12")
    for entry in Bed12Parser(args.bed12[0], transcriptomic=True):
        if entry.header:
            continue
        name = entry.chrom
        if name in beds["ref"]:
            raise KeyError("Duplicated ID for the reference: {}".format(name))
        if name not in cdnas["ref"]:
            raise KeyError("Reference {} not found in the cDNAs!".format(name))
        beds["ref"][name] = entry

    logger.info("Loading target BED12")
    beds["target"] = defaultdict(dict)
    for entry in Bed12Parser(args.bed12[1], transcriptomic=True):
        # Now, here we have to account for the fact that there *might* be multiple alignments
        name = re.sub(gmap_pat, "", entry.chrom)
        if entry.chrom not in cdnas["target"]:
            raise KeyError("Target {} not found in the cDNAs!".format(
                entry.chrom))
        beds["target"][name][entry.chrom] = entry
    logger.info("Loaded BED12s")

    # Now let us start parsing the GFF3, which we presume being a GMAP GFF3
    transcript = None

    logger.info("Launching sub-processes")
    procs = []
    queue = mp.Queue(-1)
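    # Each Transferer worker writes its results to its own temporary SQLite
    # file; the NamedTemporaryFile is created and closed immediately only to
    # reserve a unique path on disk for that worker.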
    for proc in range(args.processes):
        sq = tempfile.NamedTemporaryFile(mode="wb")
        sq.close()
        sq = sq.name
        _proc = Transferer(sq, queue, verbosity=verbosity)
        _proc.start()
        procs.append(_proc)
    logger.info("Launched sub-processes, starting parsing annotation")

    # pool = mp.Pool(processes=args.processes)

    tnum = -1
    if args.gf.endswith(("bed12", "bed")):
        parser = Bed12Parser(args.gf, transcriptomic=False)
        for line in parser:
            if line.header:
                continue
            else:
                transcript = Transcript(line)
                tid = re.sub(gmap_pat, "", transcript.id)
                logger.debug("Found %s", tid)
                ref_cdna = str(cdnas["ref"][tid])
                ref_bed = beds["ref"][tid]
                target_cdna = str(cdnas["target"][transcript.id])
                target_bed = beds["target"][tid][transcript.id]
                tnum += 1
                logger.debug("Submitting %s", tid)
                queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna,
                                  target_bed)))
            if tnum >= 10**4 and tnum % 10**4 == 0:
                logger.info("Parsed {} transcripts", tnum)
        logger.info("Finished parsing input genomic BED file")
    else:
        parser = to_gff(args.gf)

        for pos, line in enumerate(parser):
            if line.header is True:  # or (not isinstance(line, BED12) and line.is_gene is True):
                if str(line) == "###":
                    continue
                try:
                    print(line, file=args.out)
                except IndexError:
                    raise IndexError(line._line)
                continue
            elif not isinstance(line, BED12) and line.is_gene is True:
                continue
            elif line.is_transcript is True:
                if transcript:
                    if transcript.alias is None:
                        tid = re.sub(gmap_pat, "", transcript.id)
                    else:
                        tid = re.sub(gmap_pat, "", transcript.alias)
                    ref_cdna = str(cdnas["ref"][tid])
                    ref_bed = beds["ref"][tid]
                    target_cdna = str(cdnas["target"][transcript.id])
                    store = beds["target"].get(tid, None)
                    if store is None:
                        raise KeyError((tid, beds["target"].keys()))
                    target_bed = store.get(transcript.id, None)
                    if target_bed is None:
                        raise KeyError((tid, store.keys()))
                    tnum += 1
                    queue.put((tnum, (transcript, ref_cdna, ref_bed,
                                      target_cdna, target_bed)))
                try:
                    transcript = Transcript(line)
                except (ValueError, TypeError):
                    raise ValueError((pos, line))
            elif line.is_exon is True:
                transcript.add_exon(line)
            if tnum >= 10**4 and tnum % 10**4 == 0:
                logger.info("Parsed {} transcripts", tnum)

        if transcript:
            tnum += 1
            tid = re.sub(gmap_pat, "", transcript.id)
            ref_cdna = str(cdnas["ref"][tid])
            ref_bed = beds["ref"][tid]
            target_cdna = str(cdnas["target"][transcript.id])
            target_bed = beds["target"][tid][transcript.id]
            queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna,
                              target_bed)))
        logger.info("Finished parsing input genomic GF file")

    queue.put("EXIT")
    logger.info("Waiting for subprocesses to finish")
    [_proc.join() for _proc in procs]

    # Now the printing ...
    # results = dict()

    logger.info("Subprocesses finished, printing")
    for proc in procs:
        sq = sqlalchemy.create_engine("sqlite:///{}".format(proc.out_sq))
        for res in sq.execute("select * from storer"):
            num, bed12, gff3 = res
            if args.out_bed is not None:
                print(bed12.decode(), file=args.out_bed)
            print(*gff3.decode().split("\n"), file=args.out, sep="\n")
        os.remove(proc.out_sq)

    logger.info("Finished!")
    return