def test_index_various(self):
    logger = create_default_logger("test_index_gff3")
    for fname in ("trinity.gff3", "trinity.gtf", "trinity.bed12",
                  "trinity.cDNA_match.gff3", "trinity.match_matchpart.gff3"):
        with self.subTest(fname=fname), tempfile.TemporaryDirectory() as out_folder:
            shutil.copy(pkg_resources.resource_filename("Mikado.tests", fname), out_folder)
            fhandle = os.path.join(out_folder, fname)
            index_name = "{}.midx".format(fhandle)
            indexing.create_index(parser_factory(fhandle), logger, index_name,
                                  ref_gff=fname.endswith("gff3"))
            self.assertTrue(os.path.exists(index_name))
            self.assertGreater(os.stat(index_name).st_size, 0)
            # Now check that the index has been created correctly
            indexing.check_index(index_name, logger)
            namespace = Namespace(default=False)
            namespace.reference = parser_factory(fhandle)
            _ = indexing.load_index(namespace, logger)
            # Now rebuild, verifying that the stale index gets removed first
            with self.assertLogs(logger, level="INFO") as cmo:
                indexing.create_index(parser_factory(fhandle), logger, index_name,
                                      ref_gff=fname.endswith("gff3"))
            self.assertTrue(
                any(re.search(r"Removing the old index", record.msg) for record in cmo.records),
                cmo.records)
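# A minimal standalone sketch of the indexing round-trip exercised above, assuming
# "annotation.gff3" is any valid GFF3 file on disk (hypothetical path); it uses only
# the calls already shown in test_index_various:
#
#     logger = create_default_logger("index_sketch")
#     index_name = "annotation.gff3.midx"
#     indexing.create_index(parser_factory("annotation.gff3"), logger, index_name, ref_gff=True)
#     indexing.check_index(index_name, logger)  # raises CorruptIndex if the index is unusable
#     namespace = Namespace(default=False)
#     namespace.reference = parser_factory("annotation.gff3")
#     _ = indexing.load_index(namespace, logger)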
def test_asn(self):
    # Currently DISABLED: the ASN specification requires the database to sit where the
    # relative path inside the ASN file points. For the time being this test is
    # therefore *not active*.
    asns = [os.path.join("blast", "asn", "blast.asn.gz"),
            os.path.join("blast_parse_seqids", "asn", "blast.asn.gz")]
    for folder in ["sanitised", "uniprot"]:
        targets = os.path.join(self.master, folder, "uniprot.fasta")
        self.assertTrue(os.path.exists(targets))
        for asn in asns:
            asn = os.path.join(self.master, folder, asn)
            self.assertTrue(os.path.exists(asn))
            with tempfile.TemporaryDirectory() as out_folder:
                # Create a fresh configuration per run; reusing one across iterations
                # would carry over the previous (now deleted) database path.
                conf = MikadoConfiguration()
                conf.serialise.files.output_dir = out_folder
                conf.serialise.files.blast_targets = [targets]
                conf.serialise.files.transcripts = self.queries
                conf.serialise.files.xml = [asn]
                logger = create_default_logger(f"test_asn_{folder}")
                out_db = os.path.join(out_folder, conf.db_settings.db)
                conf.db_settings.db = out_db
                self.run_loading(asn, out_db, logger, conf)
def test_not_existent(self):
    logger = create_default_logger("test_not_existent")
    with tempfile.NamedTemporaryFile() as foo, self.assertRaises(CorruptIndex):
        indexing.check_index(foo.name, logger)
    with tempfile.NamedTemporaryFile() as foo, self.assertRaises(CorruptIndex):
        namespace = Namespace(default=False)
        namespace.reference = open(foo.name)
        indexing.load_index(namespace, logger)
def test_fusion(self):
    t = Transcript()
    t.chrom, t.strand, t.start, t.end, t.id, t.parent = "Chr1", "+", 101, 1000, "foo.1", "foo"
    t.add_exons([(101, 500), (601, 800), (901, 1000)])
    t.finalize()
    t2 = Transcript()
    t2.chrom, t2.strand, t2.start, t2.end, t2.id, t2.parent = "Chr1", "+", 2001, 3000, "bar.1", "bar"
    t2.add_exons([(2001, 2500), (2601, 2800), (2901, 3000)])
    t2.finalize()
    # t3 straddles both reference genes, so it should be called as a fusion
    t3 = Transcript()
    t3.chrom, t3.strand, t3.start, t3.end, t3.id, t3.parent = "Chr1", "+", 651, 2703, "faz.1", "faz"
    t3.add_exons([(651, 800), (901, 1300), (2230, 2500), (2601, 2703)])
    t3.finalize()
    logger = create_default_logger("test_fusion")
    with tempfile.TemporaryDirectory() as folder:
        with open(os.path.join(folder, "reference.gtf"), "wt") as reference:
            print(t.format("gtf"), file=reference)
            print(t2.format("gtf"), file=reference)
        self.assertTrue(os.path.exists(reference.name))
        _ = [_ for _ in parser_factory(reference.name)]
        try:
            indexing.create_index(parser_factory(reference.name), logger,
                                  "{}.midx".format(reference.name))
        except InvalidParsingFormat:
            self.fail("\n".join(line.rstrip() for line in open(reference.name)))
        namespace = Namespace(default=False)
        namespace.out = os.path.join(folder, "out")
        for report in (False, True):
            with self.subTest(report=report):
                namespace.report_fusions = report
                assigner = Assigner("{}.midx".format(reference.name),
                                    args=namespace, printout_tmap=False)
                result = assigner.get_best(t3)
                if report:
                    self.assertEqual(len(result), 2)
                    self.assertEqual(result[0].ccode, ("f", "j"), str(result[0]))
                    self.assertEqual(result[1].ccode, ("f", "j"), str(result[1]))
                else:
                    self.assertEqual(result.ccode, ("j",), str(result))
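# A hedged sketch of driving the Assigner directly against a pre-built ".midx" index,
# mirroring the calls in test_fusion above; "reference.gtf.midx" and `candidate` are
# hypothetical stand-ins for a real index file and a finalised Transcript:
#
#     namespace = Namespace(default=False)
#     namespace.out = "compare_out"
#     namespace.report_fusions = True
#     assigner = Assigner("reference.gtf.midx", args=namespace, printout_tmap=False)
#     result = assigner.get_best(candidate)  # a list of results when a fusion is reported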
def test_daa(self):
    daa_base = os.path.join("diamond", "daa", "blast.daa")
    for folder in ["sanitised", "uniprot"]:
        targets = os.path.join(self.master, folder, "uniprot.fasta")
        self.assertTrue(os.path.exists(targets))
        daa = os.path.join(self.master, folder, daa_base)
        self.assertTrue(os.path.exists(daa))
        with tempfile.TemporaryDirectory() as out_folder:
            conf = MikadoConfiguration()
            out_db = os.path.join(out_folder, conf.db_settings.db)
            conf.db_settings.db = out_db
            conf.serialise.files.output_dir = out_folder
            conf.serialise.files.blast_targets = [targets]
            conf.serialise.files.transcripts = self.queries
            conf.serialise.files.xml = [daa]
            logger = create_default_logger(f"test_daa_{folder}")
            self.assertTrue(tempfile.gettempdir() in out_db)
            self.run_loading(daa, out_db, logger, conf)
def test_xml(self):
    xml_base = os.path.join("xml", "blast.xml.gz")
    for folder in ["sanitised", "uniprot"]:
        for subfolder in ["blast", "blast_parse_seqids", "diamond"]:
            targets = os.path.join(self.master, folder, "uniprot.fasta")
            self.assertTrue(os.path.exists(targets))
            xml = os.path.join(self.master, folder, subfolder, xml_base)
            self.assertTrue(os.path.exists(xml))
            with tempfile.TemporaryDirectory() as out_folder:
                conf = MikadoConfiguration()
                conf.serialise.files.output_dir = out_folder
                out_db = os.path.join(out_folder, conf.db_settings.db)
                conf.db_settings.db = out_db
                conf.serialise.files.blast_targets = [targets]
                conf.serialise.files.transcripts = self.queries
                conf.serialise.files.xml = [xml]
                logger = create_default_logger(f"test_xml_{folder}_{subfolder}")
                self.assertTrue(tempfile.gettempdir() in out_db)
                self.run_loading(xml, out_db, logger, conf)
def test_tsv(self):
    tsv_base = os.path.join("tsv", "blast.tsv.gz")
    for folder in ["uniprot", "sanitised"]:
        for subfolder in ["blast", "blast_parse_seqids", "diamond"]:
            targets = os.path.join(self.master, folder, "uniprot.fasta")
            self.assertTrue(os.path.exists(targets))
            tsv = os.path.join(self.master, folder, subfolder, tsv_base)
            self.assertTrue(os.path.exists(tsv))
            with tempfile.TemporaryDirectory() as out_folder:
                conf = MikadoConfiguration()
                conf.serialise.files.output_dir = out_folder
                conf.serialise.files.blast_targets = [targets]
                conf.serialise.files.transcripts = self.queries
                conf.serialise.files.xml = [tsv]
                out_db = os.path.join(out_folder, conf.db_settings.db)
                conf.db_settings.db = out_db
                self.assertTrue(tempfile.gettempdir() in out_db)
                logger = create_default_logger(f"test_tsv_{folder}_{subfolder}", level="DEBUG")
                self.run_loading(tsv, out_db, logger, conf)
def test_subprocess_multi_empty_orfs(self):
    print("Started light test")
    self.fai = pysam.FastaFile(
        pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))
    self.configuration = configurator.load_and_validate_config(None)
    self.configuration.reference.genome = self.fai.filename.decode()
    xml = pkg_resources.resource_filename("Mikado.tests", "chunk-001-proteins.xml.gz")
    transcripts = pkg_resources.resource_filename("Mikado.tests", "mikado_prepared.fasta")
    junctions = pkg_resources.resource_filename("Mikado.tests", "junctions.bed")
    tmp_orf = tempfile.NamedTemporaryFile(suffix=".bed12")
    tmp_orf.write(b"#track\n")
    tmp_orf.write(
        b"cufflinks_star_at.23553.1\t0\t1733\tID=1_1;partial=01;start_type=ATG\t"
        b"0\t+\t312\t1733\t0,0,0\t1\t1733\t0\n")
    tmp_orf.flush()
    uniprot = pkg_resources.resource_filename("Mikado.tests", "uniprot_sprot_plants.fasta.gz")
    mobjects = 300
    # Let's properly test the serialisation for BLAST.
    # Set up the command arguments
    with tempfile.TemporaryDirectory(prefix="has_to_fail") as folder_one, \
            tempfile.TemporaryDirectory(prefix="has_to_fail") as folder_two:
        for procs, folder in [(3, folder_one), (1, folder_two)]:
            self.configuration = configurator.load_and_validate_config(None)
            self.configuration.reference.genome = self.fai.filename.decode()
            json_file = os.path.join(folder, "mikado.yaml")
            db = os.path.join(folder, "mikado.db")
            log = "failed_serialise.log"
            uni_out = os.path.join(folder, "uniprot_sprot_plants.fasta")
            self.configuration.serialise.files.log = os.path.basename(log)
            self.configuration.multiprocessing_method = "fork"
            with gzip.open(uniprot, "rb") as uni, open(uni_out, "wb") as uni_out_handle:
                uni_out_handle.write(uni.read())
            self.configuration.serialise.files.transcripts = transcripts
            self.configuration.serialise.files.blast_targets = uni_out
            self.configuration.serialise.files.log = log
            self.configuration.serialise.files.output_dir = folder
            self.configuration.serialise.force = True
            self.configuration.serialise.files.orfs = [tmp_orf.name]
            self.configuration.serialise.files.junctions = [junctions]
            self.configuration.serialise.files.xml = [xml]
            self.configuration.threads = procs
            self.configuration.serialise.max_objects = mobjects
            self.configuration.db_settings.db = db
            self.configuration.seed = 1078
            self.assertFalse(os.path.exists(db))
            logger = create_default_logger(f"test_light_serialise_{procs}", level="INFO")
            with self.assertRaises(InvalidSerialization), self.assertLogs(logger.name) as cmo:
                load_orfs(self.configuration, logger)
            self.assertTrue(
                any("Mikado serialise failed due to problems with the input data. "
                    "Please check the logs." in line for line in cmo.output),
                cmo.output)
            self.assertTrue(
                any("The provided ORFs do not match the transcripts provided and "
                    "already present in the database." in line for line in cmo.output),
                "\n".join(cmo.output))
    print("Finished light test")