class TestConverter(unittest.TestCase):
    '''Tests for the format-conversion methods of ``Converter``.

    The GFF3 parser, the TSSpredator reader, the rnt/ptt title printer and
    the file reader of the converter are replaced by mock objects in
    ``setUp``. Every test writes its fixtures into ``self.test_folder``,
    which is removed again in ``tearDown``.
    '''

    def setUp(self):
        '''Create the converter with mocked collaborators and load fixtures.'''
        self.converter = Converter()
        self.example = Example()
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if (not os.path.exists(self.test_folder)):
            os.mkdir(self.test_folder)

    def tearDown(self):
        '''Remove the scratch folder created in ``setUp``.'''
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_print_rntptt_file(self):
        '''ptt (CDS) and rnt (tRNA) tables match the expected lines.'''
        cdss = []
        genes = []
        rnas = []
        gff_dict = Example().gff_dict
        for gff in gff_dict:
            # The mocked parser is a class, so the test case itself is
            # handed over as the ``self`` argument of ``entries``.
            if gff["feature"] == "gene":
                genes.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "CDS":
                cdss.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "tRNA":
                rnas.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        # The output ends with a newline, hence the dropped last element.
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        '''sRNA gff input is converted to the expected rnt/ptt style table.'''
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file,
                                    srnas, 1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        '''Position parsing of embl lines, including multi-part positions.'''
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            # "Wrong" is the marker returned for lines without a position.
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])

    def test_parser_embl_data(self):
        '''Parsing an embl file yields the expected gff lines and info.'''
        embl_file = os.path.join(self.test_folder, "test.embl")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index],
                                 self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        '''Counters are updated as expected for a two-entry master table.'''
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [],
                        "utr_lengths": []}
        tss_index = defaultdict(int)
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(tss, tss_index, tss_features,
                                            nums, utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        '''A TSSpredator master table is converted to the expected gff.'''
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic",
                                               "TSS", "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        '''Smoke test: gff -> rnt/ptt conversion runs without raising.'''
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(gff_file, fasta_file, ptt_file,
                                          rnt_file, srna_input_file,
                                          srna_output_file)
        # NOTE(review): these assert on non-empty path strings and can never
        # fail. They were probably meant to be os.path.exists(...) checks;
        # confirm that the mocked converter writes the files before
        # tightening them.
        self.assertTrue(srna_output_file)
        self.assertTrue(rnt_file)
        self.assertTrue(ptt_file)

    def test_convert_embl2gff(self):
        '''An embl file is converted to the expected gff lines.'''
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        # The first line and the last two lines of the output are not part
        # of the compared feature lines.
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        '''TransTermHP output is converted to the expected terminator gff.'''
        transterm_file = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    @staticmethod
    def get_info(datas):
        '''Return the first eight tab-separated columns of every data line.

        Lines starting with "#" are skipped. Declared as a static method so
        that it can be called both as ``TestConverter.get_info(...)`` and as
        ``self.get_info(...)``.
        '''
        return ["\t".join(data.split("\t")[:8])
                for data in datas if not data.startswith("#")]

    def test_convert_circ2gff(self):
        '''circRNA table is converted to the expected "all"/"best" gffs.

        Only the first eight gff columns are compared; the attribute column
        is ignored.
        '''
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        self.assertListEqual(
            self.get_info(import_data(out_all)),
            self.get_info(self.circ_all.split("\n")))
        self.assertListEqual(
            self.get_info(import_data(out_filter)),
            self.get_info(self.circ_best.split("\n")))
class CircRNADetection(object):
    '''Detection of circRNA

    Drives the whole workflow: optional read alignment with segemehl,
    SAM/BAM handling with samtools, splice-site detection with
    testrealign.x, and finally statistics plus gff generation.
    '''

    # Substrings that mark a read file name as already carrying a
    # sequence-file extension.
    _READ_MARKERS = (".fa", ".fasta", ".fna", ".fq", ".fastq")
    # File name endings of reads that can be handed to segemehl directly.
    _READ_SUFFIXES = (".fa", ".fna", ".fasta", ".fq", ".fastq")

    def __init__(self, args_circ):
        '''Derive all working paths from ``args_circ``.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"file": "splicesites.bed",
                        "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _uncompress_read(self, read, suffix, tool, log):
        '''Uncompress ``read`` with ``tool`` and return the new file name.

        ``suffix`` is removed from the name. When the remaining name carries
        no sequence-file marker, ".fa" is appended.
        '''
        mod_read = read.replace(suffix, "")
        if not any(marker in mod_read for marker in self._READ_MARKERS):
            mod_read = mod_read + ".fa"
        print(" ".join(["Uncompressing", read]))
        log.write(" ".join([tool, read]) + "\n")
        # The context manager closes the output even if the tool cannot be
        # started.
        with open(mod_read, "w") as read_out:
            call([tool, read], stdout=read_out)
        log.write("\t" + mod_read + " is generated.\n")
        return mod_read

    def _deal_zip_file(self, read_files, log):
        '''Uncompress all ".bz2" and ".gz" read files.

        Returns one dict per sample with the keys "sample", "files" (all
        read files including the uncompressed ones) and "zips" (only the
        files created here, so that they can be removed afterwards).
        '''
        tmp_reads = []
        for reads in read_files:
            zips = []
            # NOTE: this is an alias, not a copy. The uncompressed file
            # names are appended to the caller's list as well, and the loop
            # below also visits the names appended while it runs. Later
            # steps see the uncompressed names through ``read_files``, so
            # the aliasing is kept on purpose.
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = self._uncompress_read(
                        read, ".bz2", "bzcat", log)
                elif read.endswith(".gz"):
                    mod_read = self._uncompress_read(
                        read, ".gz", "zcat", log)
                else:
                    continue
                tmp_datas.append(mod_read)
                zips.append(mod_read)
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        '''Build the segemehl index of ``fasta`` and log the command.'''
        command = [segemehl_path, "-x", os.path.join(fasta_path, index),
                   "-d", os.path.join(fasta_path, fasta)]
        log.write(" ".join(command) + "\n")
        call(command)

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        '''Start one segemehl alignment and return the ``Popen`` object.

        stdout of segemehl goes to ``sam_file`` and stderr to ``log_file``,
        both inside the alignment folder of ``fasta_prefix``. The command
        line is written to the main ``log`` and, as before, to the first
        line of ``log_file``.
        '''
        sub_path = os.path.join(self.alignment_path, fasta_prefix)
        command = [args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"]
        # The original rebound ``log`` to the stderr file, so the main log
        # never recorded the command.
        log.write(" ".join(command) + "\n")
        with open(os.path.join(sub_path, sam_file), "w") as out, \
                open(os.path.join(sub_path, log_file), "w") as err_log:
            err_log.write(" ".join(command) + "\n")
            # Flush so that the command precedes the output of the child.
            err_log.flush()
            # The child gets its own copies of both descriptors, so the
            # parent can close its handles right after the start.
            p = Popen(command, stdout=out, stderr=err_log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided,
        it can be skipped.

        Returns ``(align_files, prefixs)``: the "<read>_<fasta>" names of
        all started alignments and the prefixes of all processed fasta
        files.
        '''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write(
            "Please make sure the version of segemehl is at least 0.1.9.\n")
        # NOTE(review): every entry of the folder is indexed, not only the
        # "*.fa" files -- presumably the folder holds nothing else; confirm.
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index,
                                           fasta, log)
            processes = []
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    read_name = read.split("/")[-1]
                    if not read_name.endswith(self._READ_SUFFIXES):
                        # Compressed originals are skipped; they must not
                        # count toward the number of running processes.
                        continue
                    read_prefix = ".".join(read_name.split(".")[:-1])
                    sam_file = "_".join(
                        [read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join(
                        [read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join(
                        [read_prefix, fasta_prefix]))
                    print("Mapping {0}".format(sam_file))
                    processes.append(self._run_segemehl_align(
                        args_circ, index, fasta, read, sam_file,
                        log_file, fasta_prefix, log))
                    if len(processes) == args_circ.cores:
                        self._wait_process(processes)
                        # Start a fresh batch; finished processes must not
                        # be waited for (and slept on) again.
                        processes = []
            self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(
                    os.path.join(self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam,
                                  out_bam, log):
        '''Convert ``pre_sam`` to ``out_bam`` and log the command.'''
        command = [samtools_path, "view", "-bS", pre_sam, "-o", out_bam]
        log.write(" ".join(command) + "\n")
        call(command)

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        '''Convert every SAM file of ``sub_alignment_path`` to BAM.

        ".log" files of the folder are deleted. Returns
        ``(bam_files, convert_ones, remove_ones)``: all BAM files of the
        folder, the BAM files converted from SAM files that are not listed
        in ``align_files``, and the SAM files that are listed there.
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write(
            "Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) +
                          "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        '''Merge, sort and convert the BAM files of every sample.

        For each entry of ``bam_datas`` the files are merged (or copied if
        there is only one), sorted into "<prefix>_<sample>_sort.bam" and
        converted to the SAM file of the same name. The unsorted merge
        result is removed.
        '''
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(
                out_folder, "_".join([prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                # An argument list instead of a shell string, so that paths
                # containing spaces or shell metacharacters keep working.
                merge_command = [samtools_path, "merge",
                                 sample_bam] + list(bam_data["files"])
                log.write(" ".join(merge_command) + "\n")
                call(merge_command)
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(
                out_folder,
                "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            sort_command = [samtools_path, "sort", "-o",
                            sort_sample, sample_bam]
            log.write(" ".join(sort_command) + "\n")
            call(sort_command)
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sam = sort_sample.replace(".bam", ".sam")
            view_command = [samtools_path, "view", "-h", "-o",
                            sort_sam, sort_sample]
            log.write(" ".join(view_command) + "\n")
            call(view_command)
            log.write("Done!\n")
            log.write("\t" + sort_sam + " is generated.\n")

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path, out_folder,
            convert_ones, tmp_reads, remove_ones, prefix, log):
        '''Collect the BAM files per sample, then merge and sort them.

        The BAM file names of the reads are derived from the read file
        names and ``prefix``. Afterwards the files listed in
        ``convert_ones`` and ``remove_ones`` are deleted.
        '''
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        # Drop the compression suffix first.
                        read = ".".join(
                            read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        # NOTE(review): unlike the branch above, compressed
                        # read names are not stripped of ".gz"/".bz2" here
                        # -- confirm whether that is intended.
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''Run testrealign.x on every "*sort.sam" file of ``out_folder``.

        The splice-site and trans-realigned bed files are written to the
        splice folder of ``prefix``. stderr of each run goes to
        "<prefix>.log" in that folder; the file is overwritten by every
        run. All ".sam" files of ``out_folder`` are removed at the end.
        '''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write(
            "Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write(
            "Please make sure your testrealign.x exists. If it does not "
            "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = [
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(
                        sub_splice_path,
                        sample_prefix + "_transrealigned.bed")]
                log.write(" ".join(command) + " 2>" + err_log + "\n")
                # Same redirection as the logged shell form, but without a
                # shell, so unusual characters in paths are harmless.
                with open(err_log, "w") as err_fh:
                    call(command, stderr=err_fh)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis

        For every fasta file of ``fastas`` the bed files of all its
        sequence headers are copied to "<output_folder>/<fasta prefix>" and
        merged per sample. Returns ``(samples, fa_prefixs)``; ``samples``
        holds the samples of the last processed fasta file and is empty
        when no fasta file was found.
        '''
        fa_prefixs = []
        # Bound up front so that the return below cannot fail when the
        # folder contains no fasta file.
        samples = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(
                            os.path.join(splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                # What is left after removing the header
                                # and the file suffix identifies the
                                # sample.
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(
                        bed_folder,
                        "".join([fasta_prefix + sample +
                                 self.splices["file"]]))
                    out_trans = os.path.join(
                        bed_folder,
                        "".join([fasta_prefix + sample +
                                 self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_),
                                out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_),
                                out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write(
            "Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(
                os.path.join(self.gff_folder, prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            for bed in os.listdir(
                    os.path.join(args_circ.output_folder, prefix)):
                # Only the merged bed files are kept, not the "tmp_" copies.
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file,
                               os.path.join(self.gff_path, prefix + ".gff"),
                               csv_all, args_circ, stat_file)
                self.converter.convert_circ2gff(
                    csv_all, args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                # NOTE(review): csv_best is only logged here; presumably
                # detect_circrna writes it next to csv_all -- confirm.
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        '''Parse "<sample>:<file>,<file>,..." strings.

        Returns a list of dicts with the keys "sample" and "files". The
        program exits when an entry does not have exactly one ":" or when
        one of the files does not exist.
        '''
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            files = datas[-1].split(",")
            for file_ in files:
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0], "files": files})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        '''Add the BAM files that belong to the reads to ``bam_datas``.

        When ``bam_datas`` is None, a new list with one entry per sample of
        ``read_datas`` is built and returned instead.
        '''
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(
                        os.path.join(self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        '''Remove the temporary folders and BAM files of the run.'''
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            # Only BAM files were given: no alignment, the prefixes come
            # from the fasta files.
            align_files = None
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                if fasta.endswith(".fa"):
                    prefixs.append(fasta.replace(".fa", ""))
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                # The list of all BAM files is not needed here.
                _, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path,
                    align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder, convert_ones, tmp_reads,
                remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # Remove the read files that were uncompressed for this run.
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
class CircRNADetection(object): def __init__(self, args_circ): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.alignment_path = os.path.join(args_circ.output_folder, "segemehl_align") self.splice_path = os.path.join(args_circ.output_folder, "segemehl_splice") self.candidate_path = os.path.join(args_circ.output_folder, "circRNA_tables") self.gff_folder = os.path.join(args_circ.output_folder, "gffs") self.gff_path = os.path.join(args_circ.gffs, "tmp") self.splices = { "all_file": "splicesites_all.bed", "file": "splicesites.bed", "all": "splicesites_all", "splice": "splicesites" } self.trans = { "all_file": "transrealigned_all.bed", "file": "transrealigned.bed", "all": "transrealigned_all", "trans": "transrealigned" } self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"} if args_circ.align: if args_circ.fastas is None: print("Error: There is no genome fasta file!!!") sys.exit() else: self.fasta_path = os.path.join(args_circ.fastas, "tmp") else: self.fasta_path = os.path.join(args_circ.fastas, "tmp") def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _deal_zip_file(self, read_folder): tmp_reads = [] for read in os.listdir(read_folder): if read.endswith(".bz2"): mod_read = read.replace(".bz2", "") if (".fa" not in mod_read) and (".fasta" not in mod_read) and ( ".fna" not in mod_read): mod_read = mod_read + ".fa" read_out = open(os.path.join(read_folder, mod_read), "w") tmp_reads.append(os.path.join(read_folder, mod_read)) print(" ".join(["unzip", read])) call(["bzcat", os.path.join(read_folder, read)], stdout=read_out) read_out.close() elif read.endswith(".gz"): mod_read = read.replace(".gz", "") if (".fa" not in mod_read) and (".fasta" not in mod_read) and ( ".fna" not in mod_read): mod_read = mod_read + ".fa" read_out = open(os.path.join(read_folder, 
mod_read), "w") tmp_reads.append(os.path.join(read_folder, mod_read)) print(" ".join(["unzip", read])) call(["zcat", os.path.join(read_folder, read)], stdout=read_out) read_out.close() return tmp_reads def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index, fasta): call([ os.path.join(segemehl_path, "segemehl.x"), "-x", os.path.join(fasta_path, index), "-d", os.path.join(fasta_path, fasta) ]) def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file, log_file, fasta_prefix): out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file), "w") log = open(os.path.join(self.alignment_path, fasta_prefix, log_file), "w") p = Popen([ os.path.join(args_circ.segemehl_path, "segemehl.x"), "-i", os.path.join(self.fasta_path, index), "-d", os.path.join(self.fasta_path, fasta), "-q", os.path.join(args_circ.read_folder, read), "-S" ], stdout=out, stderr=log) return p def _align(self, args_circ): prefixs = [] align_files = [] for fasta in os.listdir(self.fasta_path): index = fasta.replace(".fa", ".idx") self._run_segemehl_fasta_index(args_circ.segemehl_path, self.fasta_path, index, fasta) processes = [] num_process = 0 fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) self.helper.check_make_folder( os.path.join(self.alignment_path, fasta_prefix)) for read in os.listdir(args_circ.read_folder): num_process += 1 if read.endswith(".fa") or \ read.endswith(".fna") or \ read.endswith("fasta"): filename = read.split(".") read_prefix = ".".join(filename[:-1]) sam_file = "_".join([read_prefix, fasta_prefix + ".sam"]) log_file = "_".join([read_prefix, fasta_prefix + ".log"]) align_files.append("_".join([read_prefix, fasta_prefix])) print("mapping {0}".format(sam_file)) p = self._run_segemehl_align(args_circ, index, fasta, read, sam_file, log_file, fasta_prefix) processes.append(p) if num_process == args_circ.cores: self._wait_process(processes) num_process = 0 self._wait_process(processes) return align_files, prefixs def 
_run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam): call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files): bam_files = [] convert_ones = [] remove_ones = [] for sam in os.listdir(sub_alignment_path): pre_sam = os.path.join(sub_alignment_path, sam) if sam.endswith(".sam"): bam_file = sam.replace(".sam", ".bam") print("Convert {0} to {1}".format(sam, bam_file)) out_bam = os.path.join(sub_alignment_path, bam_file) self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam) bam_files.append(out_bam) if align_files: if bam_file.replace(".bam", "") not in align_files: convert_ones.append(out_bam) else: remove_ones.append(pre_sam) elif sam.endswith(".bam"): if (pre_sam not in convert_ones) and (pre_sam not in remove_ones): bam_files.append(pre_sam) elif sam.endswith(".log"): os.remove(pre_sam) return bam_files, convert_ones, remove_ones def _run_samtools_merge_sort(self, samtools_path, sub_alignment_path, bam_files): print("Merge all bam files....") whole_bam = os.path.join(sub_alignment_path, self.bams["whole"]) if len(bam_files) <= 1: shutil.copyfile(bam_files[0], whole_bam) else: file_line = " ".join(bam_files) os.system(" ".join([samtools_path, "merge", whole_bam, file_line])) print("Sort bam files....") call([ samtools_path, "sort", "-o", os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"), whole_bam ]) os.remove(os.path.join(sub_alignment_path, self.bams["whole"])) def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path): print("Convert whole reads bam file to sam file....") call([ samtools_path, "view", "-h", "-o", os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"), os.path.join(sub_alignment_path, self.bams["sort"] + ".bam") ]) def _merge_sort_aligment_file(self, bam_files, samtools_path, sub_alignment_path, convert_ones, tmp_reads, remove_ones): self._run_samtools_merge_sort(samtools_path, sub_alignment_path, bam_files) 
self._run_samtools_convert_sam(samtools_path, sub_alignment_path) for bam in convert_ones: os.remove(bam) for sam in remove_ones: os.remove(sam) if len(tmp_reads) != 0: for read in tmp_reads: os.remove(read) def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path): self.helper.check_make_folder(os.path.join(self.splice_path, prefix)) sub_splice_path = os.path.join(self.splice_path, prefix) err_log = os.path.join(sub_splice_path, prefix + ".log") print("Running testrealign.x for {0}".format(prefix)) command = " ".join([ os.path.join(segemehl_path, "testrealign.x"), "-d", os.path.join(self.fasta_path, prefix + ".fa"), "-q", os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"), "-n" ]) os.system(command + " 2>" + err_log) self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"]) self.helper.remove_all_content(sub_alignment_path, self.bams["sort"], "file") def _merge_bed(self, fastas, splice_path): tmp_prefixs = [] for fasta in os.listdir(fastas): headers = [] if (fasta.endswith(".fa") or fasta.endswith(".fna") or fasta.endswith(".fasta")): with open(os.path.join(fastas, fasta), "r") as f_h: for line in f_h: line = line.strip() if line.startswith(">"): headers.append(line[1:]) filename = fasta.split(".") fasta_prefix = ".".join(filename[:-1]) tmp_prefixs.append(fasta_prefix) self.helper.check_make_folder( os.path.join(os.getcwd(), fasta_prefix)) for header in headers: shutil.copyfile( os.path.join(splice_path, header, self.splices["file"]), os.path.join( fasta_prefix, "_".join([self.splices["splice"], header + ".bed"]))) shutil.copyfile( os.path.join(splice_path, header, self.trans["file"]), os.path.join( fasta_prefix, "_".join([self.trans["trans"], header + ".bed"]))) out_splice = os.path.join(fasta_prefix, self.splices["all_file"]) out_trans = os.path.join(fasta_prefix, self.trans["all_file"]) if len(headers) > 1: for file_ in os.listdir(fasta_prefix): if (self.splices["splice"] in file_) and (self.splices["all"] not in file_): 
self.helper.merge_file( os.path.join(fasta_prefix, file_), out_splice) elif (self.trans["trans"] in file_) and (self.trans["all"] not in file_): self.helper.merge_file( os.path.join(fasta_prefix, file_), out_trans) else: shutil.move( os.path.join( fasta_prefix, "_".join( [self.splices["splice"], headers[0] + ".bed"])), out_splice) shutil.move( os.path.join( fasta_prefix, "_".join( [self.trans["trans"], headers[0] + ".bed"])), out_trans) self.helper.remove_all_content(splice_path, None, "dir") return tmp_prefixs def _stat_and_gen_gff(self, tmp_prefixs, args_circ): for prefix in tmp_prefixs: self.helper.check_make_folder(os.path.join(self.gff_folder, prefix)) shutil.copytree(prefix, os.path.join(self.splice_path, prefix)) self.helper.check_make_folder( os.path.join(self.candidate_path, prefix)) print("comparing with annotation of {0}".format(prefix)) if self.splices["all_file"] in os.listdir( os.path.join(self.splice_path, prefix)): detect_circrna( os.path.join(self.splice_path, prefix, self.splices["all_file"]), os.path.join(self.gff_path, prefix + ".gff"), os.path.join(self.candidate_path, prefix, "_".join(["circRNA", prefix + "_all.csv"])), args_circ, os.path.join(args_circ.stat_folder, "_".join(["stat_circRNA", prefix + ".csv"]))) self.converter.convert_circ2gff( os.path.join(self.candidate_path, prefix, "_".join(["circRNA", prefix + "_all.csv"])), args_circ, os.path.join(self.gff_folder, prefix, "_".join([prefix, "circRNA_all.gff"])), os.path.join(self.gff_folder, prefix, "_".join([prefix, "circRNA_best.gff"]))) def _assign_merge_bam(self, args_circ): remove_frags = [] bam_files = [] if (args_circ.normal_bams is not None) and (args_circ.frag_bams is not None): for frag in os.listdir(args_circ.frag_bams): if frag.endswith(".bam"): shutil.copyfile(os.path.join(args_circ.frag_bams, frag), os.path.join(args_circ.normal_bams, frag)) remove_frags.append(frag) merge_folder = args_circ.normal_bams elif (args_circ.normal_bams is not None): merge_folder = 
args_circ.normal_bams elif (args_circ.frag_bams is not None): merge_folder = args_circ.frag_bams else: print("Error: please assign bam folder or do alignment!!") sys.exit() for bam in os.listdir(merge_folder): if bam.endswith(".bam"): bam_files.append(os.path.join(merge_folder, bam)) return merge_folder, remove_frags, bam_files def run_circrna(self, args_circ): for gff in os.listdir(args_circ.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes( os.path.join(args_circ.gffs, gff)) if args_circ.segemehl_path is None: print("Error: please assign segemehl folder!!") sys.exit() self.multiparser.parser_gff(args_circ.gffs, None) self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta", None) tmp_reads = [] if args_circ.align: self.multiparser.parser_fasta(args_circ.fastas) tmp_reads = self._deal_zip_file(args_circ.read_folder) align_files, prefixs = self._align(args_circ) else: self.multiparser.parser_fasta(args_circ.fastas) prefixs = [] for fasta in os.listdir(self.fasta_path): fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) merge_folder, remove_frag, bam_files = self._assign_merge_bam( args_circ) align_files = None for prefix in prefixs: if args_circ.align: sub_alignment_path = os.path.join(self.alignment_path, prefix) bam_files, convert_ones, remove_ones = self._convert_sam2bam( sub_alignment_path, args_circ.samtools_path, align_files) else: sub_alignment_path = merge_folder convert_ones = [] remove_ones = [] self._merge_sort_aligment_file(bam_files, args_circ.samtools_path, sub_alignment_path, convert_ones, tmp_reads, remove_ones) self._run_testrealign(prefix, args_circ.segemehl_path, sub_alignment_path) tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path) self.multiparser.parser_gff(args_circ.gffs, None) self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta", None) self._stat_and_gen_gff(tmp_prefixs, args_circ) self.helper.remove_tmp(args_circ.fastas) self.helper.remove_tmp(args_circ.gffs) for 
tmp_prefix in tmp_prefixs: shutil.rmtree(tmp_prefix) if (not args_circ.align) and (len(remove_frag) != 0): for frag in remove_frag: os.remove(os.path.join(merge_folder, frag))
class TestConverter(unittest.TestCase):
    '''Unit tests for Converter, run against mocked parsers and the
    fixture strings provided by Example.'''

    def setUp(self):
        '''Build a Converter with mocked collaborators and create the
        scratch folder that the tests write into.'''
        self.converter = Converter()
        self.example = Example()
        # Replace parsers/readers by the mocks of this test module.
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        # Shortcuts to the fixture data.
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if not os.path.exists(self.test_folder):
            os.mkdir(self.test_folder)

    def tearDown(self):
        '''Remove the scratch folder together with everything in it.'''
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    @staticmethod
    def get_info(datas):
        '''Return the first eight tab-separated columns of every line
        in datas that does not start with "#".'''
        return ["\t".join(data.split("\t")[:8])
                for data in datas if not data.startswith("#")]

    def test_print_rntptt_file(self):
        cdss = []
        genes = []
        rnas = []
        for gff in Example().gff_dict:
            # The mock parser is used as a class, so the test case itself
            # is handed over as its "self" argument.
            if gff["feature"] == "gene":
                genes.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "CDS":
                cdss.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "tRNA":
                rnas.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        # The last element of the split is the empty string that follows
        # the final newline.
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file,
                                    srnas, 1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            # "Wrong" is the value returned for lines that are skipped.
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])

    def test_parser_embl_data(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index],
                                 self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [],
                        "utr_lengths": []}
        tss_index = defaultdict(lambda: 0)
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(
                tss, tss_index, tss_features, nums, utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic",
                                               "TSS", "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(
            gff_file, fasta_file, ptt_file, rnt_file,
            srna_input_file, srna_output_file)
        # Check that the outputs were really written; asserting on the
        # path strings themselves can never fail.
        self.assertTrue(os.path.exists(srna_output_file))
        self.assertTrue(os.path.exists(rnt_file))
        self.assertTrue(os.path.exists(ptt_file))

    def test_convert_embl2gff(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        transterm_file = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    def test_convert_circ2gff(self):
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        # Only the first eight columns of the non-comment lines are
        # compared for both output files.
        self.assertListEqual(self.get_info(import_data(out_all)),
                             self.get_info(self.circ_all.split("\n")))
        self.assertListEqual(self.get_info(import_data(out_filter)),
                             self.get_info(self.circ_best.split("\n")))
class CircRNADetection(object):
    '''Detection of circRNA.

    Reads are optionally aligned with segemehl.x, the BAM files are
    merged and sorted with Samtools, testrealign.x is run on the sorted
    SAM file and the resulting splice-site BED files are handed to
    detect_circrna and to the converter, which write tables and GFF
    files.
    '''

    def __init__(self, args_circ):
        '''Store the helper objects, output paths and file names.

        Exits with an error message when no genome fasta folder is
        given, because the fasta path is needed in every mode.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all",
                        "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all",
                      "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam",
                     "sort": "whole_reads_sort"}
        # Both modes build the same path, so the check is done for both
        # instead of failing later inside os.path.join.
        if args_circ.fastas is None:
            print("Error: There is no genome fasta file!!!")
            sys.exit()
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''Wait for every process, close its pipes and make sure it is
        not running anymore.'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        '''Decompress all .bz2 and .gz files of read_folder.

        The decompressed file is written next to the archive; ".fa" is
        appended when the remaining name carries no fasta suffix.
        Returns the paths of the files that were generated.
        '''
        tools = ((".bz2", "bzcat"), (".gz", "zcat"))
        tmp_reads = []
        for read in os.listdir(read_folder):
            for suffix, tool in tools:
                if not read.endswith(suffix):
                    continue
                mod_read = read.replace(suffix, "")
                if all(ext not in mod_read
                       for ext in (".fa", ".fasta", ".fna")):
                    mod_read = mod_read + ".fa"
                out_path = os.path.join(read_folder, mod_read)
                tmp_reads.append(out_path)
                print(" ".join(["unzip", read]))
                # The context manager closes the output even if the
                # external tool cannot be started.
                with open(out_path, "w") as read_out:
                    call([tool, os.path.join(read_folder, read)],
                         stdout=read_out)
                break
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        '''Run segemehl.x to generate the index of one fasta file.'''
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        '''Start segemehl.x for one read file and return the Popen
        object; stdout goes to sam_file and stderr to log_file.'''
        out_folder = os.path.join(self.alignment_path, fasta_prefix)
        # The child process gets its own copies of the descriptors, so
        # the handles of the parent can be closed right after the start.
        with open(os.path.join(out_folder, sam_file), "w") as out, \
                open(os.path.join(out_folder, log_file), "w") as log:
            p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                       "-i", os.path.join(self.fasta_path, index),
                       "-d", os.path.join(self.fasta_path, fasta),
                       "-q", os.path.join(args_circ.read_folder, read),
                       "-S"],
                      stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        '''Align all read files against every genome of self.fasta_path.

        At most args_circ.cores alignments are started before waiting.
        Returns (align_files, prefixs): the "<read>_<genome>" names of
        the alignments and the genome prefixes.
        '''
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                if not read.endswith((".fa", ".fna", ".fasta")):
                    continue
                read_prefix = ".".join(read.split(".")[:-1])
                align_name = "_".join([read_prefix, fasta_prefix])
                sam_file = align_name + ".sam"
                log_file = align_name + ".log"
                align_files.append(align_name)
                print("mapping {0}".format(sam_file))
                p = self._run_segemehl_align(
                    args_circ, index, fasta, read, sam_file,
                    log_file, fasta_prefix)
                processes.append(p)
                # Only started alignments are counted; counting every
                # directory entry could step over the limit unnoticed
                # and disable the throttling.
                num_process += 1
                if num_process >= args_circ.cores:
                    self._wait_process(processes)
                    processes = []
                    num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        '''Convert one SAM file to BAM with "samtools view -bS".'''
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files):
        '''Convert the SAM files of a folder and collect its BAM files.

        Log files of the folder are deleted. Returns (bam_files,
        convert_ones, remove_ones): all BAM files for merging, the BAM
        files converted from SAM files that are not listed in
        align_files, and the SAM files that are listed in align_files.
        The last two lists are deleted later by
        _merge_sort_aligment_file.
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam)
                # A BAM of the same name may already be listed.
                if out_bam not in bam_files:
                    bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones) and (
                        pre_sam not in bam_files):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        '''Merge bam_files into one BAM file and sort it.

        A single BAM file is copied instead of merged. The unsorted
        merged file is removed afterwards. Exits with an error message
        when bam_files is empty.
        '''
        if not bam_files:
            print("Error: There is no BAM file for merging!!")
            sys.exit()
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) == 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            # An argument list keeps paths with blanks intact.
            call([samtools_path, "merge", whole_bam] + list(bam_files))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
              whole_bam])
        os.remove(whole_bam)

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        '''Convert the sorted BAM file to SAM including the header.'''
        print("Convert whole reads bam file to sam file....")
        sort_prefix = os.path.join(sub_alignment_path, self.bams["sort"])
        call([samtools_path, "view", "-h", "-o",
              sort_prefix + ".sam", sort_prefix + ".bam"])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        '''Merge, sort and convert the BAM files, then delete the
        intermediate files listed in convert_ones, remove_ones and
        tmp_reads.'''
        self._run_samtools_merge_sort(samtools_path, sub_alignment_path,
                                      bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        # run_circrna passes the same tmp_reads for every genome, so the
        # files are already gone from the second call on.
        for read in tmp_reads:
            if os.path.exists(read):
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        '''Run testrealign.x on the sorted SAM file of one genome.

        stderr is written to "<prefix>.log" in the splice folder of the
        genome. Afterwards the helper is asked to move the ".bed" files
        of the working directory into that folder and to remove the
        sorted alignment files.
        '''
        sub_splice_path = os.path.join(self.splice_path, prefix)
        self.helper.check_make_folder(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        with open(err_log, "w") as err:
            call([os.path.join(segemehl_path, "testrealign.x"),
                  "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                  "-q", os.path.join(sub_alignment_path,
                                     self.bams["sort"] + ".sam"),
                  "-n"],
                 stderr=err)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        '''Collect the BED files of all sequences of each fasta file.

        For every fasta file a folder named after the file prefix is
        created in the working directory. The per-sequence BED files are
        copied into it and combined into the "*_all.bed" files. Returns
        the list of the prefixes.
        '''
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            headers = []
            with open(os.path.join(fastas, fasta), "r") as f_h:
                for line in f_h:
                    line = line.strip()
                    if line.startswith(">"):
                        headers.append(line[1:])
            fasta_prefix = ".".join(fasta.split(".")[:-1])
            tmp_prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                os.getcwd(), fasta_prefix))
            for header in headers:
                shutil.copyfile(
                    os.path.join(splice_path, header, self.splices["file"]),
                    os.path.join(fasta_prefix, "_".join(
                        [self.splices["splice"], header + ".bed"])))
                shutil.copyfile(
                    os.path.join(splice_path, header, self.trans["file"]),
                    os.path.join(fasta_prefix, "_".join(
                        [self.trans["trans"], header + ".bed"])))
            out_splice = os.path.join(fasta_prefix, self.splices["all_file"])
            out_trans = os.path.join(fasta_prefix, self.trans["all_file"])
            if len(headers) > 1:
                for file_ in os.listdir(fasta_prefix):
                    if (self.splices["splice"] in file_) and (
                            self.splices["all"] not in file_):
                        self.helper.merge_file(
                            os.path.join(fasta_prefix, file_), out_splice)
                    elif (self.trans["trans"] in file_) and (
                            self.trans["all"] not in file_):
                        self.helper.merge_file(
                            os.path.join(fasta_prefix, file_), out_trans)
            elif headers:
                # Only one sequence: its files become the "all" files.
                shutil.move(
                    os.path.join(fasta_prefix, "_".join(
                        [self.splices["splice"], headers[0] + ".bed"])),
                    out_splice)
                shutil.move(
                    os.path.join(fasta_prefix, "_".join(
                        [self.trans["trans"], headers[0] + ".bed"])),
                    out_trans)
            # A fasta file without any header has nothing to merge.
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        '''Run detect_circrna for every prefix and convert the
        resulting table to gff files.'''
        for prefix in tmp_prefixs:
            sub_splice_path = os.path.join(self.splice_path, prefix)
            sub_candidate_path = os.path.join(self.candidate_path, prefix)
            sub_gff_folder = os.path.join(self.gff_folder, prefix)
            self.helper.check_make_folder(sub_gff_folder)
            shutil.copytree(prefix, sub_splice_path)
            self.helper.check_make_folder(sub_candidate_path)
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] not in os.listdir(sub_splice_path):
                continue
            csv_all = os.path.join(
                sub_candidate_path,
                "_".join(["circRNA", prefix + "_all.csv"]))
            detect_circrna(
                os.path.join(sub_splice_path, self.splices["all_file"]),
                os.path.join(self.gff_path, prefix + ".gff"),
                csv_all, args_circ,
                os.path.join(args_circ.stat_folder,
                             "_".join(["stat_circRNA", prefix + ".csv"])))
            self.converter.convert_circ2gff(
                csv_all, args_circ,
                os.path.join(sub_gff_folder,
                             "_".join([prefix, "circRNA_all.gff"])),
                os.path.join(sub_gff_folder,
                             "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        '''Collect the BAM files of the assigned BAM folders.

        Returns (merge_folder, remove_frags, bam_files). merge_folder
        is the normal BAM folder if it is assigned, otherwise the
        fragmented one. The BAM files of both folders are listed with
        their own paths. Nothing is copied, because a copy would
        overwrite a normal BAM file of the same name, which would be
        deleted at the end of run_circrna. remove_frags is therefore
        always empty. Exits when no folder is assigned.
        '''
        folders = [folder for folder in (args_circ.normal_bams,
                                         args_circ.frag_bams)
                   if folder is not None]
        if not folders:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        merge_folder = folders[0]
        remove_frags = []
        bam_files = []
        for folder in folders:
            for bam in os.listdir(folder):
                if bam.endswith(".bam"):
                    bam_files.append(os.path.join(folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        '''Run the whole circRNA detection.'''
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self.multiparser.parser_fasta(args_circ.fastas)
        tmp_reads = []
        if args_circ.align:
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            prefixs = [fasta.replace(".fa", "")
                       for fasta in os.listdir(self.fasta_path)]
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path,
                    align_files)
            else:
                # The assigned BAM files are used as they are, so there
                # is nothing to convert or to clean up.
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class CircRNADetection(object): '''Detection of circRNA''' def __init__(self, args_circ): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.alignment_path = os.path.join(args_circ.output_folder, "segemehl_alignment_files") self.splice_path = os.path.join(args_circ.output_folder, "segemehl_splice_results") self.candidate_path = os.path.join(args_circ.output_folder, "circRNA_tables") self.gff_folder = os.path.join(args_circ.output_folder, "gffs") self.gff_path = os.path.join(args_circ.gffs, "tmp") self.splices = {"file": "splicesites.bed", "splice": "splicesites"} self.trans = {"file": "transrealigned.bed", "trans": "transrealigned"} self.fasta_path = os.path.join(args_circ.fastas, "tmp") def _wait_process(self, processes): '''wait for the parallels to finish the process''' for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _deal_zip_file(self, read_files, log): tmp_datas = [] tmp_reads = [] for reads in read_files: zips = [] tmp_datas = reads["files"] for read in reads["files"]: if read.endswith(".bz2"): mod_read = read.replace(".bz2", "") if (".fa" not in mod_read) and ( ".fasta" not in mod_read) and ( ".fna" not in mod_read) and ( ".fq" not in mod_read) and ( ".fastq" not in mod_read): mod_read = mod_read + ".fa" read_out = open(mod_read, "w") tmp_datas.append(mod_read) zips.append(mod_read) print(" ".join(["Uncompressing", read])) log.write(" ".join(["bzcat", read]) + "\n") call(["bzcat", read], stdout=read_out) log.write("\t" + mod_read + " is generated.\n") read_out.close() elif read.endswith(".gz"): mod_read = read.replace(".gz", "") if (".fa" not in mod_read) and ( ".fasta" not in mod_read) and ( ".fna" not in mod_read) and ( ".fq" not in mod_read) and ( ".fastq" not in mod_read): mod_read = mod_read + ".fa" read_out = open(mod_read, "w") tmp_datas.append(mod_read) zips.append(mod_read) print(" 
".join(["Uncompressing", read])) log.write(" ".join(["zcat", read]) + "\n") call(["zcat", read], stdout=read_out) read_out.close() log.write("\t" + mod_read + " is generated.\n") tmp_reads.append({"sample": reads["sample"], "files": tmp_datas, "zips": zips}) return tmp_reads def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index, fasta, log): log.write(" ".join([segemehl_path, "-x", os.path.join(fasta_path, index), "-d", os.path.join(fasta_path, fasta)]) + "\n") call([segemehl_path, "-x", os.path.join(fasta_path, index), "-d", os.path.join(fasta_path, fasta)]) def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file, log_file, fasta_prefix, log): out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file), "w") log = open(os.path.join(self.alignment_path, fasta_prefix, log_file), "w") log.write(" ".join([args_circ.segemehl_path, "-i", os.path.join(self.fasta_path, index), "-d", os.path.join(self.fasta_path, fasta), "-q", read, "-S"]) + "\n") p = Popen([args_circ.segemehl_path, "-i", os.path.join(self.fasta_path, index), "-d", os.path.join(self.fasta_path, fasta), "-q", read, "-S"], stdout=out, stderr=log) return p def _align(self, args_circ, read_datas, log): '''align the read. 
if the bam files are provided, it can be skipped.''' prefixs = [] align_files = [] log.write("Using segemehl to align the read.\n") log.write("Please make sure the version of segemehl is at least 0.1.9.\n") for fasta in os.listdir(self.fasta_path): index = fasta.replace(".fa", ".idx") self._run_segemehl_fasta_index(args_circ.segemehl_path, self.fasta_path, index, fasta, log) processes = [] num_process = 0 fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) self.helper.check_make_folder(os.path.join( self.alignment_path, fasta_prefix)) log.write("Running for {0}.\n".format(fasta_prefix)) for reads in read_datas: for read in reads["files"]: num_process += 1 read_name = read.split("/")[-1] if read_name.endswith(".fa") or \ read_name.endswith(".fna") or \ read_name.endswith(".fasta") or \ read_name.endswith(".fq") or \ read_name.endswith(".fastq"): filename = read_name.split(".") read_prefix = ".".join(filename[:-1]) sam_file = "_".join([read_prefix, fasta_prefix + ".sam"]) log_file = "_".join([read_prefix, fasta_prefix + ".log"]) align_files.append("_".join([read_prefix, fasta_prefix])) print("Mapping {0}".format(sam_file)) p = self._run_segemehl_align( args_circ, index, fasta, read, sam_file, log_file, fasta_prefix, log) processes.append(p) if num_process == args_circ.cores: self._wait_process(processes) num_process = 0 self._wait_process(processes) log.write("Done!\n") log.write("The following files are generated in {0}:\n".format( os.path.join(self.alignment_path, fasta_prefix))) for file_ in os.listdir(os.path.join( self.alignment_path, fasta_prefix)): log.write("\t" + file_ + "\n") return align_files, prefixs def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log): log.write(" ".join([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) + "\n") call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files, log): bam_files = [] convert_ones = [] 
remove_ones = [] log.write("Using Samtools to convert SAM files to BAM files.\n") log.write("Please make sure the version of Samtools is at least 1.3.1.\n") for sam in os.listdir(sub_alignment_path): pre_sam = os.path.join(sub_alignment_path, sam) if sam.endswith(".sam"): bam_file = sam.replace(".sam", ".bam") print("Converting {0} to {1}".format(sam, bam_file)) out_bam = os.path.join(sub_alignment_path, bam_file) self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam, log) bam_files.append(out_bam) if align_files: if bam_file.replace(".bam", "") not in align_files: convert_ones.append(out_bam) else: remove_ones.append(pre_sam) elif sam.endswith(".bam"): if (pre_sam not in convert_ones) and ( pre_sam not in remove_ones): bam_files.append(pre_sam) elif sam.endswith(".log"): os.remove(pre_sam) log.write("Done!\n") log.write("The following files are generated:\n") for file_ in os.listdir(sub_alignment_path): if file_.endswith(".bam"): log.write("\t" + os.path.join(sub_alignment_path, file_) + "\n") return bam_files, convert_ones, remove_ones def _run_samtools_merge_sort(self, samtools_path, prefix, out_folder, bam_datas, log): log.write("Using Samtools for merging, sorting and converting " "the BAM files.\n") log.write("Make sure the version Samtools is at least 1.3.1.\n") for bam_data in bam_datas: print("Merging bam files for {0} of {1}".format( prefix, bam_data["sample"])) sample_bam = os.path.join(out_folder, "_".join([ prefix, bam_data["sample"] + ".bam"])) if len(bam_data["files"]) <= 1: shutil.copyfile(bam_data["files"][0], sample_bam) else: file_line = " ".join(bam_data["files"]) log.write(" ".join([samtools_path, "merge", sample_bam, file_line]) + "\n") os.system(" ".join([samtools_path, "merge", sample_bam, file_line])) print("Sorting bam files for {0} of {1}".format( prefix, bam_data["sample"])) sort_sample = os.path.join(out_folder, "_".join([prefix, bam_data["sample"] + "_sort.bam"])) log.write(" ".join([samtools_path, "sort", "-o", sort_sample, 
sample_bam]) + "\n") call([samtools_path, "sort", "-o", sort_sample, sample_bam]) os.remove(sample_bam) print("Converting bam files to sam files for {0} of {1}".format( prefix, bam_data["sample"])) log.write(" ".join([samtools_path, "view", "-h", "-o", sort_sample.replace(".bam", ".sam"), sort_sample]) + "\n") call([samtools_path, "view", "-h", "-o", sort_sample.replace(".bam", ".sam"), sort_sample]) log.write("Done!\n") log.write("\t" + sort_sample.replace(".bam", ".sam") + " is generated.\n") def _merge_sort_aligment_file( self, bam_datas, read_datas, samtools_path, out_folder, convert_ones, tmp_reads, remove_ones, prefix, log): if bam_datas is None: merge_bam_datas = [] for read_data in read_datas: bam_files = [] for read in read_data["files"]: if read.endswith(".gz") or read.endswith(".bz2"): read = ".".join( read.split("/")[-1].split(".")[:-1]) read_prefix = ".".join( read.split("/")[-1].split(".")[:-1]) bam_files.append(os.path.join( self.alignment_path, prefix, "_".join([read_prefix, prefix + ".bam"]))) merge_bam_datas.append({"sample": read_data["sample"], "files": bam_files}) elif (bam_datas is not None) and (read_datas is not None): merge_bam_datas = copy.deepcopy(bam_datas) for bam_data in merge_bam_datas: for read_data in read_datas: if bam_data["sample"] == read_data["sample"]: for read in read_data["files"]: read_prefix = ".".join( read.split("/")[-1].split(".")[:-1]) bam = os.path.join( self.alignment_path, prefix, "_".join([read_prefix, prefix + ".bam"])) if (bam not in bam_data["files"]): bam_data["files"].append(bam) else: merge_bam_datas = copy.deepcopy(bam_datas) self._run_samtools_merge_sort(samtools_path, prefix, out_folder, merge_bam_datas, log) for bam in convert_ones: os.remove(bam) for sam in remove_ones: os.remove(sam) def _run_testrealign(self, prefix, testrealign_path, out_folder, log): log.write("Using Segemehl to detect circular RNAs.\n") log.write("Please make sure the version of Segemehl is at least 0.1.9.\n") log.write("Please make 
sure your testrealign.x exists. If it does not " "exists, please reinstall your Segemehl via using make all.\n") sub_splice_path = os.path.join(self.splice_path, prefix) if not os.path.exists(sub_splice_path): os.mkdir(sub_splice_path) err_log = os.path.join(sub_splice_path, prefix + ".log") print("Running testrealign.x for {0}".format(prefix)) for sam_file in os.listdir(out_folder): if sam_file.endswith("sort.sam"): sample_prefix = sam_file.replace("_sort.sam", "") command = " ".join([ testrealign_path, "-d", os.path.join(self.fasta_path, prefix + ".fa"), "-q", os.path.join(out_folder, sam_file), "-n", "-U", os.path.join(sub_splice_path, sample_prefix + "_splicesites.bed"), "-T", os.path.join(sub_splice_path, sample_prefix + "_transrealigned.bed")]) log.write(command + " 2>" + err_log + "\n") os.system(command + " 2>" + err_log) log.write("Done!\n") log.write("The following files are generated:\n") for file_ in os.listdir(sub_splice_path): log.write("\t" + os.path.join(sub_splice_path, file_) + "\n") self.helper.remove_all_content(out_folder, ".sam", "file") def _merge_bed(self, fastas, splice_path, output_folder): '''Merge the bed files for analysis''' fa_prefixs = [] for fasta in os.listdir(fastas): headers = [] if (fasta.endswith(".fa") or fasta.endswith(".fna") or fasta.endswith(".fasta")): with open(os.path.join(fastas, fasta), "r") as f_h: for line in f_h: line = line.strip() if line.startswith(">"): headers.append(line[1:]) filename = fasta.split(".") fasta_prefix = ".".join(filename[:-1]) fa_prefixs.append(fasta_prefix) bed_folder = os.path.join( output_folder, fasta_prefix) self.helper.check_make_folder(bed_folder) samples = [] for header in headers: for splice in os.listdir(os.path.join( splice_path, header)): if splice.endswith(".bed"): if self.splices["file"] in splice: sample = splice.replace(header, "") sample = sample.replace( self.splices["file"], "") if sample not in samples: samples.append(sample) shutil.copyfile( os.path.join( splice_path, 
header, splice), os.path.join( bed_folder, "tmp_" + splice)) for sample in samples: out_splice = os.path.join(bed_folder, "".join([ fasta_prefix + sample + self.splices["file"]])) out_trans = os.path.join(bed_folder, "".join([ fasta_prefix + sample + self.trans["file"]])) if os.path.exists(out_splice): os.remove(out_splice) if os.path.exists(out_trans): os.remove(out_trans) for file_ in os.listdir(bed_folder): if (self.splices["splice"] in file_) and ( sample in file_): self.helper.merge_file(os.path.join( bed_folder, file_), out_splice) elif (self.trans["trans"] in file_) and ( sample in file_): self.helper.merge_file(os.path.join( bed_folder, file_), out_trans) self.helper.remove_all_content(splice_path, None, "dir") return samples, fa_prefixs def _stat_and_gen_gff(self, prefixs, samples, args_circ, log): '''do statistics and print the result to gff file''' log.write("Running circRNA.py to do statistics and generate gff files.\n") log.write("The following files are generated:\n") for prefix in prefixs: self.helper.check_make_folder(os.path.join(self.gff_folder, prefix)) self.helper.check_make_folder(os.path.join(self.splice_path, prefix)) for bed in os.listdir(os.path.join( args_circ.output_folder, prefix)): if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")): shutil.copy( os.path.join(args_circ.output_folder, prefix, bed), os.path.join(self.splice_path, prefix)) self.helper.check_make_folder(os.path.join( self.candidate_path, prefix)) print("Comparing circular RNAs with annotations of {0}".format( prefix)) for sample in samples: splice_file = os.path.join( self.splice_path, prefix, "".join([prefix, sample, self.splices["file"]])) stat_file = os.path.join(args_circ.stat_folder, "".join(["stat_", prefix, sample, "circRNA.csv"])) csv_all = os.path.join(self.candidate_path, prefix, "".join([prefix, sample, "circRNA_all.csv"])) csv_best = os.path.join(self.candidate_path, prefix, "".join([prefix, sample, "circRNA_best.csv"])) gff_all = 
os.path.join(self.gff_folder, prefix, "".join([prefix, sample, "circRNA_all.gff"])) gff_best = os.path.join(self.gff_folder, prefix, "".join([prefix, sample, "circRNA_best.gff"])) detect_circrna(splice_file, os.path.join( self.gff_path, prefix + ".gff"), csv_all, args_circ, stat_file) self.converter.convert_circ2gff( os.path.join(self.candidate_path, prefix, "".join([prefix, sample, "circRNA_all.csv"])), args_circ, gff_all, gff_best) log.write("\t" + stat_file + "\n") log.write("\t" + csv_all + "\n") log.write("\t" + csv_best + "\n") log.write("\t" + gff_all + "\n") log.write("\t" + gff_best + "\n") def _extract_input_files(self, inputs): input_datas = [] for input_ in inputs: datas = input_.split(":") if len(datas) != 2: print("Error: the format of --bam_files or " "--read_files is wrong!") sys.exit() for file_ in datas[-1].split(","): if not os.path.exists(file_): print("Error: some files in --bam_files or " "--read_files do not exist!") sys.exit() input_datas.append({"sample": datas[0], "files": datas[-1].split(",")}) return input_datas def _combine_read_bam(self, bam_files, bam_datas, read_datas): if bam_datas is not None: for bam_data in bam_datas: for read_data in read_datas: if bam_data["sample"] == read_data["sample"]: for read in read_data["files"]: prefix = ".".join( read.split("/")[-1].split(".")[:-1]) bam = os.path.join(self.alignment_path, prefix + ".bam") if (bam in bam_files) and ( bam not in bam_data["files"]): bam_data["files"].append(bam) else: bam_datas = [] for read_data in read_datas: bam_files = [] for read in read_data["files"]: prefix = ".".join( read.split("/")[-1].split(".")[:-1]) bam_files.append(os.path.join( self.alignment_path, prefix + ".bam")) bam_datas.append({"sample": read_data["sample"], "files": bam_files}) return bam_datas def _remove_tmp_files(self, args_circ, fa_prefixs): self.helper.remove_tmp_dir(args_circ.fastas) self.helper.remove_tmp_dir(args_circ.gffs) self.helper.remove_all_content(args_circ.output_folder, ".bam", 
"file") for prefix in fa_prefixs: shutil.rmtree(os.path.join(args_circ.output_folder, prefix)) def run_circrna(self, args_circ, log): '''detection of circRNA''' bam_datas = None read_datas = None if (args_circ.bams is None) and (args_circ.read_files is None): log.write("--bam_files and --read_files can not be both emtpy.\n") print("Error: --bam_files or --read_files should be assigned.") sys.exit() if args_circ.bams is not None: bam_datas = self._extract_input_files(args_circ.bams) if args_circ.read_files is not None: read_datas = self._extract_input_files(args_circ.read_files) for gff in os.listdir(args_circ.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_circ.gffs, gff)) if args_circ.segemehl_path is None: log.write("segemehl does not exists.\n") print("Error: please assign segemehl path!!") sys.exit() self.multiparser.parser_fasta(args_circ.fastas) self.multiparser.parser_gff(args_circ.gffs, None) self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta", None) tmp_reads = [] if args_circ.read_files: log.write("Raw read files are found.\n") tmp_reads = self._deal_zip_file(read_datas, log) align_files, prefixs = self._align(args_circ, tmp_reads, log) else: align_files = None prefixs = [] for fasta in os.listdir(self.fasta_path): if fasta.endswith(".fa"): fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) for prefix in prefixs: if args_circ.read_files: sub_alignment_path = os.path.join(self.alignment_path, prefix) bam_files, convert_ones, remove_ones = self._convert_sam2bam( sub_alignment_path, args_circ.samtools_path, align_files, log) else: convert_ones = [] remove_ones = [] self._merge_sort_aligment_file( bam_datas, read_datas, args_circ.samtools_path, args_circ.output_folder, convert_ones, tmp_reads, remove_ones, prefix, log) self._run_testrealign(prefix, args_circ.testrealign_path, args_circ.output_folder, log) samples, fa_prefixs = self._merge_bed( args_circ.fastas, self.splice_path, 
args_circ.output_folder) self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log) if len(tmp_reads) != 0: for reads in tmp_reads: for read in reads["zips"]: os.remove(read) self._remove_tmp_files(args_circ, fa_prefixs)