예제 #1
0
class TestConverter(unittest.TestCase):
    def setUp(self):
        '''Create the converter under test, its stand-ins and the fixtures.'''
        self.converter = Converter()
        self.example = Example()
        # Replace the converter's collaborators with the Mock_* stand-ins.
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        # Fixtures stored on the test case under the name Example uses.
        same_name_fixtures = (
            "gff_file", "ptt_out", "rnt_out", "srna_out", "embl_file",
            "embl_out", "multi_embl", "gff_out", "mastertable", "tss_file",
            "fasta_file", "transterm", "term_file",
        )
        for fixture in same_name_fixtures:
            setattr(self, fixture, getattr(self.example, fixture))
        # Fixtures stored under a different name than the one on Example.
        renamed_fixtures = (
            ("circ_file", "circrna_table"),
            ("circ_all", "circrna_all"),
            ("circ_best", "circrna_best"),
        )
        for target, source in renamed_fixtures:
            setattr(self, target, getattr(self.example, source))
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if not os.path.exists(self.test_folder):
            os.mkdir(self.test_folder)

    def tearDown(self):
        '''Delete the scratch folder if it is still on disk.'''
        if not os.path.exists(self.test_folder):
            return
        shutil.rmtree(self.test_folder)

    def test_print_rntptt_file(self):
        '''Check the rows _print_rntptt_file writes for CDS and tRNA input.'''
        by_feature = {"gene": [], "CDS": [], "tRNA": []}
        for record in Example().gff_dict:
            bucket = by_feature.get(record["feature"])
            if bucket is not None:
                # gff3parser was assigned without being instantiated in
                # setUp, so the test case is handed over in the first slot.
                bucket.append(
                    self.converter.gff3parser.entries(self, record))
        ptt_buffer = StringIO()
        rnt_buffer = StringIO()
        self.converter._print_rntptt_file(
            ptt_buffer, by_feature["CDS"], by_feature["gene"])
        self.converter._print_rntptt_file(
            rnt_buffer, by_feature["tRNA"], by_feature["gene"])
        # The output ends with a newline; drop the empty trailing element.
        ptt_lines = ptt_buffer.getvalue().split("\n")[:-1]
        self.assertEqual(ptt_lines, self.example.ptt_out_list)
        rnt_lines = rnt_buffer.getvalue().split("\n")[:-1]
        self.assertEqual(rnt_lines, self.example.rnt_out_list)
        ptt_buffer.close()
        rnt_buffer.close()

    def test_srna2pttrnt(self):
        '''Check the table _srna2rntptt writes for the GFF fixture.'''
        in_path, out_path = (
            os.path.join(self.test_folder, name)
            for name in ("srna.gff", "srna.out"))
        with open(in_path, "w") as handle:
            handle.write(self.gff_file)
        collected = []
        self.converter._srna2rntptt(in_path, out_path, collected, 1234567)
        produced = import_data(out_path)
        expected = self.srna_out.split("\n")
        # Compared as sets: line order and duplicates are not checked.
        self.assertEqual(set(produced), set(expected))

    def test_multi_embl_pos(self):
        '''Check _multi_embl_pos line by line against the EMBL fixture.'''
        candidates = (self.converter._multi_embl_pos(raw.strip())
                      for raw in self.embl_file.split("\n"))
        # "Wrong" is the value skipped here as not being a parsed entry.
        parsed = [entry for entry in candidates if entry != "Wrong"]
        for position in range(7):
            self.assertDictEqual(parsed[position], self.embl_out[position])
        last_positions = parsed[-1]["pos"]
        for position in range(2):
            self.assertDictEqual(last_positions[position],
                                 self.multi_embl[position])

    def test_parser_embl_data(self):
        '''Check the GFF text and the info returned by _parser_embl_data.'''
        source_path = os.path.join(self.test_folder, "test.embl")
        with open(source_path, "w") as handle:
            handle.writelines(
                line + "\n" for line in self.embl_file.split("\n"))
        buffer = StringIO()
        info = self.converter._parser_embl_data(source_path, buffer)
        rows = buffer.getvalue().split("\n")
        expected_rows = self.gff_out.split("\n")
        # The last element of the split output is dropped before comparing.
        self.assertEqual(set(rows[:-1]), set(expected_rows))
        self.assertEqual(info[0], "NC_007795.1")
        positions = info[1]["pos"]
        for position in range(2):
            self.assertDictEqual(positions[position],
                                 self.multi_embl[position])
        buffer.close()

    def test_multi_tss_class(self):
        '''Check the counters _multi_tss_class leaves after the fixture.'''
        counters = {"tss": 0, "tss_uni": 0, "class": 1}
        utr_groups = {"total": [], "pri": [], "sec": []}
        features = {"tss_types": [], "locus_tags": [], "utr_lengths": []}
        index_by_tss = defaultdict(int)
        table = StringIO(self.mastertable)
        for entry in self.converter.tsspredator.entries(table):
            self.converter._multi_tss_class(
                entry, index_by_tss, features, counters, utr_groups)
        table.close()
        self.assertDictEqual(counters,
                             {"tss": 2, "tss_uni": 0, "class": 5})

    def test_convert_mastertable2gff(self):
        '''Check the GFF written by convert_mastertable2gff.'''
        table_path = os.path.join(self.test_folder, "test.tsv")
        gff_path = os.path.join(self.test_folder, "test.tsv_out")
        with open(table_path, "w") as handle:
            handle.write(self.mastertable)
        self.converter.convert_mastertable2gff(
            table_path, "ANNOgesic", "TSS", "aaa", gff_path)
        produced = set(import_data(gff_path))
        expected = set(self.tss_file.split("\n"))
        self.assertEqual(produced, expected)

    def test_convert_gff2rntptt(self):
        '''Check that convert_gff2rntptt produces its three output files.

        The GFF fixture is used for both the annotation and the sRNA input.
        '''
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        fixtures = (
            (srna_input_file, self.gff_file),
            (gff_file, self.gff_file),
            (fasta_file, self.fasta_file),
        )
        for path, content in fixtures:
            with open(path, "w") as fh:
                fh.write(content)
        self.converter.convert_gff2rntptt(gff_file, fasta_file, ptt_file,
                                          rnt_file, srna_input_file,
                                          srna_output_file)
        # Asserting on the path strings themselves can never fail (any
        # non-empty string is truthy), so check the files on disk instead.
        for produced in (srna_output_file, rnt_file, ptt_file):
            self.assertTrue(os.path.exists(produced),
                            produced + " was not created")

    def test_convert_embl2gff(self):
        '''Check the GFF written by convert_embl2gff for the EMBL fixture.'''
        source_path = os.path.join(self.test_folder, "test.embl")
        gff_path = os.path.join(self.test_folder, "test.embl_out")
        with open(source_path, "w") as handle:
            handle.writelines(
                line + "\n" for line in self.embl_file.split("\n"))
        self.converter.convert_embl2gff(source_path, gff_path)
        produced = import_data(gff_path)
        # The first line and the last two lines are left out of the check.
        compared = produced[1:-2]
        self.assertEqual(set(compared), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        '''Check the GFF written by convert_transtermhp2gff.'''
        source_path = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_path = os.path.join(self.test_folder, "transterm.gff")
        with open(source_path, "w") as handle:
            handle.write(self.transterm)
        self.converter.convert_transtermhp2gff(source_path, gff_path)
        produced = set(import_data(gff_path))
        expected = set(self.term_file.split("\n"))
        self.assertEqual(produced, expected)

    def get_info(datas):
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        return f_datas

    def test_convert_circ2gff(self):
        '''Check both GFF files written by convert_circ2gff.'''
        def core_columns(lines):
            # First eight tab-separated columns of every non-comment line.
            return ["\t".join(line.split("\t")[:8])
                    for line in lines
                    if not line.startswith("#")]

        table_path = os.path.join(self.test_folder, "circ.csv")
        all_gff = os.path.join(self.test_folder, "all.gff")
        best_gff = os.path.join(self.test_folder, "best.gff")
        with open(table_path, "w") as handle:
            handle.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(table_path, args, all_gff, best_gff)
        comparisons = ((all_gff, self.circ_all), (best_gff, self.circ_best))
        for produced_path, expected_text in comparisons:
            produced = core_columns(import_data(produced_path))
            expected = core_columns(expected_text.split("\n"))
            self.assertListEqual(produced, expected)
예제 #2
0
class CircRNADetection(object):
    '''Detection of circRNA'''
    def __init__(self, args_circ):
        '''Create the helper objects and derive the working paths.

        Reads ``output_folder``, ``gffs`` and ``fastas`` from args_circ.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        out_root = args_circ.output_folder
        self.alignment_path = os.path.join(out_root,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(out_root, "segemehl_splice_results")
        self.candidate_path = os.path.join(out_root, "circRNA_tables")
        self.gff_folder = os.path.join(out_root, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # File names and the name fragments used to recognise those files.
        self.splices = dict(file="splicesites.bed", splice="splicesites")
        self.trans = dict(file="transrealigned.bed", trans="transrealigned")
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta"
                            not in mod_read) and (".fna" not in mod_read) and (
                                ".fq" not in mod_read) and (".fastq"
                                                            not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta"
                            not in mod_read) and (".fna" not in mod_read) and (
                                ".fq" not in mod_read) and (".fastq"
                                                            not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({
                "sample": reads["sample"],
                "files": tmp_datas,
                "zips": zips
            })
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index,
                                  fasta, log):
        log.write(" ".join([
            segemehl_path, "-x",
            os.path.join(fasta_path, index), "-d",
            os.path.join(fasta_path, fasta)
        ]) + "\n")
        call([
            segemehl_path, "-x",
            os.path.join(fasta_path, index), "-d",
            os.path.join(fasta_path, fasta)
        ])

    def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file,
                            log_file, fasta_prefix, log):
        out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file),
                   "w")
        log = open(os.path.join(self.alignment_path, fasta_prefix, log_file),
                   "w")
        log.write(" ".join([
            args_circ.segemehl_path, "-i",
            os.path.join(self.fasta_path, index), "-d",
            os.path.join(self.fasta_path, fasta), "-q", read, "-S"
        ]) + "\n")
        p = Popen([
            args_circ.segemehl_path, "-i",
            os.path.join(self.fasta_path, index), "-d",
            os.path.join(self.fasta_path, fasta), "-q", read, "-S"
        ],
                  stdout=out,
                  stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write(
            "Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join(
                            [read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join(
                            [read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join(
                            [read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(args_circ, index, fasta,
                                                     read, sam_file, log_file,
                                                     fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
                self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(
                    os.path.join(self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log):
        log.write(
            " ".join([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) +
            "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files,
                         log):
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write(
            "Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam,
                                               log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (pre_sam
                                                      not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) +
                          "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix, out_folder,
                                 bam_datas, log):
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(
                out_folder, "_".join([prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(
                    " ".join([samtools_path, "merge", sample_bam, file_line]) +
                    "\n")
                os.system(" ".join(
                    [samtools_path, "merge", sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(
                out_folder,
                "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join(
                [samtools_path, "sort", "-o", sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([
                samtools_path, "view", "-h", "-o",
                sort_sample.replace(".bam", ".sam"), sort_sample
            ]) + "\n")
            call([
                samtools_path, "view", "-h", "-o",
                sort_sample.replace(".bam", ".sam"), sort_sample
            ])
        log.write("Done!\n")
        log.write("\t" + sort_sample.replace(".bam", ".sam") +
                  " is generated.\n")

    def _merge_sort_aligment_file(self, bam_datas, read_datas, samtools_path,
                                  out_folder, convert_ones, tmp_reads,
                                  remove_ones, prefix, log):
        '''Collect the BAM files of every sample, then merge and sort them.

        Without bam_datas the list is derived from the read file names.
        With both inputs, a deep copy of bam_datas is extended with the
        BAM of each read file of the same sample. With bam_datas only,
        the deep copy is used unchanged. Afterwards the files listed in
        convert_ones and remove_ones are deleted. tmp_reads is not used.
        '''
        def strip_ext(name):
            # Base name of the path without its last extension.
            return ".".join(name.split("/")[-1].split(".")[:-1])

        def aligned_bam(read_prefix):
            return os.path.join(self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))

        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                sample_bams = []
                for read in read_data["files"]:
                    if read.endswith((".gz", ".bz2")):
                        # Drop the compression suffix first.
                        read = strip_ext(read)
                    sample_bams.append(aligned_bam(strip_ext(read)))
                merge_bam_datas.append({
                    "sample": read_data["sample"],
                    "files": sample_bams
                })
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
            if read_datas is not None:
                for bam_data in merge_bam_datas:
                    known = bam_data["files"]
                    for read_data in read_datas:
                        if bam_data["sample"] != read_data["sample"]:
                            continue
                        for read in read_data["files"]:
                            candidate = aligned_bam(strip_ext(read))
                            if candidate not in known:
                                known.append(candidate)
        self._run_samtools_merge_sort(samtools_path, prefix, out_folder,
                                      merge_bam_datas, log)
        for leftover in convert_ones:
            os.remove(leftover)
        for leftover in remove_ones:
            os.remove(leftover)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''Run testrealign.x on every "*sort.sam" file in out_folder.

        The results are written to "<splice_path>/<prefix>" as
        "<sample>_splicesites.bed" and "<sample>_transrealigned.bed";
        stderr of each run goes to "<prefix>.log" in that folder, which is
        overwritten by every run. Finally the ".sam" files in out_folder
        are removed through the helper.
        '''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write(
            "Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write(
            "Please make sure your testrealign.x exists. If it does not "
            "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if not sam_file.endswith("sort.sam"):
                continue
            sample_prefix = sam_file.replace("_sort.sam", "")
            command = [
                testrealign_path, "-d",
                os.path.join(self.fasta_path, prefix + ".fa"), "-q",
                os.path.join(out_folder, sam_file), "-n", "-U",
                os.path.join(sub_splice_path,
                             sample_prefix + "_splicesites.bed"), "-T",
                os.path.join(sub_splice_path,
                             sample_prefix + "_transrealigned.bed")
            ]
            log.write(" ".join(command) + " 2>" + err_log + "\n")
            # Run without a shell: the argument list keeps paths with
            # spaces or shell characters intact, and opening the file in
            # "w" mode mirrors the former "2>" redirection.
            with open(err_log, "w") as err_handle:
                call(command, stderr=err_handle)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis

        For every fasta file in ``fastas`` (.fa/.fna/.fasta) the bed files
        found under "<splice_path>/<header>" of each sequence header are
        copied to "<output_folder>/<fasta prefix>" with a "tmp_" prefix
        and merged per sample. The content of splice_path is removed
        afterwards.

        Returns (samples, fa_prefixs). samples holds the sample fragments
        of the last fasta file that was processed, or an empty list when
        there was none.
        '''
        fa_prefixs = []
        # Bound before the loop so the return below also works when no
        # fasta file is found; it used to fail with an unbound name then.
        samples = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            headers = []
            with open(os.path.join(fastas, fasta), "r") as f_h:
                for line in f_h:
                    line = line.strip()
                    if line.startswith(">"):
                        headers.append(line[1:])
            fasta_prefix = ".".join(fasta.split(".")[:-1])
            fa_prefixs.append(fasta_prefix)
            bed_folder = os.path.join(output_folder, fasta_prefix)
            self.helper.check_make_folder(bed_folder)
            samples = []
            for header in headers:
                header_folder = os.path.join(splice_path, header)
                for splice in os.listdir(header_folder):
                    if not splice.endswith(".bed"):
                        continue
                    if self.splices["file"] in splice:
                        # What is left of the file name once the header
                        # and the fixed ending are removed.
                        sample = splice.replace(header, "")
                        sample = sample.replace(self.splices["file"], "")
                        if sample not in samples:
                            samples.append(sample)
                    shutil.copyfile(
                        os.path.join(header_folder, splice),
                        os.path.join(bed_folder, "tmp_" + splice))
            for sample in samples:
                out_splice = os.path.join(
                    bed_folder, fasta_prefix + sample + self.splices["file"])
                out_trans = os.path.join(
                    bed_folder, fasta_prefix + sample + self.trans["file"])
                if os.path.exists(out_splice):
                    os.remove(out_splice)
                if os.path.exists(out_trans):
                    os.remove(out_trans)
                for file_ in os.listdir(bed_folder):
                    if (self.splices["splice"] in file_) and (sample
                                                              in file_):
                        self.helper.merge_file(
                            os.path.join(bed_folder, file_), out_splice)
                    elif (self.trans["trans"] in file_) and (sample
                                                             in file_):
                        self.helper.merge_file(
                            os.path.join(bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write(
            "Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            for bed in os.listdir(os.path.join(args_circ.output_folder,
                                               prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file,
                               os.path.join(self.gff_path, prefix + ".gff"),
                               csv_all, args_circ, stat_file)
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample, "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({
                "sample": datas[0],
                "files": datas[-1].split(",")
            })
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(
                        os.path.join(self.alignment_path, prefix + ".bam"))
                bam_datas.append({
                    "sample": read_data["sample"],
                    "files": bam_files
                })
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        '''Clean up the temporary folders, BAM files and per-genome folders.'''
        out_root = args_circ.output_folder
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(out_root, ".bam", "file")
        for genome_folder in (os.path.join(out_root, prefix)
                              for prefix in fa_prefixs):
            shutil.rmtree(genome_folder)

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            if fasta.endswith(".fa"):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files,
                    log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(bam_datas, read_datas,
                                           args_circ.samtools_path,
                                           args_circ.output_folder,
                                           convert_ones, tmp_reads,
                                           remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(args_circ.fastas,
                                              self.splice_path,
                                              args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
예제 #3
0
class CircRNADetection(object):
    def __init__(self, args_circ):
        """Create helper objects and derive working paths and file names.

        Args:
            args_circ: option container; ``output_folder``, ``gffs``,
                ``fastas`` and ``align`` are read here.

        Prints an error and calls ``sys.exit()`` when ``align`` is set but
        ``fastas`` is ``None``.
        """
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        out_root = args_circ.output_folder
        self.alignment_path = os.path.join(out_root, "segemehl_align")
        self.splice_path = os.path.join(out_root, "segemehl_splice")
        self.candidate_path = os.path.join(out_root, "circRNA_tables")
        self.gff_folder = os.path.join(out_root, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # BED file names and the name stems used to recognise them.
        self.splices = dict(all_file="splicesites_all.bed",
                            file="splicesites.bed",
                            all="splicesites_all",
                            splice="splicesites")
        self.trans = dict(all_file="transrealigned_all.bed",
                          file="transrealigned.bed",
                          all="transrealigned_all",
                          trans="transrealigned")
        # Name of the merged BAM file and stem of its sorted counterpart.
        self.bams = dict(whole="whole_reads.bam", sort="whole_reads_sort")
        # A genome fasta folder is mandatory when reads have to be aligned.
        if args_circ.align and args_circ.fastas is None:
            print("Error: There is no genome fasta file!!!")
            sys.exit()
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index,
                                  fasta):
        """Invoke ``segemehl.x -x <index> -d <fasta>``.

        Args:
            segemehl_path: folder that contains ``segemehl.x``.
            fasta_path: folder in which *index* and *fasta* are located.
            index: file name passed with ``-x``.
            fasta: file name passed with ``-d``.
        """
        executable = os.path.join(segemehl_path, "segemehl.x")
        index_file = os.path.join(fasta_path, index)
        genome_file = os.path.join(fasta_path, fasta)
        call([executable, "-x", index_file, "-d", genome_file])

    def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file,
                            log_file, fasta_prefix):
        """Start ``segemehl.x`` for one read file without waiting for it.

        The command is ``segemehl.x -i <index> -d <fasta> -q <read> -S``;
        its stdout is written to *sam_file* and its stderr to *log_file*,
        both inside ``<alignment_path>/<fasta_prefix>``.

        Args:
            args_circ: option container; ``segemehl_path`` and
                ``read_folder`` are read here.
            index: index file name inside ``self.fasta_path``.
            fasta: genome file name inside ``self.fasta_path``.
            read: read file name inside ``args_circ.read_folder``.
            sam_file: name of the file receiving stdout.
            log_file: name of the file receiving stderr.
            fasta_prefix: sub-folder of ``self.alignment_path`` to write to.

        Returns:
            The ``subprocess.Popen`` object of the started process.
        """
        out_dir = os.path.join(self.alignment_path, fasta_prefix)
        command = [
            os.path.join(args_circ.segemehl_path, "segemehl.x"),
            "-i", os.path.join(self.fasta_path, index),
            "-d", os.path.join(self.fasta_path, fasta),
            "-q", os.path.join(args_circ.read_folder, read),
            "-S",
        ]
        # The child receives its own copies of the descriptors when it is
        # spawned, so the parent's handles can be closed right away instead
        # of leaking one pair per alignment.
        with open(os.path.join(out_dir, sam_file), "w") as out, \
                open(os.path.join(out_dir, log_file), "w") as log:
            p = Popen(command, stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(args_circ, index, fasta, read,
                                                 sam_file, log_file,
                                                 fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        """Run ``samtools view -bS <pre_sam> -o <out_bam>``."""
        command = [samtools_path, "view", "-bS", pre_sam, "-o", out_bam]
        call(command)

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (pre_sam
                                                      not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, sub_alignment_path,
                                 bam_files):
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge", whole_bam, file_line]))
        print("Sort bam files....")
        call([
            samtools_path, "sort", "-o",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
            whole_bam
        ])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        """Run ``samtools view -h`` to turn the sorted BAM file into SAM.

        Input and output both live in *sub_alignment_path* and are named
        ``self.bams["sort"]`` plus ``.bam`` / ``.sam``.
        """
        print("Convert whole reads bam file to sam file....")
        sorted_stem = os.path.join(sub_alignment_path, self.bams["sort"])
        sam_out = sorted_stem + ".sam"
        bam_in = sorted_stem + ".bam"
        call([samtools_path, "view", "-h", "-o", sam_out, bam_in])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones, tmp_reads,
                                  remove_ones):
        self._run_samtools_merge_sort(samtools_path, sub_alignment_path,
                                      bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        """Run ``testrealign.x`` on the sorted SAM file of one genome.

        The command is executed through the shell with stderr redirected to
        ``<splice_path>/<prefix>/<prefix>.log``.

        Args:
            prefix: genome name; ``<prefix>.fa`` in ``self.fasta_path`` is
                passed with ``-d``.
            segemehl_path: folder that contains ``testrealign.x``.
            sub_alignment_path: folder holding the sorted SAM file.
        """
        sub_splice_path = os.path.join(self.splice_path, prefix)
        self.helper.check_make_folder(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        arguments = [
            os.path.join(segemehl_path, "testrealign.x"),
            "-d",
            os.path.join(self.fasta_path, prefix + ".fa"),
            "-q",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
            "-n",
        ]
        os.system(" ".join(arguments) + " 2>" + err_log)
        # NOTE(review): presumably collects the .bed files written to the
        # working directory and then drops the sorted BAM/SAM files --
        # confirm against Helper.
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path, self.bams["sort"],
                                       "file")

    def _merge_bed(self, fastas, splice_path):
        """Gather the per-sequence BED files of every genome fasta file.

        For each ``.fa``/``.fna``/``.fasta`` file in *fastas* a folder named
        after the file (extension removed) is prepared in the working
        directory.  The splice-site and trans-realigned BED files of every
        sequence header are copied into it and combined into the two
        ``*_all.bed`` files: merged when there are several headers, simply
        renamed when there is one.

        Args:
            fastas: folder with the genome fasta files.
            splice_path: folder with one sub-folder of BED files per header.

        Returns:
            List of the fasta name prefixes that were processed.
        """
        splice_stem = self.splices["splice"]
        trans_stem = self.trans["trans"]
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            with open(os.path.join(fastas, fasta), "r") as f_h:
                headers = [line.strip()[1:] for line in f_h
                           if line.strip().startswith(">")]
            # The extension check above guarantees at least one dot.
            fasta_prefix = fasta.rsplit(".", 1)[0]
            tmp_prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(os.getcwd(), fasta_prefix))
            for header in headers:
                for stem, bed_name in ((splice_stem, self.splices["file"]),
                                       (trans_stem, self.trans["file"])):
                    shutil.copyfile(
                        os.path.join(splice_path, header, bed_name),
                        os.path.join(fasta_prefix,
                                     "_".join([stem, header + ".bed"])))
            out_splice = os.path.join(fasta_prefix, self.splices["all_file"])
            out_trans = os.path.join(fasta_prefix, self.trans["all_file"])
            if len(headers) > 1:
                for file_ in os.listdir(fasta_prefix):
                    bed_path = os.path.join(fasta_prefix, file_)
                    if (splice_stem in file_) and (
                            self.splices["all"] not in file_):
                        self.helper.merge_file(bed_path, out_splice)
                    elif (trans_stem in file_) and (
                            self.trans["all"] not in file_):
                        self.helper.merge_file(bed_path, out_trans)
            else:
                only_header = headers[0]
                shutil.move(
                    os.path.join(fasta_prefix,
                                 "_".join([splice_stem,
                                           only_header + ".bed"])),
                    out_splice)
                shutil.move(
                    os.path.join(fasta_prefix,
                                 "_".join([trans_stem,
                                           only_header + ".bed"])),
                    out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(
                    os.path.join(self.splice_path, prefix)):
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA", prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (args_circ.frag_bams
                                                    is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        """Run the whole circRNA detection workflow.

        Steps: check the GFF files, split the annotation and genome files,
        either align the reads with segemehl or take existing BAM files,
        merge/sort the alignments and run ``testrealign.x`` per genome,
        combine the BED output, detect circRNAs and finally delete the
        temporary files.

        Args:
            args_circ: option container; ``gffs``, ``fastas``,
                ``segemehl_path``, ``samtools_path``, ``align`` and
                ``read_folder`` are read here, and it is passed on to the
                individual steps.

        Prints an error and calls ``sys.exit()`` when ``segemehl_path`` is
        ``None``.
        """
        gff_names = [name for name in os.listdir(args_circ.gffs)
                     if name.endswith(".gff")]
        for gff in gff_names:
            self.helper.check_uni_attributes(
                os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        tmp_reads = []
        # The genome files are split in both modes.
        self.multiparser.parser_fasta(args_circ.fastas)
        if args_circ.align:
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            prefixs = [fasta.replace(".fa", "")
                       for fasta in os.listdir(self.fasta_path)]
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                converted = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
                bam_files, convert_ones, remove_ones = converted
            else:
                sub_alignment_path = merge_folder
                convert_ones, remove_ones = [], []
            self._merge_sort_aligment_file(bam_files, args_circ.samtools_path,
                                           sub_alignment_path, convert_ones,
                                           tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if not args_circ.align:
            # Delete the fragmented-library BAM files that were copied into
            # the merge folder.
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
예제 #4
0
class TestConverter(unittest.TestCase):

    def setUp(self):
        """Create a Converter wired to mock collaborators and load fixtures.

        Also makes sure the scratch folder ``test_folder`` exists.
        """
        converter = Converter()
        example = Example()
        converter.gff3parser = Mock_gff3_parser
        converter._print_rntptt_title = Mock_func().print_rntptt_title
        converter.tsspredator = Mock_TSSPredatorReader()
        converter._read_file = Mock_func().mock_read_file
        self.converter = converter
        self.example = example
        # Fixtures stored under the same name as on the Example object.
        same_named = ("gff_file", "ptt_out", "rnt_out", "srna_out",
                      "embl_file", "embl_out", "multi_embl", "gff_out",
                      "mastertable", "tss_file", "fasta_file", "transterm",
                      "term_file")
        for name in same_named:
            setattr(self, name, getattr(example, name))
        # Fixtures stored under a shorter name.
        self.circ_file = example.circrna_table
        self.circ_all = example.circrna_all
        self.circ_best = example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if not os.path.exists(self.test_folder):
            os.mkdir(self.test_folder)

    def tearDown(self):
        """Delete the scratch folder if it is still present."""
        scratch = self.test_folder
        if not os.path.exists(scratch):
            return
        shutil.rmtree(scratch)

    def test_print_rntptt_file(self):
        """_print_rntptt_file writes the expected PTT and RNT lines."""
        genes, cdss, rnas = [], [], []
        # Feature type -> list collecting the parsed entries of that type.
        buckets = {"gene": genes, "CDS": cdss, "tRNA": rnas}
        for gff in Example().gff_dict:
            bucket = buckets.get(gff["feature"])
            if bucket is not None:
                bucket.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        ptt_lines = out_p.getvalue().split("\n")[:-1]
        self.assertEqual(ptt_lines, self.example.ptt_out_list)
        rnt_lines = out_r.getvalue().split("\n")[:-1]
        self.assertEqual(rnt_lines, self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        """_srna2rntptt turns the GFF fixture into the expected sRNA lines."""
        source = os.path.join(self.test_folder, "srna.gff")
        target = os.path.join(self.test_folder, "srna.out")
        with open(source, "w") as fh:
            fh.write(self.gff_file)
        self.converter._srna2rntptt(source, target, [], 1234567)
        expected = set(self.srna_out.split("\n"))
        self.assertEqual(set(import_data(target)), expected)

    def test_multi_embl_pos(self):
        """_multi_embl_pos parses the EMBL fixture lines as expected."""
        parsed = (self.converter._multi_embl_pos(line.strip())
                  for line in self.embl_file.split("\n"))
        # The converter returns the string "Wrong" for lines it rejects.
        embls = [entry for entry in parsed if entry != "Wrong"]
        for index in range(7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        last_positions = embls[-1]["pos"]
        for index in range(2):
            self.assertDictEqual(last_positions[index],
                                 self.multi_embl[index])
        
    def test_parser_embl_data(self):
        """_parser_embl_data emits the expected GFF lines and summary."""
        embl_path = os.path.join(self.test_folder, "test.embl")
        with open(embl_path, "w") as eh:
            eh.writelines(line + "\n"
                          for line in self.embl_file.split("\n"))
        out = StringIO()
        info = self.converter._parser_embl_data(embl_path, out)
        written = out.getvalue().split("\n")
        self.assertEqual(set(written[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        positions = info[1]["pos"]
        for index in range(2):
            self.assertDictEqual(positions[index], self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        """_multi_tss_class updates the counters for the master table."""
        counters = {"tss": 0, "tss_uni": 0, "class": 1}
        utr_groups = {"total": [], "pri": [], "sec": []}
        features = {"tss_types": [], "locus_tags": [], "utr_lengths": []}
        index_by_tss = defaultdict(lambda: 0)
        table = StringIO(self.mastertable)
        for entry in self.converter.tsspredator.entries(table):
            self.converter._multi_tss_class(
                entry, index_by_tss, features, counters, utr_groups)
        table.close()
        self.assertDictEqual(counters,
                             {"tss": 2, "tss_uni": 0, "class": 5})

    def test_convert_mastertable2gff(self):
        """convert_mastertable2gff writes the expected TSS GFF lines."""
        table_path = os.path.join(self.test_folder, "test.tsv")
        gff_path = os.path.join(self.test_folder, "test.tsv_out")
        with open(table_path, "w") as th:
            th.write(self.mastertable)
        self.converter.convert_mastertable2gff(
            table_path, "ANNOgesic", "TSS", "aaa", gff_path)
        expected = set(self.tss_file.split("\n"))
        self.assertEqual(set(import_data(gff_path)), expected)

    def test_convert_gff2rntptt(self):
        """convert_gff2rntptt runs on the GFF and fasta fixtures."""
        folder = self.test_folder
        srna_input_file = os.path.join(folder, "srna.gff")
        srna_output_file = os.path.join(folder, "srna.out")
        gff_file = os.path.join(folder, "test.gff")
        rnt_file = os.path.join(folder, "test.rnt")
        ptt_file = os.path.join(folder, "test.ptt")
        fasta_file = os.path.join(folder, "test.fa")
        fixtures = ((srna_input_file, self.gff_file),
                    (gff_file, self.gff_file),
                    (fasta_file, self.fasta_file))
        for path, content in fixtures:
            with open(path, "w") as fh:
                fh.write(content)
        self.converter.convert_gff2rntptt(
            gff_file, fasta_file, ptt_file, rnt_file,
            srna_input_file, srna_output_file)
        # NOTE(review): these assertions only check that the path strings
        # are non-empty, which is always true; a file-existence check was
        # probably intended -- confirm which files the mocked converter
        # really writes before tightening them.
        for path in (srna_output_file, rnt_file, ptt_file):
            self.assertTrue(path)

    def test_convert_embl2gff(self):
        """convert_embl2gff writes the expected GFF lines for the fixture."""
        source = os.path.join(self.test_folder, "test.embl")
        target = os.path.join(self.test_folder, "test.embl_out")
        with open(source, "w") as eh:
            eh.writelines(line + "\n"
                          for line in self.embl_file.split("\n"))
        self.converter.convert_embl2gff(source, target)
        produced = import_data(target)
        # The first line and the last two lines of the output are left out
        # of the comparison.
        self.assertEqual(set(produced[1:-2]),
                         set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        """convert_transtermhp2gff writes the expected terminator lines."""
        source = os.path.join(self.test_folder,
                              "test_best_terminator_after_gene.bag")
        target = os.path.join(self.test_folder, "transterm.gff")
        with open(source, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(source, target)
        expected = set(self.term_file.split("\n"))
        self.assertEqual(set(import_data(target)), expected)

    @staticmethod
    def get_info(datas):
        """Return the first eight tab-separated columns of each data line.

        Declared as a static method because it takes no ``self``; without
        the decorator a call through an instance would pass the test case
        object as *datas*.

        Args:
            datas: iterable of text lines.

        Returns:
            List of the lines that do not start with ``#``, each cut down
            to its first eight tab-separated columns.
        """
        return ["\t".join(data.split("\t")[:8])
                for data in datas if not data.startswith("#")]

    def test_convert_circ2gff(self):
        """convert_circ2gff writes the expected 'all' and 'best' GFF files.

        Only the first eight columns of the non-comment lines are compared.
        """
        def first_eight_columns(lines):
            # Drop "#" lines and keep the first eight tab-separated columns.
            return ["\t".join(line.split("\t")[:8])
                    for line in lines if not line.startswith("#")]

        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        comparisons = ((out_all, self.circ_all),
                       (out_filter, self.circ_best))
        for produced_path, expected_text in comparisons:
            produced = first_eight_columns(import_data(produced_path))
            expected = first_eight_columns(expected_text.split("\n"))
            self.assertListEqual(produced, expected)
예제 #5
0
class CircRNADetection(object):

    def __init__(self, args_circ):
        """Create the helper objects and derive the working paths.

        All output sub-folders are placed below ``args_circ.output_folder``;
        the temporary gff and fasta folders are ``tmp`` below
        ``args_circ.gffs`` and ``args_circ.fastas``.  When ``args_circ.align``
        is set but ``args_circ.fastas`` is ``None`` an error is printed and
        the program exits.
        """
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        out_root = args_circ.output_folder
        self.alignment_path = os.path.join(out_root, "segemehl_align")
        self.splice_path = os.path.join(out_root, "segemehl_splice")
        self.candidate_path = os.path.join(out_root, "circRNA_tables")
        self.gff_folder = os.path.join(out_root, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # File names and name stems of the bed/bam files used by this class.
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all", "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all", "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        if args_circ.align and (args_circ.fastas is None):
            print("Error: There is no genome fasta file!!!")
            sys.exit()
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        """Run ``segemehl.x -x <index> -d <fasta>``.

        The executable is taken from the ``segemehl_path`` folder; ``index``
        and ``fasta`` are file names inside ``fasta_path``.
        """
        executable = os.path.join(segemehl_path, "segemehl.x")
        index_file = os.path.join(fasta_path, index)
        genome_file = os.path.join(fasta_path, fasta)
        call([executable, "-x", index_file, "-d", genome_file])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read), "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        """Run ``samtools view -bS`` on ``pre_sam``, writing to ``out_bam``."""
        command = [samtools_path, "view", "-bS", pre_sam, "-o", out_bam]
        call(command)

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o", os.path.join(sub_alignment_path,
              self.bams["sort"] + ".bam"), whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        """Run ``samtools view -h`` to turn the sorted BAM file into SAM.

        Input and output share the ``self.bams["sort"]`` name stem inside
        ``sub_alignment_path``.
        """
        print("Convert whole reads bam file to sam file....")
        sorted_stem = os.path.join(sub_alignment_path, self.bams["sort"])
        call([samtools_path, "view", "-h", "-o",
              sorted_stem + ".sam", sorted_stem + ".bam"])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        """Merge, sort and convert the BAM files, then delete leftovers.

        After the samtools steps every path in ``convert_ones`` and
        ``remove_ones`` is deleted, followed by the decompressed read files
        in ``tmp_reads`` that still exist.
        """
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        # ``run_circrna`` passes the same ``tmp_reads`` list for every
        # genome prefix, so on later calls the files are already gone.
        for read in tmp_reads:
            if os.path.exists(read):
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        """Run ``testrealign.x`` on the sorted SAM file of one genome.

        stderr of the program is written to ``<prefix>.log`` in the splice
        sub-folder of ``prefix``.  Afterwards the ``.bed`` files of the
        current working directory are moved into that sub-folder and the
        sorted alignment files are removed from ``sub_alignment_path``.

        The command is started from an argument list (no shell), so an
        ``OSError`` is raised when the executable cannot be started.
        """
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = [os.path.join(segemehl_path, "testrealign.x"),
                   "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                   "-q", os.path.join(sub_alignment_path,
                                      self.bams["sort"] + ".sam"),
                   "-n"]
        # Redirect stderr through a file object instead of a shell "2>",
        # so paths with spaces or metacharacters are handled correctly.
        with open(err_log, "w") as err_handle:
            call(command, stderr=err_handle)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        """Collect the per-sequence bed files of every genome fasta file.

        For each ``.fa`` / ``.fna`` / ``.fasta`` file in ``fastas`` a folder
        named after the file (without its last extension) is prepared and
        the splice-site and trans-realigned bed files of every sequence
        header are copied into it.  With more than one header the copies
        are merged into the ``*_all.bed`` files; otherwise the single copy
        is moved to that name.  Finally the directories in ``splice_path``
        are removed.

        Returns the list of folder names (fasta prefixes).
        """
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            with open(os.path.join(fastas, fasta), "r") as f_h:
                headers = [line.strip()[1:] for line in f_h
                           if line.strip().startswith(">")]
            fasta_prefix = ".".join(fasta.split(".")[:-1])
            tmp_prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                                          os.getcwd(), fasta_prefix))
            splice_stem = self.splices["splice"]
            trans_stem = self.trans["trans"]
            for header in headers:
                # splice sites first, then trans-realigned, per header
                for names, stem in ((self.splices, splice_stem),
                                    (self.trans, trans_stem)):
                    shutil.copyfile(
                        os.path.join(splice_path, header, names["file"]),
                        os.path.join(fasta_prefix,
                                     "_".join([stem, header + ".bed"])))
            out_splice = os.path.join(fasta_prefix,
                                      self.splices["all_file"])
            out_trans = os.path.join(fasta_prefix,
                                     self.trans["all_file"])
            if len(headers) > 1:
                for file_ in os.listdir(fasta_prefix):
                    bed = os.path.join(fasta_prefix, file_)
                    if (splice_stem in file_) and (
                            self.splices["all"] not in file_):
                        self.helper.merge_file(bed, out_splice)
                    elif (trans_stem in file_) and (
                            self.trans["all"] not in file_):
                        self.helper.merge_file(bed, out_trans)
            else:
                single_splice = os.path.join(
                    fasta_prefix,
                    "_".join([splice_stem, headers[0] + ".bed"]))
                shutil.move(single_splice, out_splice)
                single_trans = os.path.join(
                    fasta_prefix,
                    "_".join([trans_stem, headers[0] + ".bed"]))
                shutil.move(single_trans, out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        """Detect circRNAs for every prefix and write tables and GFF files.

        The bed folder of each prefix is copied into the splice folder.
        When it contains the merged splice-site file, ``detect_circrna``
        writes the candidate table and statistics, and the table is then
        converted to the ``circRNA_all`` / ``circRNA_best`` GFF files.
        """
        for prefix in tmp_prefixs:
            gff_out_dir = os.path.join(self.gff_folder, prefix)
            splice_dir = os.path.join(self.splice_path, prefix)
            table_dir = os.path.join(self.candidate_path, prefix)
            self.helper.check_make_folder(gff_out_dir)
            shutil.copytree(prefix, splice_dir)
            self.helper.check_make_folder(table_dir)
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] not in os.listdir(splice_dir):
                continue
            all_table = os.path.join(
                table_dir, "_".join(["circRNA", prefix + "_all.csv"]))
            stat_file = os.path.join(
                args_circ.stat_folder,
                "_".join(["stat_circRNA", prefix + ".csv"]))
            detect_circrna(
                os.path.join(splice_dir, self.splices["all_file"]),
                os.path.join(self.gff_path, prefix + ".gff"),
                all_table, args_circ, stat_file)
            self.converter.convert_circ2gff(
                all_table, args_circ,
                os.path.join(gff_out_dir,
                             "_".join([prefix, "circRNA_all.gff"])),
                os.path.join(gff_out_dir,
                             "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        """Run the whole circRNA detection workflow.

        Steps: check the gff files, split gff/fasta files, either align the
        reads (``args_circ.align``) or take the given BAM folders, merge and
        sort the alignments per genome, run testrealign, merge the bed
        files, detect circRNAs and finally remove the temporary data.
        """
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                gff_file = os.path.join(args_circ.gffs, gff)
                self.helper.check_uni_attributes(gff_file)
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        # The fasta files are split in both modes.
        self.multiparser.parser_fasta(args_circ.fastas)
        if args_circ.align:
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            prefixs = [fasta.replace(".fa", "")
                       for fasta in os.listdir(self.fasta_path)]
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                                                   args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if not args_circ.align:
            # Remove the fragmented BAM files copied into the merge folder.
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
예제 #6
0
class CircRNADetection(object):
    '''Detection of circRNA'''

    def __init__(self, args_circ):
        '''Create the helper objects and derive the working paths.

        Output sub-folders are placed below ``args_circ.output_folder``;
        the temporary gff and fasta folders are ``tmp`` below
        ``args_circ.gffs`` and ``args_circ.fastas``.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        out_root = args_circ.output_folder
        self.alignment_path = os.path.join(out_root,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(out_root, "segemehl_splice_results")
        self.candidate_path = os.path.join(out_root, "circRNA_tables")
        self.gff_folder = os.path.join(out_root, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # File name and name stem of the two kinds of bed files.
        self.splices = {"file": "splicesites.bed",
                        "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})   
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        log.write(" ".join([segemehl_path,
                  "-x", os.path.join(fasta_path, index),
                  "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write("Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                                self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join([read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                                args_circ, index, fasta, read,
                                sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
                self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                  os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(os.path.join(
                   self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log):
        log.write(" ".join([samtools_path, "view",
                            "-bS", pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files, log):
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) + "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]) + "\n")
                os.system(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(out_folder,
                  "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join([samtools_path, "sort",
                      "-o", sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                      sort_sample.replace(".bam", ".sam"), sort_sample]) + "\n")
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
        log.write("Done!\n")
        log.write("\t" + sort_sample.replace(".bam", ".sam") + " is generated.\n")

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path,
            out_folder, convert_ones, tmp_reads, remove_ones, prefix, log):
        '''Assemble the BAM files per sample, merge/sort them and clean up.

        Without ``bam_datas`` the BAM names are derived from the read files
        (a ``.gz`` / ``.bz2`` suffix is stripped first).  With both
        ``bam_datas`` and ``read_datas`` the derived BAM files are added to
        a copy of the matching sample in ``bam_datas`` when missing.  With
        only ``bam_datas`` a copy of it is used as is.  Afterwards the
        paths in ``convert_ones`` and ``remove_ones`` are deleted.
        '''
        def strip_ext(name):
            # base name without its last extension
            return ".".join(name.split("/")[-1].split(".")[:-1])

        def bam_of(read_prefix):
            # BAM file of one read file inside this genome's folder
            return os.path.join(self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))

        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith((".gz", ".bz2")):
                        read = strip_ext(read)
                    bam_files.append(bam_of(strip_ext(read)))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
            if read_datas is not None:
                for bam_data in merge_bam_datas:
                    for read_data in read_datas:
                        if bam_data["sample"] != read_data["sample"]:
                            continue
                        for read in read_data["files"]:
                            bam = bam_of(strip_ext(read))
                            if bam not in bam_data["files"]:
                                bam_data["files"].append(bam)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for leftovers in (convert_ones, remove_ones):
            for path in leftovers:
                os.remove(path)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''Run testrealign on every ``*sort.sam`` file of ``out_folder``.

        The splice-site and trans-realigned bed files of each sample are
        written to the splice sub-folder of ``prefix``; stderr goes to
        ``<prefix>.log`` there (rewritten for every sample).  Commands and
        generated files are written to ``log``; afterwards the ``.sam``
        files are removed from ``out_folder``.

        The command is started from an argument list (no shell), so an
        ``OSError`` is raised when the executable cannot be started.
        '''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. If it does not "
                  "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if not sam_file.endswith("sort.sam"):
                continue
            sample_prefix = sam_file.replace("_sort.sam", "")
            command = [
                testrealign_path,
                "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                "-q", os.path.join(out_folder, sam_file), "-n",
                "-U", os.path.join(sub_splice_path,
                                   sample_prefix + "_splicesites.bed"),
                "-T", os.path.join(sub_splice_path,
                                   sample_prefix + "_transrealigned.bed")]
            log.write(" ".join(command) + " 2>" + err_log + "\n")
            # Redirect stderr through a file object instead of a shell
            # "2>", so paths with spaces or metacharacters are handled.
            with open(err_log, "w") as err_handle:
                call(command, stderr=err_handle)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(
                    output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(
                                splice_path, header, splice),
                                os.path.join(
                                bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                    bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                    bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write("Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.splice_path,
                                                       prefix))
            for bed in os.listdir(os.path.join(
                args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                                          self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(args_circ.stat_folder,
                               "".join(["stat_", prefix, sample,
                                        "circRNA.csv"]))
                csv_all = os.path.join(self.candidate_path, prefix,
                               "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(self.candidate_path, prefix,
                               "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(self.gff_folder, prefix,
                                "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(self.gff_folder, prefix,
                                  "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file, os.path.join(
                               self.gff_path, prefix + ".gff"), csv_all,
                               args_circ, stat_file)
                self.converter.convert_circ2gff(
                     os.path.join(self.candidate_path, prefix,
                                  "".join([prefix, sample, "circRNA_all.csv"])),
                     args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA

        Steps, in order: parse the --bam_files/--read_files inputs, check
        the gff files, prepare fasta/gff files through the multiparser,
        align the reads when read files are given, then for every genome
        (".fa" file in ``self.fasta_path``) merge/sort the alignments and
        run testrealign.x. Afterwards the bed files are merged, the
        statistics and gff files are generated and the temporary files
        are removed.

        Args:
            args_circ: argument container; the attributes bams,
                read_files, gffs, fastas, segemehl_path, samtools_path,
                testrealign_path and output_folder are read here.
            log: log handle, only ``write()`` is used here.

        Raises:
            SystemExit: with status 1 if neither bam nor read files are
                assigned, or if no segemehl path is assigned.
        '''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            # non-zero status so that the failure is visible to the caller
            sys.exit(1)
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit(1)
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            # The prefixes returned by _align are not used: the list is
            # always rebuilt from self.fasta_path below.
            align_files, _ = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            if fasta.endswith(".fa"):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                _, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path,
                    align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder,
                convert_ones, tmp_reads, remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # remove the read files listed under "zips" by _deal_zip_file
        for reads in tmp_reads:
            for read in reads["zips"]:
                os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)