예제 #1
0
 def setUp(self):
     self.converter = Converter()
     self.example = Example()
     self.converter.gff3parser = Mock_gff3_parser
     self.converter._print_rntptt_title = Mock_func().print_rntptt_title
     self.converter.tsspredator = Mock_TSSPredatorReader()
     self.converter._read_file = Mock_func().mock_read_file
     self.gff_file = self.example.gff_file
     self.ptt_out = self.example.ptt_out
     self.rnt_out = self.example.rnt_out
     self.srna_out = self.example.srna_out
     self.embl_file = self.example.embl_file
     self.embl_out = self.example.embl_out
     self.multi_embl = self.example.multi_embl
     self.gff_out = self.example.gff_out
     self.mastertable = self.example.mastertable
     self.tss_file = self.example.tss_file
     self.fasta_file = self.example.fasta_file
     self.transterm = self.example.transterm
     self.term_file = self.example.term_file
     self.circ_file = self.example.circrna_table
     self.circ_all = self.example.circrna_all
     self.circ_best = self.example.circrna_best
     self.test_folder = "test_folder"
     self.mock_args = MockClass()
     if (not os.path.exists(self.test_folder)):
         os.mkdir(self.test_folder)
예제 #2
0
 def __init__(self, args_circ):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.alignment_path = os.path.join(args_circ.output_folder,
                                        "segemehl_align")
     self.splice_path = os.path.join(args_circ.output_folder,
                                     "segemehl_splice")
     self.candidate_path = os.path.join(args_circ.output_folder,
                                        "circRNA_tables")
     self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
     self.gff_path = os.path.join(args_circ.gffs, "tmp")
     self.splices = {
         "all_file": "splicesites_all.bed",
         "file": "splicesites.bed",
         "all": "splicesites_all",
         "splice": "splicesites"
     }
     self.trans = {
         "all_file": "transrealigned_all.bed",
         "file": "transrealigned.bed",
         "all": "transrealigned_all",
         "trans": "transrealigned"
     }
     self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
     if args_circ.align:
         if args_circ.fastas is None:
             print("Error: There is no genome fasta file!!!")
             sys.exit()
         else:
             self.fasta_path = os.path.join(args_circ.fastas, "tmp")
     else:
         self.fasta_path = os.path.join(args_circ.fastas, "tmp")
예제 #3
0
 def __init__(self, args_term):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gff_parser = Gff3Parser()
     self.gff_path = os.path.join(args_term.gffs, "tmp")
     self.fasta_path = os.path.join(args_term.fastas, "tmp")
     self.tran_path = os.path.join(args_term.trans, "tmp")
     self.outfolder = {
         "term": os.path.join(args_term.out_folder, "gffs"),
         "csv": os.path.join(args_term.out_folder, "tables")
     }
     self.terms = {
         "all": os.path.join(self.outfolder["term"], "all_candidates"),
         "express": os.path.join(self.outfolder["term"],
                                 "expressed_candidates"),
         "best": os.path.join(self.outfolder["term"], "best_candidates"),
         "non": os.path.join(self.outfolder["term"],
                             "non_expressed_candidates")
     }
     self.csvs = {
         "all": os.path.join(self.outfolder["csv"], "all_candidates"),
         "express": os.path.join(self.outfolder["csv"],
                                 "expressed_candidates"),
         "best": os.path.join(self.outfolder["csv"], "best_candidates"),
         "non": os.path.join(self.outfolder["csv"],
                             "non_expressed_candidates")
     }
     self.combine_path = os.path.join(self.gff_path, "combine")
     self.tmps = {
         "transterm": os.path.join(os.getcwd(), "tmp_transterm"),
         "hp": "transtermhp",
         "hp_gff": "transtermhp.gff",
         "hp_path": "tmp_transterm/tmp",
         "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
         "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
         "gff": "tmp.gff",
         "folder": os.path.join(os.getcwd(), "tmp")
     }
     self.suffixs = {
         "gff": "term.gff",
         "csv": "term.csv",
         "allgff": "term_all.gff"
     }
     if args_term.srnas:
         self.srna_path = os.path.join(args_term.srnas, "tmp")
     else:
         self.srna_path = None
     self._make_gff_folder()
예제 #4
0
 def __init__(self, args_circ):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.alignment_path = os.path.join(args_circ.output_folder,
                                        "segemehl_alignment_files")
     self.splice_path = os.path.join(args_circ.output_folder,
                                     "segemehl_splice_results")
     self.candidate_path = os.path.join(args_circ.output_folder,
                                        "circRNA_tables")
     self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
     self.gff_path = os.path.join(args_circ.gffs, "tmp")
     self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
     self.trans = {"file": "transrealigned.bed", "trans": "transrealigned"}
     self.fasta_path = os.path.join(args_circ.fastas, "tmp")
예제 #5
0
 def setUp(self):
     self.converter = Converter()
     self.example = Example()
     self.converter.gff3parser = Mock_gff3_parser
     self.converter._print_rntptt_title = Mock_func().print_rntptt_title
     self.converter.tsspredator = Mock_TSSPredatorReader()
     self.converter._read_file = Mock_func().mock_read_file
     self.gff_file = self.example.gff_file
     self.ptt_out = self.example.ptt_out
     self.rnt_out = self.example.rnt_out
     self.srna_out = self.example.srna_out
     self.embl_file = self.example.embl_file
     self.embl_out = self.example.embl_out
     self.multi_embl = self.example.multi_embl
     self.gff_out = self.example.gff_out
     self.mastertable = self.example.mastertable
     self.tss_file = self.example.tss_file
     self.fasta_file = self.example.fasta_file
     self.transterm = self.example.transterm
     self.term_file = self.example.term_file
     self.circ_file = self.example.circrna_table
     self.circ_all = self.example.circrna_all
     self.circ_best = self.example.circrna_best
     self.test_folder = "test_folder"
     self.mock_args = MockClass()
     if (not os.path.exists(self.test_folder)):
         os.mkdir(self.test_folder)
예제 #6
0
 def __init__(self, args_circ):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.alignment_path = os.path.join(args_circ.output_folder,
                                        "segemehl_align")
     self.splice_path = os.path.join(args_circ.output_folder,
                                     "segemehl_splice")
     self.candidate_path = os.path.join(args_circ.output_folder,
                                        "circRNA_tables")
     self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
     self.gff_path = os.path.join(args_circ.gffs, "tmp")
     self.splices = {"all_file": "splicesites_all.bed",
                     "file": "splicesites.bed",
                     "all": "splicesites_all", "splice": "splicesites"}
     self.trans = {"all_file": "transrealigned_all.bed",
                   "file": "transrealigned.bed",
                   "all": "transrealigned_all", "trans": "transrealigned"}
     self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
     if args_circ.align:
         if args_circ.fastas is None:
             print("Error: There is no genome fasta file!!!")
             sys.exit()
         else:
             self.fasta_path = os.path.join(args_circ.fastas, "tmp")
     else:
         self.fasta_path = os.path.join(args_circ.fastas, "tmp")
예제 #7
0
 def __init__(self, args_ratt):
     self.multiparser = Multiparser()
     self.converter = Converter()
     self.format_fixer = FormatFixer()
     self.helper = Helper()
     self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
     self.gbk_tmp = os.path.join(self.gbk, "tmp")
     self.embl = os.path.join(args_ratt.ref_embls, "embls")
     self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
     self.tmp_files = {
         "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
         "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
         "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
         "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
         "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
         "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
     }
예제 #8
0
 def __init__(self, args_tss):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.master = os.path.join(args_tss.out_folder, "MasterTables")
     self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                  "tmp_tss", "tmp": "tmp"}
     if args_tss.ta_files is not None:
         self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
     else:
         self.tmps["ta"] = None
     self.gff_path = os.path.join(args_tss.gffs, "tmp")
     if args_tss.manual is not None:
         self.manual_path = os.path.join(args_tss.manual, "tmp")
     self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
     self.fasta_path = os.path.join(args_tss.fastas, "tmp")
     self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
     self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
예제 #9
0
 def __init__(self, out_folder):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gffparser = Gff3Parser()
     self.tmp_id = os.path.join(out_folder, "tmp_id_list")
     self.all_result = os.path.join(out_folder, "all_results")
     self.best_result = os.path.join(out_folder, "best_results")
     self.fig = os.path.join(out_folder, "figures")
     self.with_strain = "with_strain"
     self.without_strain = "without_strain"
     self.tmp_files = {
         "log": "tmp_log",
         "action": "tmp_action.log",
         "pubmed": "tmp_pubmed.log",
         "specific": os.path.join(out_folder, "tmp_specific"),
         "nospecific": os.path.join(out_folder, "tmp_nospecific"),
         "wget_action": os.path.join(out_folder, "tmp_action")
     }
예제 #10
0
def convert2gff(out_path, gff_files, args_ops):
    for core in range(1, args_ops.cores+1):
        output_folder = os.path.join(
                out_path, "_".join(["MasterTable", str(core)]))
        gff_file = os.path.join(
                output_folder, "_".join(["TSSpredator", str(core) + ".gff"]))
        Converter().convert_mastertable2gff(
                    os.path.join(output_folder, "MasterTable.tsv"),
                    "TSSpredator", args_ops.program,
                    args_ops.project_strain, gff_file)
        gff_files.append(gff_file)
예제 #11
0
 def __init__(self, args_tss):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.master = os.path.join(args_tss.out_folder, "MasterTables")
     self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                  "tmp_tss", "tmp": "tmp"}
     if args_tss.ta_files is not None:
         self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
     else:
         self.tmps["ta"] = None
     self.gff_path = os.path.join(args_tss.gffs, "tmp")
     self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
     self.fasta_path = os.path.join(args_tss.fastas, "tmp")
     self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
     self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
예제 #12
0
 def __init__(self, args_circ):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.alignment_path = os.path.join(args_circ.output_folder,
                                        "segemehl_alignment_files")
     self.splice_path = os.path.join(args_circ.output_folder,
                                     "segemehl_splice_results")
     self.candidate_path = os.path.join(args_circ.output_folder,
                                        "circRNA_tables")
     self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
     self.gff_path = os.path.join(args_circ.gffs, "tmp")
     self.splices = {"file": "splicesites.bed",
                     "splice": "splicesites"}
     self.trans = {"file": "transrealigned.bed",
                   "trans": "transrealigned"}
     self.fasta_path = os.path.join(args_circ.fastas, "tmp")
예제 #13
0
 def get_input(self):
     """Download required files from website."""
     print("Running get input files...")
     if self._args.FTP_path is None:
         print("Error: Please assign the path for downloading the data!!")
         sys.exit()
     if self._args.for_target:
         annotation_folder = self._paths.tar_annotation_folder
         fasta_folder = self._paths.tar_fasta_folder
     else:
         annotation_folder = self._paths.ref_annotation_folder
         fasta_folder = self._paths.ref_fasta_folder
     self.helper.check_make_folder(annotation_folder)
     self.helper.check_make_folder(fasta_folder)
     if self._args.ref_gff is True:
         get_file(self._args.FTP_path, annotation_folder, "gff",
                  self._args.for_target)
         get_file(self._args.FTP_path, annotation_folder, "_genomic.gff.gz",
                  self._args.for_target)
     if self._args.ref_fasta is True:
         get_file(self._args.FTP_path, fasta_folder, "fna",
                  self._args.for_target)
         get_file(self._args.FTP_path, fasta_folder, "_genomic.fna.gz",
                  self._args.for_target)
     if self._args.ref_gbk is True:
         get_file(self._args.FTP_path, annotation_folder, "gbk",
                  self._args.for_target)
         get_file(self._args.FTP_path, annotation_folder, "gbff",
                  self._args.for_target)
         get_file(self._args.FTP_path, annotation_folder,
                  "_genomic.gbff.gz", self._args.for_target)
     if self._args.ref_ptt is True:
         get_file(self._args.FTP_path, annotation_folder, "ptt",
                  self._args.for_target)
     if self._args.ref_rnt is True:
         get_file(self._args.FTP_path, annotation_folder, "rnt",
                  self._args.for_target)
     if self._args.convert_embl is True:
         annotation_files = os.listdir(annotation_folder)
         if len(annotation_files) == 0:
             sys.stdout.write("No gbk files!!\n")
         else:
             Converter().convert_gbk2embl(annotation_folder)
예제 #14
0
 def __init__(self, args_tran):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
     self.tran_path = os.path.join(self.gff_outfolder, "tmp")
     self.stat_path = os.path.join(args_tran.out_folder, "statistics")
     self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                  "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                  "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
                  "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
                  "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
                  "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
                  "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                  "overlap": os.path.join(
                      self.gff_outfolder, "tmp_overlap")}
     self.frag = "transcript_fragment.gff"
     self.tex = "transcript_tex_notex.gff"
     self.endfix_tran = "transcript.gff"
예제 #15
0
 def __init__(self, args_ratt):
     self.multiparser = Multiparser()
     self.converter = Converter()
     self.format_fixer = FormatFixer()
     self.helper = Helper()
     self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
     self.gbk_tmp = os.path.join(self.gbk, "tmp")
     self.embl = os.path.join(args_ratt.ref_embls, "embls")
     self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
     self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                       "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                       "out_gff": os.path.join(args_ratt.gff_outfolder,
                                               "tmp"),
                       "gff": os.path.join(args_ratt.gff_outfolder,
                                           "tmp.gff"),
                       "ptt": os.path.join(args_ratt.gff_outfolder,
                                           "tmp.ptt"),
                       "rnt": os.path.join(args_ratt.gff_outfolder,
                                           "tmp.rnt")}
예제 #16
0
 def __init__(self, out_folder):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gffparser = Gff3Parser()
     self.tmp_id = os.path.join(out_folder, "tmp_id_list")
     self.all_result = os.path.join(out_folder, "all_results")
     self.best_result = os.path.join(out_folder, "best_results")
     self.fig = os.path.join(out_folder, "figures")
     self.with_strain = "with_strain"
     self.without_strain = "without_strain"
     self.tmp_files = {"log": "tmp_log", "action": "tmp_action.log",
                       "pubmed": "tmp_pubmed.log",
                       "specific": os.path.join(
                                   out_folder, "tmp_specific"),
                       "nospecific": os.path.join(
                                     out_folder, "tmp_nospecific"),
                       "wget_action": os.path.join(
                                      out_folder, "tmp_action")}
예제 #17
0
 def __init__(self, args_term):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gff_parser = Gff3Parser()
     self.gff_path = os.path.join(args_term.gffs, "tmp")
     self.fasta_path = os.path.join(args_term.fastas, "tmp")
     self.tran_path = os.path.join(args_term.trans, "tmp")
     self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                       "csv": os.path.join(args_term.out_folder, "tables")}
     self.terms = {"all": os.path.join(self.outfolder["term"],
                                       "all_candidates"),
                   "express": os.path.join(self.outfolder["term"],
                                           "expressed_candidates"),
                   "best": os.path.join(self.outfolder["term"],
                                        "best_candidates"),
                   "non": os.path.join(self.outfolder["term"],
                                       "non_expressed_candidates")}
     self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                      "all_candidates"),
                  "express": os.path.join(self.outfolder["csv"],
                                          "expressed_candidates"),
                  "best": os.path.join(self.outfolder["csv"],
                                       "best_candidates"),
                  "non": os.path.join(self.outfolder["csv"],
                                      "non_expressed_candidates")}
     self.combine_path = os.path.join(self.gff_path, "combine")
     self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                  "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                  "hp_path": "tmp_transterm/tmp",
                  "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                  "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                  "gff": "tmp.gff",
                  "folder": os.path.join(os.getcwd(), "tmp")}
     self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                     "allgff": "term_all.gff"}
     if args_term.srnas:
         self.srna_path = os.path.join(args_term.srnas, "tmp")
     else:
         self.srna_path = None
     self._make_gff_folder()
예제 #18
0
class RATT(object):
    '''annotation transfer'''
    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
        }

    def _convert_to_pttrnt(self, gffs, files):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        call([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ],
             stdout=out,
             stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or ("Reference" in filename) or (
                            "Query" in filename) or ("Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbk)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = ".".join(datas[0].split(".")[:-1])
                    for file_ in os.listdir(
                            os.path.join(args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["rnt"])
                    if os.path.exists(self.tmp_files["gff"]):
                        shutil.move(
                            self.tmp_files["gff"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".gff"))
                        shutil.move(
                            self.tmp_files["ptt"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".ptt"))
                        shutil.move(
                            self.tmp_files["rnt"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".rnt"))
                    else:
                        print("Error: Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).")
        self._remove_files(args_ratt, out_gbk)
예제 #19
0
class Terminator(object):
    '''detection of terminator'''
    def __init__(self, args_term):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {
            "term": os.path.join(args_term.out_folder, "gffs"),
            "csv": os.path.join(args_term.out_folder, "tables")
        }
        self.terms = {
            "all": os.path.join(self.outfolder["term"], "all_candidates"),
            "express": os.path.join(self.outfolder["term"],
                                    "expressed_candidates"),
            "best": os.path.join(self.outfolder["term"], "best_candidates"),
            "non": os.path.join(self.outfolder["term"],
                                "non_expressed_candidates")
        }
        self.csvs = {
            "all": os.path.join(self.outfolder["csv"], "all_candidates"),
            "express": os.path.join(self.outfolder["csv"],
                                    "expressed_candidates"),
            "best": os.path.join(self.outfolder["csv"], "best_candidates"),
            "non": os.path.join(self.outfolder["csv"],
                                "non_expressed_candidates")
        }
        self.combine_path = os.path.join(self.gff_path, "combine")
        self.tmps = {
            "transterm": os.path.join(os.getcwd(), "tmp_transterm"),
            "hp": "transtermhp",
            "hp_gff": "transtermhp.gff",
            "hp_path": "tmp_transterm/tmp",
            "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
            "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
            "gff": "tmp.gff",
            "folder": os.path.join(os.getcwd(), "tmp")
        }
        self.suffixs = {
            "gff": "term.gff",
            "csv": "term.csv",
            "allgff": "term_all.gff"
        }
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(fasta_path, ".fa", prefix,
                                                     None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(gff_file, fasta,
                                                      ptt_file, rnt_file, None,
                                                      None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [
                    os.path.join(gff_path, prefix + ".ptt"),
                    os.path.join(gff_path, prefix + ".rnt")
                ]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [
                    os.path.join(gff_path, prefix + ".ptt"),
                    os.path.join(gff_path, prefix + ".rnt"),
                    os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"]))
                ]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        call([
            args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta,
            os.path.join(self.combine_path, file_), "--t2t-perf",
            os.path.join(
                out_path, "_".join([
                    prefix,
                    "terminators_within_robust_tail-to-tail_regions.t2t"
                ])), "--bag-output",
            os.path.join(out_path, "_".join(
                [prefix, "best_terminator_after_gene.bag"]))
        ],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                                     prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(
                    os.path.join(out_path,
                                 "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        if (args_term.tex_wigs is not None) and (args_term.frag_wigs
                                                 is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information'''
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(
                    os.path.join(self.srna_path,
                                 "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(
                    tmp_gff, os.path.join(self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(
                    self.terms["all"],
                    "_".join([prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                    os.path.join(self.csvs["all"],
                                 "_".join([prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(self.terms["all"],
                                 "_".join([prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]])
                        in os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join([
                    "Genome", "Name", "Start", "End", "Strand", "Detect",
                    "Coverage_decrease", "Coverage_detail"
                ]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(
                            os.path.join(
                                self.tmps["term_table"],
                                "_".join([entry.seq_id, "term_raw.csv"])),
                            os.path.join(
                                self.csvs["all"],
                                "_".join([prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([
            RNAfold_path, "<",
            os.path.join("..", tmp_seq), ">",
            os.path.join("..", tmp_sec)
        ]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(self, prefixs, merge_path,
                                              wig_path, merge_wigs, args_term):
        '''the approach for searching gene converged region terminator'''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            tmp_cand = tmp_cand = os.path.join(
                args_term.out_folder, "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path,
                                            prefix + ".fa"), tran_file,
                               gff_file, tmp_seq, tmp_index, args_term)
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix)
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
            print("Detecting terminators for " + prefix)
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path,
                             "_".join([prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"],
                             "_".join([prefix, self.tmps["hp_gff"]])),
                merge_wigs,
                os.path.join(self.outfolder["term"],
                             "_".join([prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"],
                             "_".join([prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (args_term.frag_wigs
                                                 is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder, "inter_seq_",
                                       "file")
        self.helper.remove_all_content(self.outfolder["term"], "_term.gff",
                                       "file")
        self.helper.remove_all_content(args_term.out_folder, "inter_sec_",
                                       "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (entry.seq_id + "_terminator" +
                                              str(num))
                    entry.attributes["Name"] = "_".join(["terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in entry.attributes.items()
                    ])
                    out_tmp.write("\t".join([
                        entry.info_without_attributes, entry.attribute_string
                    ]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(
                    self.tmps["gff"],
                    os.path.join(self.terms["all"],
                                 "_".join([new_prefix, self.suffixs["gff"]])))
        stat_path = os.path.join(args_term.out_folder, "statistics")
        for prefix in new_prefixs:
            stat_term(
                os.path.join(self.terms["all"],
                             "_".join([prefix, self.suffixs["gff"]])),
                os.path.join(self.csvs["all"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])),
                os.path.join(self.terms["best"], "_".join([prefix, "term"])),
                os.path.join(self.terms["express"], "_".join([prefix,
                                                              "term"])),
                os.path.join(self.terms["non"], "_".join([prefix, "term"])))
            shutil.move(
                os.path.join(self.terms["best"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["best"],
                             "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(
                os.path.join(self.terms["express"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["express"],
                             "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(
                os.path.join(self.terms["non"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["non"],
                             "_".join([prefix, self.suffixs["csv"]])))
            os.remove(
                os.path.join(self.terms["all"],
                             "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs):
        '''searching the associated terminator to transcript'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path, None,
                                     "transcript")
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix +
                        ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics", "_".join([
                            "stat_compare_terminator_transcript", prefix,
                            type_ + ".csv"
                        ])))

    def run_terminator(self, args_term):
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files " "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(self.gff_path,
                                                       self.fasta_path,
                                                       args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(prefixs, merge_path,
                                                   args_term.wig_path,
                                                   args_term.merge_wigs,
                                                   args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term, prefixs)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
예제 #20
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        prefixs = []
        detect = False
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                                input_folder,
                                "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config, log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                         None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
예제 #21
0
class Terminator(object):

    def __init__(self, args_term):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "express"),
                      "best": os.path.join(self.outfolder["term"], "best"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_express")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"], "express"),
                     "best": os.path.join(self.outfolder["csv"], "best"),
                     "non": os.path.join(self.outfolder["csv"], "non_express")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                             fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                            self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                             self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                    self.tmps["transterm"],
                                    "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                 os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: no proper wig files!!!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                        os.path.join(self.csvs["all"], "_".join([
                            prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                        os.path.join(term_outfolder, gff),
                        os.path.join(
                            self.terms["all"], "_".join([
                                prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["strain", "name", "start", "end",
                              "strand", "detect", "coverage_detail"]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        print("Computing secondray structure of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            print("Extracting seq of {0}".format(prefix))
            intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                           tran_file, gff_file, tmp_seq)
            self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix)
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term)
            print("detection of terminator")
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        self.helper.remove_tmp(args_term.gffs)
        self.helper.remove_tmp(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp(args_term.trans)
        self.helper.remove_tmp(args_term.tex_wigs)
        self.helper.remove_tmp(args_term.frag_wigs)
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = "term" + str(num)
                    entry.attributes["Name"] = "_".join(["Terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                  entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"], os.path.join(self.terms["all"],
                            "_".join([new_prefix, self.suffixs["gff"]])))
        if args_term.stat:
            stat_path = os.path.join(args_term.out_folder, "statistics")
            for prefix in new_prefixs:
                stat_term(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["gff"]])),
                          os.path.join(self.csvs["all"],
                          "_".join([prefix, self.suffixs["csv"]])),
                          os.path.join(stat_path,
                          "_".join(["stat", prefix + ".csv"])),
                          os.path.join(self.terms["best"],
                          "_".join([prefix, "term"])),
                          os.path.join(self.terms["express"],
                          "_".join([prefix, "term"])),
                          os.path.join(self.terms["non"],
                          "_".join([prefix, "term"])))
                shutil.move(os.path.join(self.terms["best"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["best"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["express"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["express"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["non"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["non"],
                            "_".join([prefix, self.suffixs["csv"]])))
                os.remove(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term):
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        for type_ in ("best", "express", "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator")
            shutil.move(
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript.csv"),
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript_" + type_ + ".csv"))

    def run_terminator(self, args_term):
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: please assign gff annotation folder "
                  "and fasta folder!!!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
                self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
                prefixs, merge_path, args_term.wig_path,
                args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
예제 #22
0
class PPINetwork(object):
    '''detection of PPI'''
    def __init__(self, out_folder):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        self.tmp_files = {
            "log": "tmp_log",
            "action": "tmp_action.log",
            "pubmed": "tmp_pubmed.log",
            "specific": os.path.join(out_folder, "tmp_specific"),
            "nospecific": os.path.join(out_folder, "tmp_nospecific"),
            "wget_action": os.path.join(out_folder, "tmp_action")
        }

    def _make_folder_no_exist(self, path, folder):
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        call(["wget", source, "-O", folder], stderr=log)
        time.sleep(2)

    def _wget_id(self, strain, locus, strain_id, files):
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                             locus, strain_id["string"])
            self._run_wget(id_source, os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                    gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        prefername = ""
        filename = row_a.split(".")
        if (filename[1] not in os.listdir(
                files["id_list"])) and ("all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t"
                  "STRING_action_score\tpubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file, first_output,
                    ptt, files, paths, args_ppi):
        prefer1 = self._get_prefer_name(row[0], strain_id, files,
                                        args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id, files,
                                        args_ppi.querys)
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source, self.tmp_files["nospecific"],
                               files["pubmed_log"])
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = ("http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                             "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                                 prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                    first_output, self.tmp_files["nospecific"],
                    files["all_nospecific"], files["best_nospecific"], row,
                    args_ppi.score, id_file, files["id_list"], "nospecific",
                    os.path.join(paths["all"], self.without_strain),
                    os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        if row == "NA":
            out_single.write("\t".join([ptt, "\t".join(row_a), "NA", "NA"]) +
                             "\n")
        else:
            out_single.write(
                "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(
                os.path.join(best_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write(
                    "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n")
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a, ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type, "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write(
                        "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) +
                        "\n")
            f_h.close()
            if not detect:
                os.remove(
                    os.path.join(best_folder, ptt,
                                 "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join([ptt, "\t".join(row_a), "NA", "NA"]) +
                          "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1
                        ) and (row[0] != "Location") and (name == strain) and (
                            start == row[0].split("..")[0]) and (
                                end == row[0].split("..")[1]) and (strand
                                                                   == row[1]):
                        genes.append({"strain": name, "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        self._make_subfolder(paths["all"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["best"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["fig"], self.without_strain,
                             strain_id["ptt"])
        filename_nostrain = "_".join([
            strain_id["file"].replace(".ptt", ""), self.without_strain + ".csv"
        ])
        files["all_nospecific"] = open(
            os.path.join(paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(
            os.path.join(paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file, files, paths,
                                    args_ppi):
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder("_".join(
                    [self.tmp_id, strain_id["file"]]))
                paths["all"] = os.path.join(self.all_result,
                                            strain_id["file"][:-4])
                paths["best"] = os.path.join(self.best_result,
                                             strain_id["file"][:-4])
                paths["fig"] = os.path.join(self.fig, strain_id["file"][:-4])
                self.helper.check_make_folder(
                    os.path.join(self.all_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.best_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.fig, strain_id["file"][:-4]))
                self._make_subfolder(paths["all"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["best"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["fig"], self.with_strain,
                                     strain_id["ptt"])
                filename_strain = "_".join([
                    strain_id["file"].replace(".ptt", ""),
                    self.with_strain + ".csv"
                ])
                files["all_specific"] = open(
                    os.path.join(paths["all"], filename_strain), "w")
                files["best_specific"] = open(
                    os.path.join(paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id, strain_id["file"]])
                files["id_log"] = open(
                    os.path.join(files["id_list"], self.tmp_files["log"]), "w")
                files["action_log"] = open(
                    os.path.join(args_ppi.out_folder,
                                 self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(
                    os.path.join(args_ppi.out_folder,
                                 self.tmp_files["pubmed"]), "w")
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                self._make_folder_no_exist(
                    os.path.join(paths["all"], self.with_strain),
                    strain_id["ptt"])
                self._make_folder_no_exist(
                    os.path.join(paths["best"], self.with_strain),
                    strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        detect = False
        t_h = open(os.path.join(files["id_list"], id_file), "r")
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
            id_file, strain_id["string"], strain_id["file"]))
        for row in csv.reader(t_h, delimiter="\t"):
            if row[0].startswith("stringId"):
                continue
            else:
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = ("http://string-db.org/api/tsv/actions?"
                                     "identifier={0}&species={1}").format(
                                         row[0], row[1])
                    self._run_wget(action_source,
                                   self.tmp_files["wget_action"],
                                   files["action_log"])
                    break
        t_h.close()
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        '''get the interaction of proteins'''
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if detect_id:
                    a_h = open(self.tmp_files["wget_action"], "r")
                    pre_row = []
                    first = True
                    detect = False
                    first_output = {
                        "specific_all": True,
                        "specific_best": True,
                        "nospecific_all": True,
                        "nospecific_best": True
                    }
                    print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                        id_file, strain_id["string"], strain_id["file"]))
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            continue
                        else:
                            detect = True
                            if first:
                                first = False
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                if (row_a[0] != pre_row[0]) or (row_a[1] !=
                                                                pre_row[1]):
                                    self._get_pubmed(pre_row, strain_id, mode,
                                                     actor, id_file,
                                                     first_output,
                                                     strain_id["ptt"], files,
                                                     paths, args_ppi)
                                    mode = row_a[2]
                                    actor = row_a[4]
                                else:
                                    mode = mode + ";" + row_a[2]
                                    actor = actor + ";" + row_a[4]
                            pre_row = row_a
                    if detect:
                        detect = False
                        self._get_pubmed(row_a, strain_id, mode, actor,
                                         id_file, first_output,
                                         strain_id["ptt"], files, paths,
                                         args_ppi)
        if detect_id:
            a_h.close()

    def _plot(self, args_ppi, files):
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(
                    os.path.join(self.all_result, folder,
                                 "_".join([folder,
                                           self.with_strain + ".csv"])),
                    args_ppi.score,
                    os.path.join(self.fig, folder,
                                 self.with_strain), args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(
                        os.path.join(
                            self.all_result, folder,
                            "_".join([folder, self.without_strain + ".csv"])),
                        args_ppi.score,
                        os.path.join(self.fig, folder,
                                     self.without_strain), args_ppi.size)

    def _remove_tmps(self, args_ppi):
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        '''retrieve PPI from STRING with PIE and draw network'''
        strain_ids = []
        paths = {}
        files = {}
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                os.path.join(args_ppi.ptts, datas[0]), "0",
                os.path.join(args_ppi.ptts, ptt_file),
                os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({
                "file": ptt_file,
                "ptt": datas[1],
                "string": datas[2],
                "pie": datas[3]
            })
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            s_h = open(args_ppi.species, "r")
            for row in csv.reader(s_h, delimiter="\t"):
                if row[0] != "##":
                    if row[0] == strain_id["string"]:
                        break
                    elif row[2] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
                    elif row[3] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
예제 #23
0
class CircRNADetection(object):

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all", "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all", "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read), "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o", os.path.join(sub_alignment_path,
              self.bams["sort"] + ".bam"), whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
                  os.path.join(segemehl_path, "testrealign.x"),
                  "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                  "-q", os.path.join(sub_alignment_path,
                                     self.bams["sort"] + ".sam"),
                  "-n"])
        os.system(command + " 2>" + err_log)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(os.path.join(
                                              os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(os.path.join(splice_path, header,
                                    self.splices["file"]),
                                    os.path.join(fasta_prefix,
                                    "_".join([self.splices["splice"],
                                              header + ".bed"])))
                    shutil.copyfile(os.path.join(splice_path, header,
                                    self.trans["file"]),
                                    os.path.join(fasta_prefix,
                                    "_".join([self.trans["trans"],
                                              header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix,
                                         self.trans["all_file"])
                if len(headers) > 1:
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"] in file_) and (
                                self.splices["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                    fasta_prefix, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                self.trans["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                    fasta_prefix, file_), out_trans)
                else:
                    shutil.move(os.path.join(
                                fasta_prefix,
                                "_".join([self.splices["splice"],
                                         headers[0] + ".bed"])),
                                out_splice)
                    shutil.move(os.path.join(
                                fasta_prefix,
                                "_".join([self.trans["trans"],
                                          headers[0] + ".bed"])),
                                out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                                          self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(os.path.join(
                                           self.splice_path, prefix)):
                detect_circrna(os.path.join(self.splice_path, prefix,
                               self.splices["all_file"]), os.path.join(
                               self.gff_path, prefix + ".gff"),
                               os.path.join(self.candidate_path, prefix,
                               "_".join(["circRNA", prefix + "_all.csv"])),
                               args_circ, os.path.join(args_circ.stat_folder,
                               "_".join(["stat_circRNA", prefix + ".csv"])))
                self.converter.convert_circ2gff(
                     os.path.join(self.candidate_path, prefix,
                                  "_".join(["circRNA",
                                            prefix + "_all.csv"])),
                     args_circ, os.path.join(
                                self.gff_folder, prefix,
                                "_".join([prefix, "circRNA_all.gff"])),
                     os.path.join(self.gff_folder, prefix,
                                  "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.align:
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                                                   args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
예제 #24
0
class TestConverter(unittest.TestCase):
    def setUp(self):
        self.converter = Converter()
        self.example = Example()
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if (not os.path.exists(self.test_folder)):
            os.mkdir(self.test_folder)

    def tearDown(self):
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_print_rntptt_file(self):
        cdss = []
        genes = []
        rnas = []
        gff_dict = Example().gff_dict
        for gff in gff_dict:
            if gff["feature"] == "gene":
                genes.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "CDS":
                cdss.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "tRNA":
                rnas.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file, srnas,
                                    1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])

    def test_parser_embl_data(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        embl_out = os.path.join(self.test_folder, "test.embl_out")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index], self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [], "utr_lengths": []}
        tss_index = defaultdict(lambda: 0)
        master_file = os.path.join(self.test_folder, "test.tsv")
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(tss, tss_index, tss_features, nums,
                                            utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic", "TSS",
                                               "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(gff_file, fasta_file, ptt_file,
                                          rnt_file, srna_input_file,
                                          srna_output_file)
        self.assertTrue(srna_output_file)
        self.assertTrue(rnt_file)
        self.assertTrue(ptt_file)

    def test_convert_embl2gff(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        transterm_file = os.path.join(self.test_folder,
                                      "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    def get_info(datas):
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        return f_datas

    def test_convert_circ2gff(self):
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        datas = import_data(out_all)
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        c_datas = []
        for data in self.circ_all.split("\n"):
            if not data.startswith("#"):
                c_datas.append("\t".join(data.split("\t")[:8]))
        self.assertListEqual(f_datas, c_datas)
        datas = import_data(out_filter)
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        c_datas = []
        for data in self.circ_best.split("\n"):
            if not data.startswith("#"):
                c_datas.append("\t".join(data.split("\t")[:8]))
        self.assertListEqual(f_datas, c_datas)
예제 #25
0
class RATT(object):
    '''annotation transfer'''
    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
        }

    def _convert_to_pttrnt(self, gffs, files, log):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write(
                "--related_gbk_files is assigned, but not gbk files are detected.\n"
                "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n"
            )
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write(
                "Running converter.py to convert gbk file to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl +
                      " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        if (not os.path.exists(self.embl)) or (not os.path.exists(
                os.path.join(self.tmp_files["tar"], tar + ".fa"))) or (
                    not os.path.exists(
                        os.path.join(self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write(
                "The strain names in --compare_pair should be the same "
                "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar +
                         ".fa"), args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ]) + "\n")
        call([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ],
             stdout=out,
             stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or ("Reference" in filename) or (
                            "Query" in filename) or ("Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt, log):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbki, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write(
                    "Running converter.py to convert embl "
                    "files in {0} to gff, ptt, and rnt format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(
                        os.path.join(args_ratt.tar_fastas, folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(
                        self.tmp_files["gff"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".gff"))
                    shutil.move(
                        self.tmp_files["ptt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".ptt"))
                    shutil.move(
                        self.tmp_files["rnt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
예제 #26
0
class PPINetwork(object):

    def __init__(self, out_folder):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        self.tmp_files = {"log": "tmp_log", "action": "tmp_action.log",
                          "pubmed": "tmp_pubmed.log",
                          "specific": os.path.join(
                                      out_folder, "tmp_specific"),
                          "nospecific": os.path.join(
                                        out_folder, "tmp_nospecific"),
                          "wget_action": os.path.join(
                                         out_folder, "tmp_action")}

    def _make_folder_no_exist(self, path, folder):
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        call(["wget", source, "-O", folder], stderr=log)
        time.sleep(1)

    def _wget_id(self, strain, locus, strain_id, files):
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                   locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                         locus, strain_id["string"])
            self._run_wget(id_source, os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                       gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        prefername = ""
        filename = row_a.split(".")
        if (filename[1] not in os.listdir(files["id_list"])) and (
                "all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t"
                  "STRING_action_score\tpubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file, first_output,
                    ptt, files, paths, args_ppi):
        prefer1 = self._get_prefer_name(row[0], strain_id,
                                        files, args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id,
                                        files, args_ppi.querys)
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source, self.tmp_files["nospecific"],
                               files["pubmed_log"])
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = (
                "http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                    prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                     first_output, self.tmp_files["nospecific"],
                     files["all_nospecific"], files["best_nospecific"], row,
                     args_ppi.score, id_file, files["id_list"], "nospecific",
                     os.path.join(paths["all"], self.without_strain),
                     os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        if row == "NA":
            out_single.write("\t".join(
                             [ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
        else:
            out_single.write("\t".join(
                             [ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(os.path.join(
                all_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(os.path.join(
                best_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write("\t".join([ptt, "\t".join(row_a),
                                         "\t".join(row)]) + "\n")
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a, ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type, "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write("\t".join([ptt, "\t".join(row_a),
                                              "\t".join(row)]) + "\n")
            f_h.close()
            if not detect:
                os.remove(os.path.join(best_folder, ptt,
                          "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            out_all_single = open(os.path.join(
                all_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join([ptt, "\t".join(row_a),
                                     "NA", "NA"]) + "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1) and (row[0] != "Location") and (
                            name == strain) and (
                            start == row[0].split("..")[0]) and (
                            end == row[0].split("..")[1]) and (
                            strand == row[1]):
                        genes.append({"strain": name, "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        self._make_subfolder(
            paths["all"], self.without_strain, strain_id["ptt"])
        self._make_subfolder(
            paths["best"], self.without_strain, strain_id["ptt"])
        self._make_subfolder(
            paths["fig"], self.without_strain, strain_id["ptt"])
        filename_nostrain = "_".join([strain_id["file"].replace(".ptt", ""),
                                      self.without_strain + ".csv"])
        files["all_nospecific"] = open(os.path.join(paths["all"],
                                       filename_nostrain), "w")
        files["best_nospecific"] = open(os.path.join(paths["best"],
                                        filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file,
                                    files, paths, args_ppi):
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder(
                     "_".join([self.tmp_id, strain_id["file"]]))
                paths["all"] = os.path.join(
                     self.all_result, strain_id["file"][:-4])
                paths["best"] = os.path.join(
                     self.best_result, strain_id["file"][:-4])
                paths["fig"] = os.path.join(
                     self.fig, strain_id["file"][:-4])
                self.helper.check_make_folder(
                     os.path.join(self.all_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                     os.path.join(self.best_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                     os.path.join(self.fig, strain_id["file"][:-4]))
                self._make_subfolder(
                     paths["all"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                     paths["best"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                     paths["fig"], self.with_strain, strain_id["ptt"])
                filename_strain = "_".join(
                     [strain_id["file"].replace(".ptt", ""),
                      self.with_strain + ".csv"])
                files["all_specific"] = open(os.path.join(
                                        paths["all"], filename_strain), "w")
                files["best_specific"] = open(os.path.join(
                                         paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id, strain_id["file"]])
                files["id_log"] = open(os.path.join(files["id_list"],
                                       self.tmp_files["log"]), "w")
                files["action_log"] = open(os.path.join(args_ppi.out_folder,
                                           self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(os.path.join(args_ppi.out_folder,
                                           self.tmp_files["pubmed"]), "w")
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                self._make_folder_no_exist(os.path.join(paths["all"],
                                           self.with_strain), strain_id["ptt"])
                self._make_folder_no_exist(os.path.join(paths["best"],
                                           self.with_strain), strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        detect = False
        t_h = open(os.path.join(files["id_list"], id_file), "r")
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
              id_file, strain_id["string"], strain_id["file"]))
        for row in csv.reader(t_h, delimiter="\t"):
            if row[0].startswith("stringId"):
                continue
            else:
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = ("http://string-db.org/api/tsv/actions?"
                                     "identifier={0}&species={1}").format(
                                     row[0], row[1])
                    self._run_wget(
                        action_source, self.tmp_files["wget_action"],
                        files["action_log"])
                    break
        t_h.close()
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if detect_id:
                    a_h = open(self.tmp_files["wget_action"], "r")
                    pre_row = []
                    first = True
                    detect = False
                    first_output = {"specific_all": True,
                                    "specific_best": True,
                                    "nospecific_all": True,
                                    "nospecific_best": True}
                    print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                           id_file, strain_id["string"], strain_id["file"]))
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            continue
                        else:
                            detect = True
                            if first:
                                first = False
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                if (row_a[0] != pre_row[0]) or (
                                        row_a[1] != pre_row[1]):
                                    self._get_pubmed(
                                        pre_row, strain_id, mode, actor,
                                        id_file, first_output,
                                        strain_id["ptt"], files, paths,
                                        args_ppi)
                                    mode = row_a[2]
                                    actor = row_a[4]
                                else:
                                    mode = mode + ";" + row_a[2]
                                    actor = actor + ";" + row_a[4]
                            pre_row = row_a
                    if detect:
                        detect = False
                        self._get_pubmed(
                            row_a, strain_id, mode, actor, id_file,
                            first_output, strain_id["ptt"], files,
                            paths, args_ppi)
        if detect_id:
            a_h.close()

    def _plot(self, args_ppi, files):
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(os.path.join(self.all_result, folder,
                         "_".join([folder, self.with_strain + ".csv"])),
                         args_ppi.score, os.path.join(self.fig, folder,
                         self.with_strain), args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(os.path.join(self.all_result, folder,
                             "_".join([folder, self.without_strain + ".csv"])),
                             args_ppi.score,
                             os.path.join(self.fig, folder,
                             self.without_strain), args_ppi.size)

    def _remove_tmps(self, args_ppi):
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        strain_ids = []
        paths = {}
        files = {}
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                           os.path.join(args_ppi.ptts, datas[0]),
                           "0", os.path.join(args_ppi.ptts, ptt_file),
                           os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({"file": ptt_file,
                               "ptt": datas[1],
                               "string": datas[2],
                               "pie": datas[3]})
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            s_h = open(args_ppi.species, "r")
            for row in csv.reader(s_h, delimiter="\t"):
                if row[0] != "##":
                    if row[0] == strain_id["string"]:
                        break
                    elif row[2] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
                    elif row[3] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
예제 #27
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus")
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file):
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus")
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus")
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                      out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", args_tss.program, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(args_tss.out_folder,
                                                "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_Transcriptome_assembly_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                                input_folder,
                                "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.references, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.references, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
예제 #28
0
class TestConverter(unittest.TestCase):

    def setUp(self):
        self.converter = Converter()
        self.example = Example()
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if (not os.path.exists(self.test_folder)):
            os.mkdir(self.test_folder)

    def tearDown(self):
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_print_rntptt_file(self):
        cdss = []
        genes = []
        rnas = []
        gff_dict = Example().gff_dict
        for gff in gff_dict:
            if gff["feature"] == "gene":
                genes.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "CDS":
                cdss.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "tRNA":
                rnas.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file,
                                    srnas, 1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])
        
    def test_parser_embl_data(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        embl_out = os.path.join(self.test_folder, "test.embl_out")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index], self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [], "utr_lengths": []}
        tss_index = defaultdict(lambda: 0)
        master_file = os.path.join(self.test_folder, "test.tsv")
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(
                tss, tss_index, tss_features, nums, utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic", "TSS",
                                               "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(
             gff_file, fasta_file, ptt_file, rnt_file,
             srna_input_file, srna_output_file)
        self.assertTrue(srna_output_file)
        self.assertTrue(rnt_file)
        self.assertTrue(ptt_file)

    def test_convert_embl2gff(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        transterm_file = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    def get_info(datas):
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        return f_datas

    def test_convert_circ2gff(self):
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")  
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        datas = import_data(out_all)
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        c_datas = []
        for data in self.circ_all.split("\n"):
            if not data.startswith("#"):
                c_datas.append("\t".join(data.split("\t")[:8]))
        self.assertListEqual(f_datas, c_datas)
        datas = import_data(out_filter)
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        c_datas = []
        for data in self.circ_best.split("\n"):
            if not data.startswith("#"):
                c_datas.append("\t".join(data.split("\t")[:8]))
        self.assertListEqual(f_datas, c_datas)
예제 #29
0
class CircRNADetection(object):
    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {
            "all_file": "splicesites_all.bed",
            "file": "splicesites.bed",
            "all": "splicesites_all",
            "splice": "splicesites"
        }
        self.trans = {
            "all_file": "transrealigned_all.bed",
            "file": "transrealigned.bed",
            "all": "transrealigned_all",
            "trans": "transrealigned"
        }
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index,
                                  fasta):
        call([
            os.path.join(segemehl_path, "segemehl.x"), "-x",
            os.path.join(fasta_path, index), "-d",
            os.path.join(fasta_path, fasta)
        ])

    def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file,
                            log_file, fasta_prefix):
        out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file),
                   "w")
        log = open(os.path.join(self.alignment_path, fasta_prefix, log_file),
                   "w")
        p = Popen([
            os.path.join(args_circ.segemehl_path, "segemehl.x"), "-i",
            os.path.join(self.fasta_path, index), "-d",
            os.path.join(self.fasta_path, fasta), "-q",
            os.path.join(args_circ.read_folder, read), "-S"
        ],
                  stdout=out,
                  stderr=log)
        return p

    def _align(self, args_circ):
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(args_circ, index, fasta, read,
                                                 sam_file, log_file,
                                                 fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (pre_sam
                                                      not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, sub_alignment_path,
                                 bam_files):
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge", whole_bam, file_line]))
        print("Sort bam files....")
        call([
            samtools_path, "sort", "-o",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
            whole_bam
        ])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        print("Convert whole reads bam file to sam file....")
        call([
            samtools_path, "view", "-h", "-o",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
            os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")
        ])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones, tmp_reads,
                                  remove_ones):
        self._run_samtools_merge_sort(samtools_path, sub_alignment_path,
                                      bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
            os.path.join(segemehl_path, "testrealign.x"), "-d",
            os.path.join(self.fasta_path, prefix + ".fa"), "-q",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"), "-n"
        ])
        os.system(command + " 2>" + err_log)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path, self.bams["sort"],
                                       "file")

    def _merge_bed(self, fastas, splice_path):
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna")
                    or fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(
                    os.path.join(os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.splices["file"]),
                        os.path.join(
                            fasta_prefix,
                            "_".join([self.splices["splice"],
                                      header + ".bed"])))
                    shutil.copyfile(
                        os.path.join(splice_path, header, self.trans["file"]),
                        os.path.join(
                            fasta_prefix,
                            "_".join([self.trans["trans"], header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix, self.trans["all_file"])
                if len(headers) > 1:
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"]
                                in file_) and (self.splices["all"]
                                               not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_), out_splice)
                        elif (self.trans["trans"]
                              in file_) and (self.trans["all"] not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_), out_trans)
                else:
                    shutil.move(
                        os.path.join(
                            fasta_prefix, "_".join(
                                [self.splices["splice"],
                                 headers[0] + ".bed"])), out_splice)
                    shutil.move(
                        os.path.join(
                            fasta_prefix, "_".join(
                                [self.trans["trans"], headers[0] + ".bed"])),
                        out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(
                    os.path.join(self.splice_path, prefix)):
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA", prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (args_circ.frag_bams
                                                    is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        tmp_reads = []
        if args_circ.align:
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(bam_files, args_circ.samtools_path,
                                           sub_alignment_path, convert_ones,
                                           tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
예제 #30
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        detect = False
        for best_file in os.listdir(args_tss.auto_load):
            if best_file == "_".join(["best", prefix + ".csv"]):
                bh = open(os.path.join(args_tss.auto_load, best_file),"r" )
                lines = bh.readlines()
                bh.close()
                if len(lines[len(lines)-1].split("\t")) < 8:
                    print("Error: some information in {0} is missing. "
                          "It may be due to that \"optimize_tss_ps\" did "
                          "not finish successfully.".format(best_file))
                    log.write("Error: some information in {0} is missing. "
                              "It may be due to that \"optimize_tss_ps\" did "
                              "not finish successfully.\n".format(best_file))
                    sys.exit()
                else:
                    para_info = lines[len(lines)-1].split("\t")[1].split("_")
                    detect_all =  all(elem in para_info
                            for elem in ["he", "rh", "fa", "rf",
                                         "bh", "ef", "pf"])
                    if (not detect_all) or (len(para_info) != 14):
                        print("Error: {0} is complete. Some parameters are "
                              "missing!".format(best_file))
                        log.write("Error: {0} is complete. Some parameters "
                                  "are missing!\n".format(best_file))
                        sys.exit()
                    else:
                        detect = True
                        height = para_info[para_info.index("he") + 1]
                        height_reduction = para_info[
                            para_info.index("rh") + 1]
                        factor = para_info[para_info.index("fa") + 1]
                        factor_reduction = para_info[
                            para_info.index("rf") + 1]
                        base_height = para_info[
                            para_info.index("bh") + 1]
                        enrichment_factor = para_info[
                            para_info.index("ef") + 1]
                        processing_factor = para_info[
                            para_info.index("pf") + 1]
        if detect:
            return height, height_reduction, factor, factor_reduction, \
                   base_height, enrichment_factor, processing_factor
        else:
            print("Error: No best_{0}.csv can be found in {1}! ".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        if args_tss.genome_order is None:
            height = args_tss.height[0]
            height_reduction = args_tss.height_reduction[0]
            factor = args_tss.factor[0]
            factor_reduction = args_tss.factor_reduction[0]
            base_height = args_tss.base_height[0]
            enrichment_factor = args_tss.enrichment_factor[0]
            processing_factor = args_tss.processing_factor[0]
        else:
            if prefix not in args_tss.genome_order:
                print("Error: the parameters for {0} were not assigned!".format(
                    prefix))
                log.write("Error: the parameters for {0} were not assigned!\n".format(
                    prefix))
                sys.exit()
            else:
                index = args_tss.genome_order.index(prefix)
                height = args_tss.height[index]
                height_reduction = args_tss.height_reduction[index]
                factor = args_tss.factor[index]
                factor_reduction = args_tss.factor_reduction[index]
                base_height = args_tss.base_height[index]
                enrichment_factor = args_tss.enrichment_factor[index]
                processing_factor = args_tss.processing_factor[index]
        return height, height_reduction, factor, factor_reduction, \
               base_height, enrichment_factor, processing_factor

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._extract_best_para(args_tss, project_strain_name, log)
        else:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  factor_reduction))
        out.write("minCliffHeight = {0}\n".format(height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  height_reduction))
        out.write("minNormalHeight = {0}\n".format(base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                         None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
예제 #31
0
class CircRNADetection(object):
    '''Detection of circRNA'''

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"file": "splicesites.bed",
                        "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})   
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        log.write(" ".join([segemehl_path,
                  "-x", os.path.join(fasta_path, index),
                  "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write("Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                                self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join([read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                                args_circ, index, fasta, read,
                                sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
                self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                  os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(os.path.join(
                   self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log):
        log.write(" ".join([samtools_path, "view",
                            "-bS", pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files, log):
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) + "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]) + "\n")
                os.system(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(out_folder,
                  "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join([samtools_path, "sort",
                      "-o", sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                      sort_sample.replace(".bam", ".sam"), sort_sample]) + "\n")
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
        log.write("Done!\n")
        log.write("\t" + sort_sample.replace(".bam", ".sam") + " is generated.\n")

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path,
            out_folder, convert_ones, tmp_reads, remove_ones, prefix, log):
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. If it does not "
                  "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix + "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(
                    output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(
                                splice_path, header, splice),
                                os.path.join(
                                bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                    bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                    bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write("Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.splice_path,
                                                       prefix))
            for bed in os.listdir(os.path.join(
                args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                                          self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(args_circ.stat_folder,
                               "".join(["stat_", prefix, sample,
                                        "circRNA.csv"]))
                csv_all = os.path.join(self.candidate_path, prefix,
                               "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(self.candidate_path, prefix,
                               "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(self.gff_folder, prefix,
                                "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(self.gff_folder, prefix,
                                  "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file, os.path.join(
                               self.gff_path, prefix + ".gff"), csv_all,
                               args_circ, stat_file)
                self.converter.convert_circ2gff(
                     os.path.join(self.candidate_path, prefix,
                                  "".join([prefix, sample, "circRNA_all.csv"])),
                     args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            if fasta.endswith(".fa"):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                sub_alignment_path, args_circ.samtools_path, align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder,
                convert_ones, tmp_reads, remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
예제 #32
0
class Terminator(object):
    '''detection of terminator'''

    def __init__(self, args_term):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        with open(combine_file, 'w') as result:
            for file_ in files:
                if (file_.endswith(".ptt")) and (os.stat(file_).st_size == 0):
                    print("Warning: No CDS information, "
                          "TransTermHP can not work!")
                    return "NO_CDS"
                if os.path.exists(file_) and (
                        os.stat(file_).st_size != 0):
                    check_start = False
                    fh = open(file_, 'r')
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                    if "\n" not in line:
                        result.write("\n")
                    fh.close()
        return "Normal"

    def _make_gff_folder(self):
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log):
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                             fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa can not be found.\n".format(prefix))
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                            self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        log.write("Running converter.py to convert {0} and "
                                  "{1} to {2}, {3}, and {4}.\n".format(
                            gff_file, srna, ptt_file, rnt_file,
                            srna.replace(".gff", ".rnt")))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n\t{2}\n".format(
                            ptt_file, rnt_file, srna.replace(".gff", ".rnt")))
                    if (not srna) and (fasta):
                        log.write("Running converter.py to convert {0} "
                                  "to {1}, and {2}.\n".format(
                            gff_file, ptt_file, rnt_file))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
                else:
                    log.write("Running converter.py to convert {0} "
                              "to {1}, and {2}.\n".format(
                        gff_file, ptt_file, rnt_file))
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
                    log.write("The following files are generated:\n")
                    log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                check = self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                check = self._combine_annotation(combine_file, files)
        return check

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term, log):
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)
        log.write(" ".join([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))]) + "\n")

    def _run_TransTermHP(self, args_term, log):
        self.helper.check_make_folder(self.tmps["transterm"])
        log.write("Running TransTermHP.\n")
        log.write("Make sure the version is at least 2.09.\n")
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                             self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa can not be found!.\n".format(prefix))
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term, log)
                log.write("Done!\n")
                log.write("The following files are generated in {0}.\n".format(
                    out_path))
                for file_ in os.listdir(out_path):
                    log.write("\t" + file_ + "\n")
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term, log):
        log.write("Running coverter.py to convert the results of TransTermHP "
                  "to gff3 format.\n")
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                    self.tmps["transterm"],
                                    "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                 os.path.join(out_path, file_), out_file)
                            log.write("\t" + out_file + " is generated.\n")
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information'''
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                        os.path.join(self.csvs["all"], "_".join([
                            prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                        os.path.join(term_outfolder, gff),
                        os.path.join(
                            self.terms["all"], "_".join([
                                prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                              "Strand", "Detect", "Coverage_decrease",
                              "Coverage_detail"]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log):
        log.write("Computing secondray structures of {0}.\n".format(prefix))
        log.write("Make sure the version of Vienna RNA package is at least 2.3.2.\n")
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        log.write(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]) + "\n")
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]))
        log.write("Done!\n")
        log.write("\t" + tmp_sec + " is generated for storing secondary "
                  "structure.\n")
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term, log):
        '''the approach for searching gene converged region terminator'''
        log.write("Searching terminators which located in gene converged "
                  "region.\n")
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            tmp_cand = tmp_cand = os.path.join(args_term.out_folder,
                                     "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                log.write("Running get_inter_seq.py to extract the potential "
                          "sequences from {0}.\n".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                               tran_file, gff_file, tmp_seq, tmp_index, args_term)
                log.write("\t" + tmp_seq + " is generated for storing the "
                          "potential sequences.\n")
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix, log)
                log.write("Running extract_sec_info.py to extract the "
                          "information of secondary structure from {0}.\n".format(
                          prefix))
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                log.write("Running get_polyT.py to detect the "
                          "terminator candidates for {0}.\n".format(prefix))
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term)
                log.write("\t" + tmp_cand + " which temporary stores terminator "
                          "candidates is generated.\n")
            print("Detecting terminators for " + prefix)
            log.write("Running detect_coverage_term.py to gain "
                      "high-confidence terminators for {0}.\n".format(prefix))
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term, log):
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                            entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "_".join(["terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                  entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"], os.path.join(self.terms["all"],
                            "_".join([new_prefix, self.suffixs["gff"]])))
        log.write("Running stat_term.py to do statistics.\n")
        stat_path = os.path.join(args_term.out_folder, "statistics")
        log.write("The following files are generated:\n")
        for prefix in new_prefixs:
            stat_term(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])),
                      os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                      "_".join([prefix, "term"])))
            shutil.move(os.path.join(self.terms["best"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["best"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["express"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["express"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["non"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["non"],
                        "_".join([prefix, self.suffixs["csv"]])))
            os.remove(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["allgff"]])))
            log.write("\t" + os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["best"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["express"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["non"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])) + "\n")
            log.write("\t" + os.path.join(self.csvs["best"],
                        "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["express"],
                        "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["non"],
                        "_".join([prefix, self.suffixs["csv"]])) + "\n")

    def _check_gff_file(self, folder):
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs, log):
        '''searching the associated terminator to transcript'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        log.write("Running compare_tran_term.py for comparing transcripts "
                  "and terminators.\n")
        log.write("The following files are generated:\n")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix + ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript", prefix,
                                  type_ + ".csv"])))
                log.write("\t" + os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript", prefix,
                                  type_ + ".csv"])) + "\n")

    def _re_table(self, args_term, prefixs, log):
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates",
                      "expressed_candidates", "non_expressed_candidates"]:
            for table in os.listdir(os.path.join(
                    args_term.out_folder, "tables", type_)):
                term_table = os.path.join(args_term.out_folder, "tables",
                                          type_, table)
                reorganize_table(args_term.libs, args_term.merge_wigs,
                                 "Coverage_detail", term_table)
                log.write("\t" + term_table + "\n")

    def run_terminator(self, args_term, log):
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
                self.gff_path, self.fasta_path, args_term.srnas, log)
        check = self._combine_ptt_rnt(self.gff_path, file_types,
                                      self.srna_path)
        self._run_TransTermHP(args_term, log)
        self._convert_to_gff(prefixs, args_term, log)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        if check != "NO_CDS":
            self.multiparser.parser_gff(self.tmps["transterm"],
                                        self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
                prefixs, merge_path, args_term.wig_path,
                args_term.merge_wigs, args_term, log)
        self._compute_stat(args_term, log)
        self._compare_term_tran(args_term, prefixs, log)
        self._re_table(args_term, prefixs, log)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
예제 #33
0
class TSSpredator(object):
    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {
            "tss": "tmp_TSS",
            "ta_tss": "tmp_ta_tss",
            "tss_ta": "tmp_tss",
            "tmp": "tmp"
        }
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {
            "wig": lib_datas[0],
            "tex": lib_datas[1],
            "condition": int(lib_datas[2]),
            "replicate": lib_datas[3],
            "strand": lib_datas[4]
        }

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(prefix,
                                                       cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out,
             stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0]
                        == lib_datas[0][:-4]) and (filename[1][:-4]
                                                   == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(args_tss.cluster +
                                                           1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic",
                    feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(predict, stat_file,
                                     os.path.join(self.tmps["tss"], filename),
                                     os.path.join(args_tss.gffs, gff),
                                     args_tss)
            shutil.move(
                stat_file,
                os.path.join(args_tss.out_folder, "statistics", tss,
                             stat_file))
        self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder,
                                     [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSS with genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(self.stat_outfolder, tss,
                                     "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSS with transcript'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None,
                                     "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"],
                            args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(
                    self.stat_outfolder, tss,
                    "_".join(["stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss,
                             "_".join(["stat", feature, "libs", tss]) +
                             ".csv"))
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(prefix, args_tss,
                                         os.path.join(self.gff_path,
                                                      gff), self.wig_path,
                                         os.path.join(self.fasta_path, fasta),
                                         config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(
                self.tmps["tmp"], "_".join([prefix,
                                            args_tss.program + ".gff"]))
            pre_tss = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss), ref,
                                       args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss), ref,
                                       args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower()
                    == "tss") and (gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower()
                  == "processing") and (gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(
                    os.path.join(
                        self.stat_outfolder, prefix,
                        "_".join(["stat", prefix,
                                  "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder,
                                 gff), args_tss, "tmp/merge_forward.wig",
                    "tmp/merge_reverse.wig", "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            config_file = os.path.join(input_folder,
                                       "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs, os.path.join(args_tss.wig_folder,
                                                     "tmp"), args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder, None,
                                     args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None,
                                         args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
예제 #34
0
class CircRNADetection(object):
    '''Detection of circRNA'''
    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed", "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta"
                            not in mod_read) and (".fna" not in mod_read) and (
                                ".fq" not in mod_read) and (".fastq"
                                                            not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta"
                            not in mod_read) and (".fna" not in mod_read) and (
                                ".fq" not in mod_read) and (".fastq"
                                                            not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({
                "sample": reads["sample"],
                "files": tmp_datas,
                "zips": zips
            })
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index,
                                  fasta, log):
        log.write(" ".join([
            segemehl_path, "-x",
            os.path.join(fasta_path, index), "-d",
            os.path.join(fasta_path, fasta)
        ]) + "\n")
        call([
            segemehl_path, "-x",
            os.path.join(fasta_path, index), "-d",
            os.path.join(fasta_path, fasta)
        ])

    def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file,
                            log_file, fasta_prefix, log):
        out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file),
                   "w")
        log = open(os.path.join(self.alignment_path, fasta_prefix, log_file),
                   "w")
        log.write(" ".join([
            args_circ.segemehl_path, "-i",
            os.path.join(self.fasta_path, index), "-d",
            os.path.join(self.fasta_path, fasta), "-q", read, "-S"
        ]) + "\n")
        p = Popen([
            args_circ.segemehl_path, "-i",
            os.path.join(self.fasta_path, index), "-d",
            os.path.join(self.fasta_path, fasta), "-q", read, "-S"
        ],
                  stdout=out,
                  stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write(
            "Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join(
                            [read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join(
                            [read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join(
                            [read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(args_circ, index, fasta,
                                                     read, sam_file, log_file,
                                                     fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
                self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(
                    os.path.join(self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log):
        log.write(
            " ".join([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) +
            "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files,
                         log):
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write(
            "Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam,
                                               log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (pre_sam
                                                      not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) +
                          "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix, out_folder,
                                 bam_datas, log):
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(
                out_folder, "_".join([prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(
                    " ".join([samtools_path, "merge", sample_bam, file_line]) +
                    "\n")
                os.system(" ".join(
                    [samtools_path, "merge", sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(
                out_folder,
                "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join(
                [samtools_path, "sort", "-o", sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([
                samtools_path, "view", "-h", "-o",
                sort_sample.replace(".bam", ".sam"), sort_sample
            ]) + "\n")
            call([
                samtools_path, "view", "-h", "-o",
                sort_sample.replace(".bam", ".sam"), sort_sample
            ])
        log.write("Done!\n")
        log.write("\t" + sort_sample.replace(".bam", ".sam") +
                  " is generated.\n")

    def _merge_sort_aligment_file(self, bam_datas, read_datas, samtools_path,
                                  out_folder, convert_ones, tmp_reads,
                                  remove_ones, prefix, log):
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(
                        os.path.join(self.alignment_path, prefix,
                                     "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({
                    "sample": read_data["sample"],
                    "files": bam_files
                })
        elif (bam_datas is not None) and (read_datas is not None):
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix, out_folder,
                                      merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write(
            "Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write(
            "Please make sure your testrealign.x exists. If it does not "
            "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path, "-d",
                    os.path.join(self.fasta_path, prefix + ".fa"), "-q",
                    os.path.join(out_folder, sam_file), "-n", "-U",
                    os.path.join(sub_splice_path,
                                 sample_prefix + "_splicesites.bed"), "-T",
                    os.path.join(sub_splice_path,
                                 sample_prefix + "_transrealigned.bed")
                ])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna")
                    or fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(splice_path,
                                                          header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(
                        bed_folder,
                        "".join([fasta_prefix + sample + self.splices["file"]
                                 ]))
                    out_trans = os.path.join(
                        bed_folder,
                        "".join([fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (sample
                                                                  in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (sample
                                                                 in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write(
            "Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            for bed in os.listdir(os.path.join(args_circ.output_folder,
                                               prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file,
                               os.path.join(self.gff_path, prefix + ".gff"),
                               csv_all, args_circ, stat_file)
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample, "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({
                "sample": datas[0],
                "files": datas[-1].split(",")
            })
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(
                        os.path.join(self.alignment_path, prefix + ".bam"))
                bam_datas.append({
                    "sample": read_data["sample"],
                    "files": bam_files
                })
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder, ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            if fasta.endswith(".fa"):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files,
                    log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(bam_datas, read_datas,
                                           args_circ.samtools_path,
                                           args_circ.output_folder,
                                           convert_ones, tmp_reads,
                                           remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(args_circ.fastas,
                                              self.splice_path,
                                              args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
예제 #35
0
class RATT(object):

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
             os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if embl.endswith(".gbk"):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        print("Running RATT...")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            print(tar)
            self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                      "query" in filename) or (
                      "Reference" in filename) or (
                      "Query" in filename) or (
                      "Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = datas[0][:-3]
                    for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                         folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["rnt"])
                    shutil.move(self.tmp_files["gff"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".rnt"))
        self._remove_files(args_ratt, out_gbk)
예제 #36
0
파일: ratt.py 프로젝트: Sung-Huan/ANNOgesic
class RATT(object):
    '''annotation transfer'''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
             os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) + 
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (
                    embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write("--related_gbk_files is assigned, but not gbk files are detected.\n"
                      "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write("Running converter.py to convert gbk file to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl + " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["tar"], tar + ".fa"))) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the same "
                      "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")]) + "\n")
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                      "query" in filename) or (
                      "Reference" in filename) or (
                      "Query" in filename) or (
                      "Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt, log):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbki, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)