Example #1
0
class TargetFasta(object):
    '''detection of sRNA target interaction'''

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Per-strain target fasta files produced by modify_seq() live here.
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Copy the reference fasta files into a fresh working folder.

        The references are split per strain into <new_ref_folder>/tmp and a
        clean per-strain target folder is (re)created. Returns the path of
        the new reference folder.
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Start from an empty target folder so stale results cannot leak in.
        if "tmp_tar" in os.listdir(out_folder):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files, output,
                         out_folder):
        '''Apply the mutation table and assemble the requested fasta files.

        Each entry of "output" looks like "<filename>:<strain1>,<strain2>,...";
        the listed per-strain fasta files are concatenated into <filename>.
        Strains without a fasta file are reported and skipped.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Transfering to target fasta")
        for file_ in output:
            first = True
            datas = file_.split(":")
            filename = datas[0]
            strains = datas[1].split(",")
            # Fix: manage the output handle with a context manager so it is
            # closed even if an exception occurs while copying sequences.
            with open(filename, "w") as out:
                for strain in strains:
                    if strain + ".fa" in os.listdir(self.folders["tmp_tar"]):
                        if first:
                            first = False
                        else:
                            # Separate consecutive strain records.
                            out.write("\n")
                        with open(
                                os.path.join(self.folders["tmp_tar"],
                                             strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        print(
                            "Error: No fasta information of {0}.fa".format(
                                strain))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta file to remapping again.")
Example #2
0
class Screen(object):
    '''generation of screenshot'''

    def __init__(self, args_sc, out_folder):
        self.helper = Helper()
        args_sc.output_folder = out_folder
        filename = args_sc.fasta.split("/")[-1]
        # Strain name = fasta filename without its extension.
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(os.path.join(args_sc.output_folder,
                                                   self.strain))
        self.forward_file = os.path.join(args_sc.output_folder,
                                         self.strain, "forward")
        self.reverse_file = os.path.join(args_sc.output_folder,
                                         self.strain, "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, lib_dict):
        '''Sort TEX-treated libraries of one strand into lib_dict.

        For every "tex" library, the matching "notex" library (same
        condition field [2] and replicate field [3]) is collected too.
        '''
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(flib[0])
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(nlib[0])

    def _split_lib(self, lib, log):
        '''Split one colon-separated library string; exit on a bad filename.'''
        lib_datas = lib.split(":")
        if not lib_datas[0].endswith(".wig"):
            log.write("Wiggle files should end with .wig.\n")
            print("Error: Wiggle files should end with .wig!")
            sys.exit()
        return lib_datas

    def screenshot(self, args_sc, log):
        '''Generate the forward/reverse IGV batch scripts.

        Fix: the "no wig libraries" check now runs *before* gen_screenshot;
        originally it ran afterwards, so a useless pair of empty batch
        scripts was written before exiting.
        '''
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [], "ff": [], "rf": []}
        f_texs = []
        r_texs = []
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            log.write("No wig files can be found.\n")
            print("Error: There is no wig file assigned!")
            sys.exit()
        if args_sc.tlibs is not None:
            for lib in args_sc.tlibs:
                lib_datas = self._split_lib(lib, log)
                if lib_datas[-1] == "+":
                    f_texs.append(lib_datas)
                else:
                    r_texs.append(lib_datas)
            # Sort by (type, condition, replicate) so tex/notex pairs align.
            f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
            r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
            self._import_libs(f_texs, "+", lib_dict)
            self._import_libs(r_texs, "-", lib_dict)
        if args_sc.flibs is not None:
            for lib in args_sc.flibs:
                lib_datas = self._split_lib(lib, log)
                if lib_datas[-1] == "+":
                    lib_dict["ff"].append(lib_datas[0])
                else:
                    lib_dict["rf"].append(lib_datas[0])
        log.write("Running gen_screenshots.py to generate IGV batch script.\n")
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
        log.write("\t" + self.forward_file + ".txt is generated.\n")
        log.write("\t" + self.reverse_file + ".txt is generated.\n")
Example #3
0
class Multiparser(object):
    '''Split and merge fasta/gff/wig files on a per-strain basis.

    The "parser_*" methods split multi-record input files into one file per
    strain (plus a copy in a "tmp" subfolder); the "combine_*" methods merge
    the per-strain files back into one file per original input.
    '''

    def __init__(self):
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Names of the intermediate files used while merging.
        self.tmp_fa = "tmp.fa"
        self.tmp_gff = "tmp.gff"
        self.tmp_wig_forward = "tmp_forward.wig"
        self.tmp_wig_reverse = "tmp_reverse.wig"

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        '''combine multiple fasta files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                # Fix: reuse get_prefix() instead of duplicating its logic.
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging fasta files of " + prefix)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    if tar.endswith(".fa") or \
                       tar.endswith(".fna") or \
                       tar.endswith(".fasta"):
                        filename = ".".join((tar.split("."))[:-1])
                        for file_ in files:
                            if filename == file_:
                                self.helper.merge_file(
                                    os.path.join(tar_folder, tar),
                                    os.path.join(tar_folder, self.tmp_fa))
                                change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_fa),
                                os.path.join(tar_merge, prefix + ".fa"))
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def get_prefix(self, folder, ref_feature):
        '''Derive the strain prefix from a "<name>_folder" directory name.

        ref_feature is "" (strip a 4-char extension), "_fasta" (strip a
        fasta extension) or a "_<feature>" tag to split on.
        NOTE(review): for "_fasta" input without a recognized fasta
        extension, prefix stays unbound and a NameError is raised — callers
        apparently guarantee the extension; confirm before relying on it.
        '''
        datas = folder.split("_folder")
        if ref_feature == "":
            prefix = datas[0][:-4]
        elif ref_feature == "_fasta":
            if datas[0].endswith(".fa"):
                prefix = datas[0][:-3]
            elif datas[0].endswith(".fna"):
                prefix = datas[0][:-4]
            elif datas[0].endswith(".fasta"):
                prefix = datas[0][:-6]
        else:
            datas = datas[0][:-4]
            datas = datas.split(ref_feature)
            prefix = datas[0]
        return prefix

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        '''combine multiple wig files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change_f = False
        change_r = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging wig files of " + prefix)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    # Per-strain wig files are named "<lib>_STRAIN_<strain>.wig".
                    filename = tar.split("_STRAIN_")
                    for file_ in files:
                        if (tar.endswith(".wig")) and (file_
                                                       == filename[-1][:-4]):
                            for lib in libs:
                                if (filename[0] in lib) and (lib[-1] == "+"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_forward))
                                    change_f = True
                                elif (filename[0] in lib) and (lib[-1] == "-"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_reverse))
                                    change_r = True
                # Both strands must have been collected, otherwise the
                # input sets are inconsistent.
                if change_f and change_r:
                    change_f = False
                    change_r = False
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_wig_forward),
                        os.path.join(tar_merge, prefix + "_forward.wig"))
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_wig_reverse),
                        os.path.join(tar_merge, prefix + "_reverse.wig"))
                else:
                    print("Error: comparing input files of {0} failed. "
                          "Please check the seq IDs of all gff and fasta "
                          "files, they should be the same.\nPlease "
                          "also check the wiggle files which should contain "
                          "forward and reverse files.".format(prefix))
                    sys.exit()
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        '''combine multiple gff files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if tar_feature is None:
            tar_feature = ""
        else:
            tar_feature = "_" + tar_feature
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                # Fix: reuse get_prefix() instead of duplicating its logic.
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging gff files of " + prefix + tar_feature)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    for file_ in files:
                        if (".gff" in tar) and (file_ + tar_feature
                                                == tar[:-4]):
                            self.helper.merge_file(
                                os.path.join(tar_folder, tar),
                                os.path.join(tar_folder, self.tmp_gff))
                            change = True
                if change:
                    change = False
                    # Fix: use the tar_merge variable instead of rebuilding
                    # the same "merge_tmp" path by hand.
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_gff),
                        os.path.join(tar_merge,
                                     prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        '''parser the fasta file based on strain'''
        par_tmp = os.path.join(fastas, "tmp")
        first = True
        out = None
        out_t = None
        detect = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fasta") or fasta.endswith(".fa")
                    or fasta.endswith(".fna")):
                detect = True
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        if not detect:
            print("Error: there are folders which conatin no fasta files! "
                  "The files should end with .fa or .fna or .fasta!")
            sys.exit()
        for fasta in os.listdir(fastas):
            if ("_folder" not in fasta) and ("tmp" != fasta):
                if (fasta.endswith(".fa")) or \
                   (fasta.endswith(".fna")) or \
                   (fasta.endswith(".fasta")):
                    out_path = os.path.join(fastas, fasta + "_folder")
                    print("Parsing " + fasta)
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(fastas, fasta), "r") as f_f:
                        for line in f_f:
                            if line[0] == ">":
                                line = line.strip()
                                # NCBI-style headers (">gi|...|ref|name|...")
                                # carry the strain name in field 3.
                                if ("|" in line) and (len(line.split("|")) >
                                                      4):
                                    strain = line.split("|")
                                    name = strain[3]
                                else:
                                    name = line[1:]
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(
                                    os.path.join(out_path, name + ".fa"), "w")
                                out_t = open(
                                    os.path.join(par_tmp, name + ".fa"), "w")
                                out.write(">" + name + "\n")
                                out_t.write(">" + name + "\n")
                            else:
                                out.write(line)
                                out_t.write(line)
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_gff(self, gff_folder, feature):
        '''parser gff file based on strain'''
        par_tmp = os.path.join(gff_folder, "tmp")
        out = None
        out_t = None
        first = True
        detect = False
        if feature is None:
            feature = ""
        else:
            feature = "_" + feature
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            pre_seq_id = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(gff_folder, filename + "_folder")
                if ".gff" in filename:
                    detect = True
                    print("Parsing " + filename)
                    self.helper.check_make_folder(out_path)
                    # Sort the annotation first so rows of one seq ID are
                    # contiguous and can be streamed into one output file.
                    self.helper.sort_gff(os.path.join(gff_folder, filename),
                                         os.path.join(gff_folder, "tmp.gff"))
                    f_h = open(os.path.join(gff_folder, "tmp.gff"), "r")
                    for row in csv.reader(f_h, delimiter="\t"):
                        if row[0].startswith("#"):
                            continue
                        else:
                            if pre_seq_id == row[0]:
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                            else:
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(
                                    os.path.join(out_path,
                                                 row[0] + feature + ".gff"),
                                    "w")
                                out_t = open(
                                    os.path.join(par_tmp,
                                                 row[0] + feature + ".gff"),
                                    "w")
                                pre_seq_id = row[0]
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                    f_h.close()
        if not detect:
            print("Error: There are folders which contain no gff3 files! "
                  "The files should end with .gff!")
            sys.exit()
        if os.path.exists(os.path.join(gff_folder, "tmp.gff")):
            os.remove(os.path.join(gff_folder, "tmp.gff"))
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_wig(self, wig_folder):
        '''parser the wig file based on strain'''
        par_tmp = os.path.join(wig_folder, "tmp")
        first = True
        out = None
        out_t = None
        detect = False
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            track_info = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(wig_folder, filename + "_folder")
                if ".wig" in filename:
                    detect = True
                    print("Parsing {0}".format(filename))
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(wig_folder, filename), "r") as w_f:
                        for line in w_f:
                            line = line.split(" ")
                            if (line[0] == "track"):
                                # Remember the track header so it can be
                                # repeated in every per-strain file.
                                track_info = " ".join(line)
                            if (line[0] == "variableStep"):
                                # "variableStep chrom=<strain> ..." starts a
                                # new strain section.
                                strain = line[1].split("=")
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(
                                    "".join([
                                        os.path.join(out_path, filename[:-4]),
                                        "_STRAIN_", strain[1], ".wig"
                                    ]), "w")
                                out_t = open(
                                    "".join([
                                        os.path.join(wig_folder, "tmp",
                                                     filename[:-4]),
                                        "_STRAIN_", strain[1], ".wig"
                                    ]), "w")
                                if track_info != "":
                                    out.write(track_info)
                                    out_t.write(track_info)
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
                            if (line[0] != "track") and (line[0] !=
                                                         "variableStep"):
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
        if not detect:
            print("Error: There are folders which contain no wig files! "
                  "The files should end with .wig!")
            sys.exit()
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()
Example #4
0
class MEME(object):
    '''detection of promoter'''

    def __init__(self, args_pro):
        '''Set up helpers and the folder/file layout used by the pipeline.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        # The gff folder is optional.
        self.gff_path = (os.path.join(args_pro.gffs, "tmp")
                         if args_pro.gffs is not None else None)
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        tmp = self.tmp_folder
        # Working fasta files for each TSS class; "all"/"all_no_orph" are
        # bare names (joined with tmp_folder later by their users).
        self.fastas = {
            "pri": os.path.join(tmp, "primary.fa"),
            "sec": os.path.join(tmp, "secondary.fa"),
            "inter": os.path.join(tmp, "internal.fa"),
            "anti": os.path.join(tmp, "antisense.fa"),
            "orph": os.path.join(tmp, "orphan.fa"),
            "all_no_orph": "without_orphan.fa",
            "all": "all_type.fa",
            "tmp_fa": os.path.join(tmp, "tmp.fa"),
            "tmp_all": os.path.join(tmp, "tmp_all.fa"),
        }
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename,
                           str(width), "nt"])
        program = args_pro.program.lower()
        fasta_file = os.path.join(input_path, fasta)
        if program in ("meme", "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command += ["-p", args_pro.para]
            command += ["-oc", os.path.join(meme_folder, folder), fasta_file]
            # Log the exact command line before running it.
            log.write(" ".join(command) + "\n")
            call(command)
        if program in ("glam2", "both"):
            glam_folder = self._gen_and_check_folder(out_path, folder, "GLAM2")
            # "n" is glam2's nucleotide-alphabet positional argument.
            command = [args_pro.glam2_path,
                       "-O", os.path.join(glam_folder, folder), "-w",
                       str(width), "-b", str(width), "-r",
                       str(args_pro.num_motif), "-n", str(args_pro.end_run),
                       "n", fasta_file]
            log.write(" ".join(command) + "\n")
            call(command)

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME with range of width'''
        # width looks like "<min>-<max>".
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        program = args_pro.program.lower()
        fasta_file = os.path.join(input_path, fasta)
        if program in ("meme", "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command += ["-p", args_pro.para]
            command += ["-oc", os.path.join(meme_folder, folder), fasta_file]
            # Log the exact command line before running it.
            log.write(" ".join(command) + "\n")
            call(command)
        if program in ("glam2", "both"):
            glam_folder = self._gen_and_check_folder(out_path, folder, "GLAM2")
            # "n" is glam2's nucleotide-alphabet positional argument.
            command = [args_pro.glam2_path,
                       "-O", os.path.join(glam_folder, folder), "-a",
                       str(min_width), "-b", str(max_width), "-r",
                       str(args_pro.num_motif), "-n", str(args_pro.end_run),
                       "n", fasta_file]
            log.write(" ".join(command) + "\n")
            call(command)

    def _get_fasta_file(self, fasta_path, prefix):
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        '''Validate the attribute uniqueness of every .gff file in gffs.'''
        gff_files = (f for f in os.listdir(gffs) if f.endswith(".gff"))
        for gff in gff_files:
            self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Build the combined per-class fasta files and publish them all.

        tmp_fa collects every class except orphan; tmp_all additionally
        includes orphan. Duplicated sequences are removed before the final
        files are moved to input_path with an "allgenome" naming scheme.
        '''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"])
        # Drop leftovers from a previous run.
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        for key in ("sec", "inter", "anti"):
            self.helper.merge_file(self.fastas[key], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        publish = [(self.fastas["pri"], "allgenome_primary.fa"),
                   (self.fastas["sec"], "allgenome_secondary.fa"),
                   (self.fastas["inter"], "allgenome_internal.fa"),
                   (self.fastas["anti"], "allgenome_antisense.fa"),
                   (self.fastas["orph"], "allgenome_orphan.fa"),
                   (all_type, "allgenome_all_types.fa"),
                   (all_no_orph, "allgenome_without_orphan.fa")]
        for source, label in publish:
            shutil.move(source, "_".join([out_prefix, label]))

    def _split_fasta_by_strain(self, input_path):
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allgenome")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                           input_path, "".join([
                                               filename[0], strain,
                                               filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain <= 1:
                    os.remove(os.path.join(input_path,
                              "".join([filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        '''Run MEME and/or GLAM2 on the selected upstream fasta files.'''
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        program = args_pro.program.lower()
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            # Create one result folder per selected motif tool.
            if program in ("both", "meme"):
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            if program in ("both", "glam2"):
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                last_two = "_".join(names[-2:])
                # A fasta is used when its TSS type was requested; the
                # two-word types need the two last name parts.
                selected = (names[-1] in input_fastas) or (
                    last_two == "all_types" and
                    "all_types" in input_fastas) or (
                    last_two == "without_orphan" and
                    "without_orphan" in input_fastas)
                if not selected:
                    continue
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                          fasta, width))
                    log.write("Computing promoters of {0} - length {1}.\n".format(
                              fasta, width))
                    # A width range like "2-10" uses the iterative
                    # small-motif search.
                    if "-" in width:
                        self._run_small_motif(input_path, out_path, filename,
                                              fasta, width, args_pro, log)
                    else:
                        self._run_normal_motif(input_path, out_path, filename,
                                               fasta, width, args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are generated "
                      "and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter'''
        # The TSS listing comes from a different folder depending on
        # whether the TSSs were used as provided ("source") or were
        # re-classified into TSS_classes first.
        if args_pro.source:
            tss_folder = self.tss_path
        else:
            # NOTE: only the *listing* comes from TSS_classes; the
            # merged files are still read from self.tss_path (the file
            # names are identical in both folders).
            tss_folder = os.path.join(args_pro.output_folder, "TSS_classes")
        for tss in os.listdir(tss_folder):
            if tss.endswith("_TSS.gff"):
                self.helper.merge_file(os.path.join(
                     self.tss_path, tss), self.all_tss)
        # The fasta merging was duplicated verbatim in both branches of
        # the original code; it is identical, so do it once here.
        for fasta in os.listdir(args_pro.fastas):
            if (fasta.endswith(".fa")) or (
                    fasta.endswith(".fna")) or (
                    fasta.endswith(".fasta")):
                self.helper.merge_file(os.path.join(
                     args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
                                      args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
                                      self.out_fasta, "allfasta"))
        # The combined data set is always treated as "source" TSSs.
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None,
                 None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        '''Clean up the temporary folders created during the run.'''
        for tmp_dir in (args_pro.fastas, args_pro.tsss, args_pro.gffs):
            self.helper.remove_tmp_dir(tmp_dir)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        # Leftover working folders in the current directory.
        for leftover in ("allfasta", "tmp"):
            if leftover in os.listdir(os.getcwd()):
                shutil.rmtree(leftover)

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        # The MEME and GLAM2 paths only differed in folder/file names;
        # drive both through one loop instead of duplicated blocks.
        tools = []
        if program.lower() in ("both", "meme"):
            tools.append(("MEME", "meme"))
        if program.lower() in ("both", "glam2"):
            tools.append(("GLAM2", "glam2"))
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            for folder_name, tool in tools:
                tool_path = os.path.join(output_folder, strain, folder_name)
                for folder in os.listdir(tool_path):
                    csv_file = os.path.join(tool_path, folder,
                                            tool + ".csv")
                    gen_promoter_table(os.path.join(tool_path, folder,
                                                    tool + ".txt"),
                                       csv_file, tss_file, tool)
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        tss_file = os.path.join(self.tss_path, tss)
        fasta_file = os.path.join(args_pro.fastas, fasta)
        if args_pro.source:
            # TSSs are used as provided; no classification needed.
            print("Generating fasta file of {0}".format(prefix))
            upstream(tss_file, fasta_file, None, None, args_pro, prefix)
            return
        # Re-classification needs the genome annotation.
        if args_pro.gffs is None:
            print("Error: Please assign proper annotation!!!")
            sys.exit()
        if "TSS_classes" not in os.listdir(args_pro.output_folder):
            os.mkdir(os.path.join(args_pro.output_folder, "TSS_classes"))
        print("Classifying TSSs and extracting sequence of {0}".format(prefix))
        upstream(tss_file, fasta_file,
                 os.path.join(self.gff_path, prefix + ".gff"),
                 os.path.join(args_pro.output_folder, "TSS_classes",
                              "_".join([prefix, "TSS.gff"])),
                 args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) == 1:
                input_fastas.append("all_types")
            elif int(tss) == 2:
                input_fastas.append("primary")
            elif int(tss) == 3:
                input_fastas.append("secondary")
            elif int(tss) == 4:
                input_fastas.append("internal")
            elif int(tss) == 5:
                input_fastas.append("antisense")
            elif int(tss) == 6:
                input_fastas.append("orphan")
            elif int(tss) == 7:
                input_fastas.append("without_orphan")
            else:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
        return input_fastas

    def run_meme(self, args_pro, log):
        '''Entry point: extract TSS upstream sequences for every strain
        and run MEME/GLAM2 on them to detect promoter motifs.'''
        # Remove merged "allfasta" leftovers from a previous combined
        # run so they are not treated as a regular genome.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                              "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        # Annotation gffs are optional; they are only needed when TSSs
        # have to be re-classified (see _get_upstream).
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        # Per strain: extract upstream sequences, sort them by TSS type
        # and split the result per sequence header.
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(args_pro.output_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta,
                                                       prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        # Optionally combine every strain into one "allfasta" data set
        # for a global promoter search.
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(self.out_fasta, "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
Beispiel #5
0
class CircRNADetection(object):
    '''Detection of circRNA'''
    def __init__(self, args_circ):
        '''Set up the helpers and output folder layout for circRNA detection.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Folder for the segemehl SAM/BAM alignment output.
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        # Folder for the testrealign splice-site results.
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        # Folder for the circRNA candidate tables (csv).
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Name fragments used to recognize/merge the bed output files.
        self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed", "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta"
                            not in mod_read) and (".fna" not in mod_read) and (
                                ".fq" not in mod_read) and (".fastq"
                                                            not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta"
                            not in mod_read) and (".fna" not in mod_read) and (
                                ".fq" not in mod_read) and (".fastq"
                                                            not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({
                "sample": reads["sample"],
                "files": tmp_datas,
                "zips": zips
            })
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index,
                                  fasta, log):
        '''Build a segemehl index (-x) for one fasta file (-d).'''
        command = [segemehl_path, "-x",
                   os.path.join(fasta_path, index), "-d",
                   os.path.join(fasta_path, fasta)]
        # Record the exact command line before executing it.
        log.write(" ".join(command) + "\n")
        call(command)

    def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file,
                            log_file, fasta_prefix, log):
        '''Start one segemehl alignment in the background and return the
        Popen handle.

        The SAM output goes to sam_file, segemehl's stderr to log_file.
        Bug fix: the local stderr handle used to be named "log", which
        shadowed the main ``log`` parameter — the command line ended up
        in the per-run stderr file instead of the main log, unlike every
        sibling method.
        '''
        out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file),
                   "w")
        err_log = open(os.path.join(self.alignment_path, fasta_prefix,
                                    log_file), "w")
        command = [
            args_circ.segemehl_path, "-i",
            os.path.join(self.fasta_path, index), "-d",
            os.path.join(self.fasta_path, fasta), "-q", read, "-S"
        ]
        log.write(" ".join(command) + "\n")
        p = Popen(command, stdout=out, stderr=err_log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write(
            "Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            # Build the segemehl index for this genome first.
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    # Only fasta/fastq files are aligned.
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join(
                            [read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join(
                            [read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join(
                            [read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        # Launch alignments in parallel, at most
                        # args_circ.cores at a time.
                        p = self._run_segemehl_align(args_circ, index, fasta,
                                                     read, sam_file, log_file,
                                                     fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
                # NOTE(review): "processes" is never cleared after a
                # batch wait, so already-finished processes are waited
                # on again here; harmless but redundant — confirm
                # before changing.
                self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(
                    os.path.join(self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log):
        '''Convert one SAM file to BAM via "samtools view -bS".'''
        command = [samtools_path, "view", "-bS", pre_sam, "-o", out_bam]
        # Record the exact command line before executing it.
        log.write(" ".join(command) + "\n")
        call(command)

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files,
                         log):
        '''Convert all SAM files in the folder to BAM and collect the
        BAM files to merge.

        Returns (bam_files, convert_ones, remove_ones): convert_ones
        are BAM files generated here that do not belong to the current
        alignment run (deleted afterwards), remove_ones are SAM files
        that do belong to the run (also deleted later).
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write(
            "Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam,
                                               log)
                bam_files.append(out_bam)
                # align_files is empty when the user supplied BAMs only.
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                # NOTE(review): this membership test depends on the
                # os.listdir order — a .bam listed before its .sam is
                # processed will not be in either list yet. Confirm
                # whether that is intended.
                if (pre_sam not in convert_ones) and (pre_sam
                                                      not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) +
                          "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix, out_folder,
                                 bam_datas, log):
        '''Merge the BAM files of each sample, sort the result and
        convert it back to SAM for the testrealign step.'''
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(
                out_folder, "_".join([prefix, bam_data["sample"] + ".bam"]))
            # A single file needs no merge step.
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(
                    " ".join([samtools_path, "merge", sample_bam, file_line]) +
                    "\n")
                os.system(" ".join(
                    [samtools_path, "merge", sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(
                out_folder,
                "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join(
                [samtools_path, "sort", "-o", sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            # The unsorted merge product is no longer needed.
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([
                samtools_path, "view", "-h", "-o",
                sort_sample.replace(".bam", ".sam"), sort_sample
            ]) + "\n")
            call([
                samtools_path, "view", "-h", "-o",
                sort_sample.replace(".bam", ".sam"), sort_sample
            ])
        log.write("Done!\n")
        # NOTE(review): only the last sample's SAM file is logged here,
        # and this line raises NameError when bam_datas is empty —
        # confirm callers always pass a non-empty list.
        log.write("\t" + sort_sample.replace(".bam", ".sam") +
                  " is generated.\n")

    def _merge_sort_aligment_file(self, bam_datas, read_datas, samtools_path,
                                  out_folder, convert_ones, tmp_reads,
                                  remove_ones, prefix, log):
        '''Assemble the per-sample BAM lists (from aligned read files
        and/or user-provided BAMs), run merge/sort on them, then delete
        the temporary conversion products.

        tmp_reads is currently unused here; it is cleaned up by the
        caller.
        '''
        if bam_datas is None:
            # Only read files were given: the BAMs are the ones produced
            # by the alignment step, named <read_prefix>_<prefix>.bam.
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    # Strip a .gz/.bz2 suffix first so the prefix
                    # matches the uncompressed file used for alignment.
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(
                        os.path.join(self.alignment_path, prefix,
                                     "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({
                    "sample": read_data["sample"],
                    "files": bam_files
                })
        elif (bam_datas is not None) and (read_datas is not None):
            # Both sources given: extend each sample's BAM list with the
            # BAMs generated from its read files (deepcopy keeps the
            # caller's bam_datas untouched).
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            # Only BAM files were given.
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix, out_folder,
                                      merge_bam_datas, log)
        # Remove intermediate conversion products (see _convert_sam2bam).
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''Run segemehl's testrealign.x on every sorted SAM file to
        detect splice sites (circRNA candidates).'''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write(
            "Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write(
            "Please make sure your testrealign.x exists. If it does not "
            "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path, "-d",
                    os.path.join(self.fasta_path, prefix + ".fa"), "-q",
                    os.path.join(out_folder, sam_file), "-n", "-U",
                    os.path.join(sub_splice_path,
                                 sample_prefix + "_splicesites.bed"), "-T",
                    os.path.join(sub_splice_path,
                                 sample_prefix + "_transrealigned.bed")
                ])
                # NOTE(review): the command goes through a shell so that
                # stderr can be redirected; paths containing spaces or
                # shell metacharacters would break or be interpreted.
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        # The SAM files are no longer needed after realignment.
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna")
                    or fasta.endswith(".fasta")):
                # Collect the sequence headers; each header has its own
                # folder of splice results under splice_path.
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(splice_path,
                                                          header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                # The sample name is what remains of the
                                # file name after removing the header
                                # and the "splicesites.bed" part.
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                # Concatenate the per-header bed copies into one
                # splicesites and one transrealigned file per sample.
                for sample in samples:
                    out_splice = os.path.join(
                        bed_folder,
                        "".join([fasta_prefix + sample + self.splices["file"]
                                 ]))
                    out_trans = os.path.join(
                        bed_folder,
                        "".join([fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (sample
                                                                  in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (sample
                                                                 in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        # NOTE(review): "samples" is rebound per fasta, so only the last
        # fasta's sample list is returned (and this raises NameError if
        # no fasta file was found) — presumably all fastas share the
        # same samples; confirm against callers.
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write(
            "Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            # Copy the merged (non-"tmp_") bed files into the splice
            # result folder of this genome.
            for bed in os.listdir(os.path.join(args_circ.output_folder,
                                               prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            # Per sample: detect circRNAs from the splice sites, write
            # the statistics/candidate tables and convert them to gff.
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file,
                               os.path.join(self.gff_path, prefix + ".gff"),
                               csv_all, args_circ, stat_file)
                # "all" keeps every candidate, "best" only those passing
                # the filtering done by convert_circ2gff.
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample, "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({
                "sample": datas[0],
                "files": datas[-1].split(",")
            })
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(
                        os.path.join(self.alignment_path, prefix + ".bam"))
                bam_datas.append({
                    "sample": read_data["sample"],
                    "files": bam_files
                })
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        '''Delete temporary parser folders, intermediate bams and the
        per-genome working folders.'''
        # tmp dirs created by the fasta/gff multiparsers.
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        # Merged/converted bam files are no longer needed.
        self.helper.remove_all_content(args_circ.output_folder, ".bam", "file")
        for folder in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, folder))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        # At least one of --bam_files / --read_files must be supplied.
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        # Validate annotation attributes before doing any heavy work.
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            # Decompress the reads if needed and align them with segemehl.
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
        # NOTE(review): the prefixs returned by _align above is discarded
        # here; the list is rebuilt from the parsed fasta files instead.
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            if fasta.endswith(".fa"):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files,
                    log)
            else:
                convert_ones = []
                remove_ones = []
            # Merge/sort alignments per genome, then detect circRNA
            # junctions with testrealign.
            self._merge_sort_aligment_file(bam_datas, read_datas,
                                           args_circ.samtools_path,
                                           args_circ.output_folder,
                                           convert_ones, tmp_reads,
                                           remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(args_circ.fastas,
                                              self.splice_path,
                                              args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # Remove the temporarily decompressed read files.
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
Beispiel #6
0
class CircRNADetection(object):
    '''Detection of circular RNAs via segemehl split-read alignment.'''
    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output sub-folders: raw alignments, splice sites, candidate
        # tables and the generated gff files.
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Filename fragments used when handling testrealign.x output.
        self.splices = {
            "all_file": "splicesites_all.bed",
            "file": "splicesites.bed",
            "all": "splicesites_all",
            "splice": "splicesites"
        }
        self.trans = {
            "all_file": "transrealigned_all.bed",
            "file": "transrealigned.bed",
            "all": "transrealigned_all",
            "trans": "transrealigned"
        }
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        # NOTE(review): both branches assign the same fasta_path; the align
        # branch only adds the check that fasta files were supplied.
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''Wait for all subprocesses, close their pipes and reap them.'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        '''Decompress .bz2/.gz read files in place; return the new paths.'''
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                # Ensure the decompressed file carries a fasta suffix.
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index,
                                  fasta):
        '''Build the segemehl index for one genome fasta.'''
        call([
            os.path.join(segemehl_path, "segemehl.x"), "-x",
            os.path.join(fasta_path, index), "-d",
            os.path.join(fasta_path, fasta)
        ])

    def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file,
                            log_file, fasta_prefix):
        '''Start one segemehl alignment job; return the running process.'''
        out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file),
                   "w")
        log = open(os.path.join(self.alignment_path, fasta_prefix, log_file),
                   "w")
        p = Popen([
            os.path.join(args_circ.segemehl_path, "segemehl.x"), "-i",
            os.path.join(self.fasta_path, index), "-d",
            os.path.join(self.fasta_path, fasta), "-q",
            os.path.join(args_circ.read_folder, read), "-S"
        ],
                  stdout=out,
                  stderr=log)
        return p

    def _align(self, args_circ):
        '''Align every read file against every genome fasta with segemehl.'''
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                # NOTE(review): num_process counts every entry in the read
                # folder, not only the fasta reads that get aligned.
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(args_circ, index, fasta, read,
                                                 sam_file, log_file,
                                                 fasta_prefix)
                    processes.append(p)
                    # Throttle to the configured number of parallel jobs.
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        '''Convert one sam file to bam with samtools view.'''
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        '''Convert sam files to bam; track intermediates to delete later.'''
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam)
                bam_files.append(out_bam)
                if align_files:
                    # Bams not produced by this run's alignment are
                    # temporary converts; sams produced here can go.
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (pre_sam
                                                      not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, sub_alignment_path,
                                 bam_files):
        '''Merge all bam files into one whole-reads bam and sort it.'''
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        # A single bam needs no merging, only copying.
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge", whole_bam, file_line]))
        print("Sort bam files....")
        call([
            samtools_path, "sort", "-o",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
            whole_bam
        ])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        '''Convert the sorted merged bam back to sam for testrealign.x.'''
        print("Convert whole reads bam file to sam file....")
        call([
            samtools_path, "view", "-h", "-o",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
            os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")
        ])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones, tmp_reads,
                                  remove_ones):
        '''Merge and sort alignments, then delete intermediate files.'''
        self._run_samtools_merge_sort(samtools_path, sub_alignment_path,
                                      bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        '''Run testrealign.x for one genome to detect splice junctions.'''
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
            os.path.join(segemehl_path, "testrealign.x"), "-d",
            os.path.join(self.fasta_path, prefix + ".fa"), "-q",
            os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"), "-n"
        ])
        os.system(command + " 2>" + err_log)
        # testrealign.x drops its bed files in the working directory.
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path, self.bams["sort"],
                                       "file")

    def _merge_bed(self, fastas, splice_path):
        '''Merge per-sequence splice/trans beds into per-fasta files.'''
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna")
                    or fasta.endswith(".fasta")):
                # Collect the sequence headers contained in this fasta.
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(
                    os.path.join(os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.splices["file"]),
                        os.path.join(
                            fasta_prefix,
                            "_".join([self.splices["splice"],
                                      header + ".bed"])))
                    shutil.copyfile(
                        os.path.join(splice_path, header, self.trans["file"]),
                        os.path.join(
                            fasta_prefix,
                            "_".join([self.trans["trans"], header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix, self.trans["all_file"])
                # Multi-sequence fastas get their per-header beds merged;
                # single-sequence ones are just renamed.
                if len(headers) > 1:
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"]
                                in file_) and (self.splices["all"]
                                               not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_), out_splice)
                        elif (self.trans["trans"]
                              in file_) and (self.trans["all"] not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_), out_trans)
                else:
                    shutil.move(
                        os.path.join(
                            fasta_prefix, "_".join(
                                [self.splices["splice"],
                                 headers[0] + ".bed"])), out_splice)
                    shutil.move(
                        os.path.join(
                            fasta_prefix, "_".join(
                                [self.trans["trans"], headers[0] + ".bed"])),
                        out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        '''Compare circRNA candidates with annotations; write gff/csv.'''
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(
                    os.path.join(self.splice_path, prefix)):
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA", prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        '''Pick the folder of bam files to merge when skipping alignment.'''
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (args_circ.frag_bams
                                                    is not None):
            # Copy fragmented-library bams next to the normal ones so one
            # folder can be merged; remember them for later removal.
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        '''Whole circRNA pipeline: align, realign, merge, compare.'''
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        tmp_reads = []
        if args_circ.align:
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(bam_files, args_circ.samtools_path,
                                           sub_alignment_path, convert_ones,
                                           tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta",
                                     None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        # remove_frag only exists when alignment was skipped.
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
Beispiel #7
0
class SNPCalling(object):
    '''SNP calling with samtools mpileup / bcftools over merged bams.'''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Output tree differs depending on whether we compare against the
        # reference genome or validate the target genome.
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {"table": os.path.join(
                        args_snp.out_folder, file_type, "SNP_table"),
                        "raw": os.path.join(
                        args_snp.out_folder, file_type, "SNP_raw_outputs"),
                        "tmp": os.path.join(args_snp.out_folder, "tmp_bcf")}
        # Leftover merged bams from a previous run would corrupt the merge.
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam")}
        self.header = os.path.join(args_snp.out_folder, "header")
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _import_bam(self, bam_folder, bams):
        '''Append all bam paths in bam_folder to bams; return the count.'''
        num_bam = 0
        for bam in os.listdir(bam_folder):
            if bam.endswith(".bam"):
                num_bam += 1
                bams.append(os.path.join(bam_folder, bam))
        return num_bam

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_,
                        prefix, bam_number, table_path, args_snp):
        '''Turn a raw vcf into SNP tables, modified seqs and statistics.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_file = os.path.join(self.stat_path, "_".join([
            "stat", "_".join([prefix, self.baqs[type_]]), "SNP.csv"]))
        snp_detect(fasta, snp, out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_file, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _run_tools(self, fasta_file, out_bcf, out_raw_prefix, type_, args_snp):
        '''Run mpileup with the requested BAQ mode, then bcftools call.'''
        # "-B" disables BAQ, "-E" recalculates it; default applies BAQ.
        if type_ == "with":
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-ugf", fasta_file, self.bams["sort"],
                  "--ignore-RG"], stdout=out_bcf)
        elif type_ == "without":
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-B", "-ugf", fasta_file,
                  self.bams["sort"], "--ignore-RG"],
                 stdout=out_bcf)
        elif type_ == "extend":
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-E", "-ugf", fasta_file,
                  self.bams["sort"], "--ignore-RG"], stdout=out_bcf)
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        # chrom "1" selects haploid calling; "2" uses the default model.
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Run one BAQ mode end-to-end for a single genome fasta.'''
        out_bcf = open(self.outputs["tmp"], "w")
        out_vcf = self._run_tools(fasta_file, out_bcf,
                                  file_prefixs["raw_prefix"], type_, args_snp)
        self.helper.check_make_folder(
             os.path.join(self.seq_path, self.baqs[type_], prefix))
        self._transcript_snp(
            fasta_file, out_vcf,
            "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
            type_, prefix, bam_number, table_path, args_snp)
        out_bcf.close()

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        '''Run every requested BAQ mode (1=with, 2=without, 3=extend).'''
        for index in args_snp.program:
            if index == "1":
                type_ = "with"
                print("Running SNP calling with BAQ...")
            elif index == "2":
                type_ = "without"
                print("Running SNP calling without BAQ...")
            elif index == "3":
                print("Running SNP calling extend BAQ...")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign 1, 2, 3")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        '''Return (is_fasta, prefix) for known fasta file extensions.'''
        # NOTE(review): a filename matching none of these suffixes leaves
        # prefix unbound, so the return raises UnboundLocalError; callers
        # appear to pass fasta-like names only — confirm.
        detect = False
        if fasta.endswith(".fa"):
            prefix = fasta[:-3]
            detect = True
        elif fasta.endswith(".fna"):
            prefix = fasta[:-4]
            detect = True
        elif fasta.endswith(".fasta"):
            prefix = fasta[:-6]
            detect = True
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        '''Run samtools merge/sort via the shell on the whole-reads bam.'''
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                       "-o", bam_file, self.bams["whole"]]))
        os.system(command)

    def _merge_bams(self, args_snp):
        '''Merge and sort all input bams; return how many went in.'''
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.frag_bams is None) and (args_snp.normal_bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            if args_snp.normal_bams is not None:
                num_normal = self._import_bam(args_snp.normal_bams, bams)
            if args_snp.frag_bams is not None:
                num_frag = self._import_bam(args_snp.frag_bams, bams)
        num_bam = num_normal + num_frag
        if num_bam <= 1:
            # A single bam needs no merging, only sorting.
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        else:
            print("Merge BAM files now ...")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        return num_bam

    def _modify_header(self, fastas):
        '''Normalize the header line of every fasta file in the folder.'''
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path):
        '''Dump the sorted bam's header to self.header via the shell.'''
        command = " ".join([samtools_path, "view", "-H", self.bams["sort"]])
        os.system(">".join([command, self.header]))

    def _get_genome_name(self, samtools_path):
        '''Read the reference sequence names out of the bam header.'''
        self._get_header(samtools_path)
        fh = open(self.header, "r")
        seq_names = []
        for row in csv.reader(fh, delimiter="\t"):
            # @SQ lines carry "SN:<name>" in their second column.
            if row[0] == "@SQ":
                seq_names.append(row[1].split(":")[1])
        fh.close()
        return seq_names

    def run_snp_calling(self, args_snp):
        '''Whole SNP-calling pipeline over all genomes in the bam header.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp.samtools_path)
        # NOTE(review): the message below says 'with_BAQ' twice; per
        # _run_program, "2" actually selects 'without_BAQ'.
        if ("1" not in args_snp.program) and (
                "2" not in args_snp.program) and (
                "3" not in args_snp.program):
            print("Error:Please assign a correct BAQ type: "
                  "'1' means 'with_BAQ', '2' means 'with_BAQ' or "
                  "'3' means 'extend_BAQ'.")
            sys.exit()
        else:
            # Only process fastas that actually occur in the alignments.
            for fasta in os.listdir(self.fasta_path):
                if (fasta.split(".f")[0] in seq_names):
                    fasta_datas = self._detect_fasta(fasta)
                    detect = fasta_datas[0]
                    prefix = fasta_datas[1]
                    if detect:
                        detect = False
                        print("Computing {0} now ...".format(fasta))
                        self.helper.check_make_folder(
                             os.path.join(self.outputs["table"], prefix))
                        self.helper.check_make_folder(
                             os.path.join(self.outputs["raw"], prefix))
                        file_prefixs = {"raw_prefix": os.path.join(
                                        self.outputs["raw"], prefix, prefix),
                                        "table_prefix": os.path.join(
                                        self.outputs["table"], prefix, prefix)}
                        fasta_file = os.path.join(self.fasta_path, fasta)
                        table_path = os.path.join(self.outputs["table"],
                                                  prefix)
                        self._run_program(fasta_file, file_prefixs, prefix,
                                          bam_number, table_path, args_snp)
                        os.remove(self.outputs["tmp"])
        self.helper.remove_tmp(args_snp.fastas)
        os.remove(self.bams["whole"])
        os.remove(self.bams["sort"])
        os.remove(self.header)
Beispiel #8
0
class Screen(object):
    '''generation of screenshot'''

    def __init__(self, args_sc):
        """Prepare the output folder layout for the screenshots.

        Exits if the ``screenshots`` folder already exists so old
        results are never overwritten silently, then creates
        per-strain ``forward``/``reverse`` subfolders.
        """
        self.helper = Helper()
        out_folder = os.path.join(args_sc.output_folder, "screenshots")
        if os.path.exists(out_folder):
            print("Error: The {0} already exist!".format(out_folder))
            sys.exit()
        else:
            os.mkdir(out_folder)
        args_sc.output_folder = out_folder
        filename = args_sc.fasta.split("/")[-1]
        # Strain name is the fasta filename with its extension dropped.
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(
            os.path.join(args_sc.output_folder, self.strain))
        self.forward_file = os.path.join(args_sc.output_folder, self.strain,
                                         "forward")
        self.reverse_file = os.path.join(args_sc.output_folder, self.strain,
                                         "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, lib_dict):
        """Distribute TEX+ libraries of one strand into ``lib_dict``.

        Every ``tex`` library is appended to the strand's tex key, and is
        paired with each ``notex`` library of the same condition (field 2)
        and replicate (field 3).
        """
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(flib[0])
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(nlib[0])

    def screenshot(self, args_sc):
        """Collect the wig libraries and generate the IGV batch files."""
        # Fail early: without any wig library there is nothing to draw.
        # (This guard used to run only AFTER gen_screenshot was called,
        # so the generation step still ran with an empty lib_dict.)
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            print("Error: There are no wig file assigned!")
            sys.exit()
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [], "ff": [], "rf": []}
        f_texs = []
        r_texs = []
        if args_sc.tlibs is not None:
            for lib in args_sc.tlibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    print("Error: Exist a not proper wig files!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        f_texs.append(lib_datas)
                    else:
                        r_texs.append(lib_datas)
            # Sort by (type, condition, replicate) so pairing is stable.
            f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
            r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
            self._import_libs(f_texs, "+", lib_dict)
            self._import_libs(r_texs, "-", lib_dict)
        if args_sc.flibs is not None:
            for lib in args_sc.flibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    print("Error: Exist a not proper wig files!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        lib_dict["ff"].append(lib_datas[0])
                    else:
                        lib_dict["rf"].append(lib_datas[0])
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
# ---- Example #9 (score: 0) ----
class Controller(object):

    """Manage the actions of the subcommands.

    The Controller takes care of providing the arguments like path
    names and the parallel processing of tasks.  Each public method
    validates its command-line inputs, creates the required output
    folders, builds an argument container and delegates to the
    corresponding analysis module.
    """

    def __init__(self, args):
        """Create an instance."""
        self._args = args
        self._paths = Paths(args.project_path)
        self.args_container = ArgsContainer()
        self.helper = Helper()

    def check_folder(self, folders):
        """Exit unless every entry of ``folders`` is an existing,
        non-empty folder."""
        for folder in folders:
            if folder is None:
                print("Error: There is wrong path of folder assigned, "
                      "please check it!!")
                sys.exit()
            else:
                if os.path.exists(folder):
                    if len(os.listdir(folder)) == 0:
                        print("Error: There is empty folder, "
                              "please check it!!")
                        sys.exit()
                else:
                    print("Error: There is wrong folder, please check it!!")
                    sys.exit()

    def check_parameter(self, paras, names):
        """Exit if any parameter in ``paras`` is None.

        ``names`` carries the matching command-line option names used in
        the error message.
        """
        for para, name in zip(paras, names):
            if para is None:
                print("Error: {0} is wrong, "
                      "please check it!!".format(name))
                sys.exit()

    def check_no_require_folder(self, folders):
        """Like ``check_folder`` but ``None`` entries (optional inputs)
        are allowed and skipped."""
        for folder in folders:
            if folder is not None:
                if os.path.exists(folder):
                    if len(os.listdir(folder)) == 0:
                        print("Error: There is empty folder, "
                              "please check it!!")
                        sys.exit()
                else:
                    print("Error: There is wrong folder, "
                          "please check it!!")
                    sys.exit()

    def check_file(self, files, names, require):
        """Exit unless each entry of ``files`` points at an existing file.

        With ``require`` False, ``None`` entries are allowed and skipped;
        ``names`` carries the matching option names for error messages.
        """
        for file_, name in zip(files, names):
            if require:
                if file_ is None:
                    print("Error: {0} is wrong, "
                          "please check it!!".format(name))
                    sys.exit()
                else:
                    if not os.path.isfile(file_):
                        print("Error: There is wrong path of {0}, "
                              "please check it!!".format(name))
                        sys.exit()
            else:
                if file_ is not None:
                    if not os.path.isfile(file_):
                        print("Error: There is wrong path of {0}, "
                              "please check it!!".format(name))
                        sys.exit()

    def create_project(self, version):
        """Create a new project."""
        project_creator.create_root_folder(self._args.project_path)
        project_creator.create_subfolders(self._paths.required_folders("root"))
        project_creator.create_subfolders(
            self._paths.required_folders("get_target_fasta"))
        project_creator.create_version_file(
            self._paths.version_path, version)
        sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % (
            self._args.project_path))

    def get_input(self):
        """Download required files from website."""
        print("Running get input files...")
        if self._args.FTP_path is None:
            print("Error: Please assign the path for downloading the data!!")
            sys.exit()
        if self._args.for_target:
            annotation_folder = self._paths.tar_annotation_folder
            fasta_folder = self._paths.tar_fasta_folder
        else:
            annotation_folder = self._paths.ref_annotation_folder
            fasta_folder = self._paths.ref_fasta_folder
        self.helper.check_make_folder(annotation_folder)
        self.helper.check_make_folder(fasta_folder)
        if self._args.ref_gff is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "gff", self._args.for_target)
            get_file(self._args.FTP_path, annotation_folder,
                     "_genomic.gff.gz", self._args.for_target)
        if self._args.ref_fasta is True:
            get_file(self._args.FTP_path, fasta_folder,
                     "fna", self._args.for_target)
            get_file(self._args.FTP_path, fasta_folder,
                     "_genomic.fna.gz", self._args.for_target)
        if self._args.ref_gbk is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "gbk", self._args.for_target)
            get_file(self._args.FTP_path, annotation_folder,
                     "gbff", self._args.for_target)
            get_file(self._args.FTP_path, annotation_folder,
                     "_genomic.gbff.gz", self._args.for_target)
        if self._args.ref_ptt is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "ptt", self._args.for_target)
        if self._args.ref_rnt is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "rnt", self._args.for_target)
        if self._args.convert_embl is True:
            annotation_files = os.listdir(annotation_folder)
            if len(annotation_files) == 0:
                sys.stdout.write("No gbk files!!\n")
            else:
                Converter().convert_gbk2embl(annotation_folder)

    def get_target_fasta(self):
        """Get target fasta"""
        print("Running get target fasta...")
        self.check_parameter([self._args.output_format], ["--output_format"])
        self.check_folder([self._args.ref_fasta_folder])
        # ``names`` must be a list; the bare string passed previously
        # would have been indexed character by character.
        self.check_file([self._args.mutation_table],
                        ["--mutation_table"], True)
        project_creator.create_subfolders(
            self._paths.required_folders("get_target_fasta"))
        # Rebuild the list: the old loop assigned the stripped value to a
        # throwaway loop variable, so whitespace was never removed.
        outputs = [output.strip()
                   for output in self._args.output_format.split(",")]
        target = TargetFasta(self._paths.tar_fasta_folder,
                             self._args.ref_fasta_folder)
        # TargetFasta.get_target_fasta also takes the output folder as its
        # last argument; the old call omitted it (TypeError at runtime).
        # NOTE(review): using the target fasta folder as out_folder here —
        # TODO confirm against TargetFasta.gen_folder expectations.
        target.get_target_fasta(
                self._args.mutation_table, self._paths.tar_fasta_folder,
                self._args.ref_fasta_folder, outputs,
                self._paths.tar_fasta_folder)

    def ratt(self):
        """Run RATT to transfer annotation file from reference to target."""
        print("Running annotation transfer...")
        # Membership test replaces the eight-way chain of != comparisons.
        if self._args.transfer_type not in (
                "Strain", "Assembly", "Species", "Assembly.Repetitive",
                "Strain.Repetitive", "Species.Repetitive", "Multiple",
                "Free"):
            print("Error: please assign correct --transfer_type!!")
            sys.exit()
        self.check_folder([self._args.ref_embl_gbk, self._args.target_fasta,
                           self._args.ref_fasta])
        self.check_parameter([self._args.element, self._args.compare_pair],
                             ["--element", "--compare_pair"])
        project_creator.create_subfolders(
            self._paths.required_folders("annotation_transfer"))
        args_ratt = self.args_container.container_ratt(
            self._args.RATT_path, self._args.element, self._args.transfer_type,
            self._args.ref_embl_gbk, self._args.target_fasta,
            self._args.ref_fasta, self._paths.ratt_folder,
            self._args.convert_to_gff_rnt_ptt,
            self._paths.tar_annotation_folder, self._args.compare_pair)
        ratt = RATT(args_ratt)
        ratt.annotation_transfer(args_ratt)

    def tsspredator(self):
        """Run TSSpredator for predicting TSS candidates."""
        self.check_folder([self._args.fasta_folder,
                           self._args.annotation_folder,
                           self._args.wig_folder])
        self.check_parameter([self._args.lib, self._args.output_prefix],
                             ["--lib", "--output_prefix"])
        self.check_no_require_folder([self._args.compare_transcript_assembly,
                                      self._args.reference_gff_folder])
        self.check_file([self._args.merge_manual], ["--merge_manual"], False)
        if self._args.compute_program.lower() == "tss":
            print("Running TSS prediction...")
            project_creator.create_subfolders(
                self._paths.required_folders("TSS"))
            out_folder = self._paths.tsspredator_folder
        elif self._args.compute_program.lower() == "processing_site":
            print("Running processing site prediction...")
            out_folder = self._paths.processing_site_folder
            project_creator.create_subfolders(
                self._paths.required_folders("processing"))
        else:
            print("Error:No such program!!!!")
            sys.exit()
        args_tss = self.args_container.container_tsspredator(
            self._args.TSSpredator_path, self._args.compute_program,
            self._args.fasta_folder, self._args.annotation_folder,
            self._args.wig_folder, self._args.lib,
            self._args.output_prefix,
            self._args.height, self._args.height_reduction,
            self._args.factor, self._args.factor_reduction,
            self._args.base_height, self._args.enrichment_factor,
            self._args.processing_factor, self._args.replicate_match,
            out_folder, self._args.statistics,
            self._args.validate_gene, self._args.merge_manual,
            self._args.compare_transcript_assembly, self._args.fuzzy,
            self._args.utr_length, self._args.cluster,
            self._args.length, self._args.re_check_orphan,
            self._args.overlap_feature, self._args.reference_gff_folder,
            self._args.remove_low_expression)
        tsspredator = TSSpredator(args_tss)
        tsspredator.run_tsspredator(args_tss)

    def optimize(self):
        """Optimize the parameters of TSSpredator."""
        self.check_folder([self._args.wig_folder, self._args.fasta_file,
                           self._args.annotation_file])
        self.check_file([self._args.manual],
                        ["--manual"], True)
        self.check_parameter([self._args.strain_name, self._args.lib,
                              self._args.output_prefix],
                             ["--strain_name", "--lib", "--output_prefix"])
        if self._args.program.lower() == "tss":
            print("Running optimization of TSS prediction...")
            project_creator.create_subfolders(
                self._paths.required_folders("TSS"))
            out_folder = self._paths.tsspredator_folder
        elif self._args.program.lower() == "processing_site":
            print("Running optimization of processing site prediction...")
            out_folder = self._paths.processing_site_folder
            project_creator.create_subfolders(
                self._paths.required_folders("processing"))
        else:
            print("Error:No such program!!!!")
            sys.exit()
        args_ops = self.args_container.container_optimize(
            self._args.TSSpredator_path, self._args.fasta_file,
            self._args.annotation_file, self._args.wig_folder,
            self._args.manual, out_folder,
            self._args.strain_name, self._args.max_height,
            self._args.max_height_reduction, self._args.max_factor,
            self._args.max_factor_reduction, self._args.max_base_height,
            self._args.max_enrichment_factor, self._args.max_processing_factor,
            self._args.utr_length, self._args.lib,
            self._args.output_prefix, self._args.cluster,
            self._args.length, self._args.core,
            self._args.program, self._args.replicate_match,
            self._args.steps)
        optimize_tss(args_ops)

    def color(self):
        """Color the screenshots."""
        print("Running png files coloring...")
        # Fixed typo in the reported option name ("--track_numer").
        self.check_parameter([self._args.track_number], ["--track_number"])
        self.check_folder([self._args.screenshot_folder])
        color = ColorPNG()
        color.generate_color_png(
                self._args.track_number, self._args.screenshot_folder,
                self._args.ImageMagick_covert_path)

    def terminator(self):
        """Run TransTermHP for detecting terminators."""
        print("Running terminator prediction...")
        if self._args.TransTermHP_path is None:
            print("Please assign the folder where you install TransTermHP.")
            # Without the installation path the run cannot proceed; the
            # old code printed the message but carried on with None.
            sys.exit()
        self.check_folder([self._args.fasta_folder,
                           self._args.annotation_folder,
                           self._args.transcript_folder])
        self.check_no_require_folder([self._args.sRNA])
        project_creator.create_subfolders(
            self._paths.required_folders("terminator"))
        args_term = self.args_container.container_terminator(
            self._args.TransTermHP_path, self._args.expterm_path,
            self._args.RNAfold_path,
            self._paths.transterm_folder, self._args.fasta_folder,
            self._args.annotation_folder, self._args.transcript_folder,
            self._args.sRNA, self._args.statistics,
            self._args.tex_wig_folder, self._args.frag_wig_folder,
            self._args.decrease, self._args.highest_coverage,
            self._args.fuzzy_detect_coverage,
            self._args.fuzzy_within_transcript,
            self._args.fuzzy_downstream_transcript,
            self._args.fuzzy_within_gene,
            self._args.fuzzy_downstream_gene, self._paths.transtermhp_folder,
            self._args.tex_notex_libs, self._args.frag_libs,
            self._args.tex_notex, self._args.replicates_tex,
            self._args.replicates_frag, self._args.table_best,
            self._args.min_loop_length, self._args.max_loop_length,
            self._args.min_stem_length,
            self._args.max_stem_length, self._args.min_U_tail_length,
            self._args.miss_rate, self._args.range_U_tail)
        terminator = Terminator(args_term)
        terminator.run_terminator(args_term)

    def transcript(self):
        """Run Transcriptome assembly."""
        print("Running transcript assembly...")
        self.check_folder([self._args.annotation_folder])
        self.check_no_require_folder([
            self._args.compare_TSS, self._args.compare_genome_annotation,
            self._args.terminator_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("transcript_assembly"))
        args_tran = self.args_container.container_transcript(
            self._args.frag_wig_path, self._args.tex_wig_path,
            self._args.tex_notex,
            self._args.length, self._args.annotation_folder,
            self._args.height, self._args.width,
            self._args.tolerance, self._args.tolerance_coverage,
            self._args.replicates_tex, self._args.replicates_frag,
            self._paths.transcript_assembly_output_folder,
            self._args.compare_TSS, self._args.compare_genome_annotation,
            self._args.TSS_fuzzy, self._args.Tex_treated_libs,
            self._args.fragmented_libs, self._args.compare_feature_genome,
            self._args.table_best, self._args.terminator_folder,
            self._args.fuzzy_term)
        transcript = TranscriptAssembly(args_tran)
        transcript.run_transcript_assembly(args_tran)

    def utr_detection(self):
        """Run UTR detection."""
        print("Running UTR detection...")
        self.check_folder([self._args.annotation_folder,
                           self._args.transcript_assembly_folder,
                           self._args.TSS_folder])
        self.check_no_require_folder([self._args.terminator_folder])
        project_creator.create_subfolders(self._paths.required_folders("utr"))
        args_utr = self.args_container.container_utr(
                self._args.TSS_folder, self._args.annotation_folder,
                self._args.transcript_assembly_folder,
                self._args.terminator_folder,
                self._args.terminator_fuzzy, self._paths.utr_folder,
                self._args.TSS_source, self._args.base_5UTR,
                self._args.UTR_length, self._args.base_3UTR)
        utr = UTRDetection(args_utr)
        utr.run_utr_detection(args_utr)

    def srna_detection(self):
        """sRNA_detection."""
        print("Running sRNA prediction...")
        self.check_folder([self._args.annotation_folder,
                           self._args.transcript_assembly_folder])
        self.check_no_require_folder([self._args.fasta_folder,
                                      self._args.sORF,
                                      self._args.terminator_folder])
        self.check_file([self._args.promoter_table],
                        ["--promoter_table"], False)
        # UTR-derived prediction additionally needs TSS and processing
        # site folders; otherwise they are optional.
        if self._args.UTR_derived_sRNA:
            self.check_folder([self._args.TSS_folder,
                               self._args.processing_site_folder])
        else:
            self.check_no_require_folder([self._args.TSS_folder,
                                          self._args.processing_site_folder])
        project_creator.create_subfolders(self._paths.required_folders("srna"))
        args_srna = self.args_container.container_srna(
                self._args.Vienna_folder, self._args.Vienna_utils,
                self._args.blast_plus_folder,
                self._args.ps2pdf14_path, self._paths.srna_folder,
                self._args.UTR_derived_sRNA, self._args.annotation_folder,
                self._args.TSS_folder, self._args.transcript_assembly_folder,
                self._args.TSS_intergenic_fuzzy, self._args.TSS_5UTR_fuzzy,
                self._args.TSS_3UTR_fuzzy, self._args.TSS_interCDS_fuzzy,
                self._args.import_info, self._args.tex_wig_folder,
                self._args.frag_wig_folder, self._args.processing_site_folder,
                self._args.fasta_folder, self._args.mountain_plot,
                self._args.nr_format, self._args.srna_format,
                self._args.sRNA_database_path, self._args.nr_database_path,
                self._args.cutoff_energy,
                self._args.run_intergenic_TEX_coverage,
                self._args.run_intergenic_noTEX_coverage,
                self._args.run_intergenic_fragmented_coverage,
                self._args.run_antisense_TEX_coverage,
                self._args.run_antisense_noTEX_coverage,
                self._args.run_antisense_fragmented_coverage,
                self._args.intergenic_tolerance,
                self._args.run_utr_TEX_coverage,
                self._args.run_utr_noTEX_coverage,
                self._args.run_utr_fragmented_coverage,
                self._args.max_length, self._args.min_length,
                self._args.tex_notex_libs, self._args.frag_libs,
                self._args.replicates_tex, self._args.replicates_frag,
                self._args.tex_notex, self._args.blast_e_nr,
                self._args.blast_e_srna, self._args.detect_sRNA_in_CDS,
                self._args.table_best, self._args.decrease_intergenic,
                self._args.decrease_utr, self._args.fuzzy_intergenic,
                self._args.fuzzy_utr, self._args.cutoff_nr_hit,
                self._args.sORF, self._args.best_with_all_sRNAhit,
                self._args.best_without_sORF_candidate,
                self._args.overlap_percent_CDS,
                self._args.terminator_folder,
                self._args.terminator_fuzzy_in_CDS,
                self._args.terminator_fuzzy_out_CDS,
                self._args.best_with_terminator,
                self._args.ignore_hypothetical_protein, self._args.TSS_source,
                self._args.min_utr_coverage, self._args.promoter_table,
                self._args.best_with_promoter,
                self._args.ranking_time_promoter, self._args.promoter_name)
        srna = sRNADetection(args_srna)
        srna.run_srna_detection(args_srna)

    def sorf_detection(self):
        """sORF_detection."""
        print("Running sORF prediction...")
        self.check_folder([self._args.transcript_assembly_folder,
                           self._args.annotation_folder,
                           self._args.fasta_folder])
        self.check_no_require_folder([
            self._args.sRNA_folder, self._args.TSS_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("sorf"))
        args_sorf = self.args_container.container_sorf(
            self._paths.sorf_folder, self._args.UTR_derived_sORF,
            self._args.transcript_assembly_folder,
            self._args.annotation_folder,
            self._args.TSS_folder, self._args.utr_length,
            self._args.min_length, self._args.max_length,
            self._args.tex_wig_folder, self._args.frag_wig_folder,
            self._args.cutoff_intergenic_coverage,
            self._args.cutoff_antisense_coverage,
            self._args.cutoff_5utr_coverage,
            self._args.cutoff_3utr_coverage,
            self._args.cutoff_interCDS_coverage,
            self._args.fasta_folder, self._args.tex_notex_libs,
            self._args.frag_libs, self._args.tex_notex,
            self._args.replicates_tex, self._args.replicates_frag,
            self._args.table_best, self._args.sRNA_folder,
            self._args.start_codon, self._args.stop_codon,
            self._args.cutoff_background, self._args.fuzzy_rbs,
            self._args.rbs_not_after_TSS, self._args.print_all_combination,
            self._args.best_no_sRNA, self._args.best_no_TSS,
            self._args.ignore_hypothetical_protein,
            self._args.min_rbs_distance, self._args.max_rbs_distance)
        sorf = sORFDetection(args_sorf)
        sorf.run_sorf_detection(args_sorf)

    def meme(self):
        """Run MEME for promoter motif detection."""
        print("Running promoter detection...")
        self.check_folder([self._args.TSS_folder, self._args.fasta_folder])
        if not self._args.TSS_source:
            self.check_folder([self._args.annotation_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("promoter"))
        args_pro = self.args_container.container_promoter(
            self._args.MEME_path,
            self._paths.promoter_output_folder, self._args.tex_libs,
            self._args.TSS_folder, self._args.fasta_folder,
            self._args.num_motif, self._args.nt_before_TSS,
            self._args.motif_width, self._args.TSS_source,
            self._args.tex_wig_path, self._args.annotation_folder,
            self._args.combine_all, self._args.e_value)
        meme = MEME(args_pro)
        meme.run_meme(args_pro)

    def operon(self):
        """Operon detection."""
        print("Running operon detection...")
        self.check_folder([self._args.TSS_folder, self._args.annotation_folder,
                           self._args.transcript_folder,
                           self._args.UTR5_folder, self._args.UTR3_folder])
        self.check_no_require_folder([self._args.term_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("operon"))
        args_op = self.args_container.container_operon(
            self._args.TSS_folder, self._args.annotation_folder,
            self._args.transcript_folder, self._args.UTR5_folder,
            self._args.UTR3_folder, self._args.term_folder,
            self._args.TSS_fuzzy, self._args.term_fuzzy,
            self._args.min_length, self._args.statistics,
            self._paths.operon_output_folder, self._args.combine_gff,
            self._paths.operon_statistics_folder)
        operon = OperonDetection(args_op)
        operon.run_operon(args_op)

    def circrna(self):
        """circRNA detection."""
        print("Running circular RNA prediction...")
        self.check_folder([self._args.fasta_path, self._args.annotation_path])
        self.check_no_require_folder([self._args.tex_bam_path,
                                      self._args.fragmented_bam_path])
        project_creator.create_subfolders(
            self._paths.required_folders("circrna"))
        args_circ = self.args_container.container_circrna(
            self._args.align, self._args.process, self._args.fasta_path,
            self._args.annotation_path, self._args.tex_bam_path,
            self._args.fragmented_bam_path,
            self._paths.read_folder, self._paths.circrna_stat_folder,
            self._args.support_reads,
            self._args.segemehl_folder, self._args.samtools_path,
            self._args.start_ratio, self._args.end_ratio,
            self._args.ignore_hypothetical_protein,
            self._paths.circrna_output_folder)
        circ = CircRNADetection(args_circ)
        circ.run_circrna(args_circ)

    def goterm(self):
        """Go term discovery."""
        print("Running GO term mapping...")
        self.check_folder([self._args.annotation_path])
        self.check_no_require_folder([self._args.transcript_path])
        self.check_file([self._args.UniProt_id, self._args.go_obo,
                         self._args.goslim_obo],
                        ["--UniProt_id", "--go.obo", "--goslim_obo"], True)
        project_creator.create_subfolders(
            self._paths.required_folders("go_term"))
        args_go = self.args_container.container_goterm(
            self._args.annotation_path,
            self._paths.goterm_output_folder, self._args.UniProt_id,
            self._args.go_obo, self._args.goslim_obo,
            self._args.transcript_path)
        goterm = GoTermFinding(args_go)
        goterm.run_go_term(args_go)

    def srna_target(self):
        """sRNA target prediction."""
        print("Running sRNA target prediction...")
        self.check_folder([self._args.fasta_path, self._args.sRNA_path,
                           self._args.annotation_path])
        project_creator.create_subfolders(
            self._paths.required_folders("srna_target"))
        args_tar = self.args_container.container_srna_target(
            self._args.Vienna_folder, self._args.annotation_path,
            self._args.fasta_path, self._args.sRNA_path,
            self._args.query_sRNA, self._args.program,
            self._args.interaction_length, self._args.window_size_target,
            self._args.span_target, self._args.window_size_srna,
            self._args.span_srna,
            self._args.unstructured_region_RNAplex_target,
            self._args.unstructured_region_RNAplex_srna,
            self._args.unstructured_region_RNAup, self._args.energy_threshold,
            self._args.duplex_distance, self._args.top,
            self._paths.starget_output_folder, self._args.process_rnaplex,
            self._args.process_rnaup, self._args.continue_rnaup,
            self._args.potential_target_start, self._args.potential_target_end,
            self._args.target_feature)
        srnatarget = sRNATargetPrediction(args_tar)
        srnatarget.run_srna_target_prediction(args_tar)

    def snp(self):
        """SNP transcript detection."""
        print("Running SNP/mutations calling...")
        self.check_folder([self._args.fasta_path])
        if (self._args.bam_type != "target") and (
                self._args.bam_type != "reference"):
            print("Error: please assign \"target\" or"
                  " \"reference\" to --bam_type!!")
            sys.exit()
        if (self._args.ploidy != "haploid") and (
                self._args.ploidy != "diploid"):
            print("Error: please assign \"haploid\" or"
                  " \"diploid\" to --chromosome_type!!")
            # The bam_type branch exits on a bad value; this branch
            # previously only printed and kept running.
            sys.exit()
        project_creator.create_subfolders(self._paths.required_folders("snp"))
        args_snp = self.args_container.container_snp(
            self._args.samtools_path, self._args.bcftools_path,
            self._args.bam_type,
            self._args.program, self._args.fasta_path,
            self._args.tex_bam_path, self._args.frag_bam_path,
            self._args.quality, self._args.read_depth,
            self._paths.snp_output_folder, self._args.indel_fraction,
            self._args.ploidy)
        snp = SNPCalling(args_snp)
        snp.run_snp_calling(args_snp)

    def ppi(self):
        """PPI network retrieve."""
        print("Running protein-protein interaction networks prediction...")
        self.check_folder([self._args.gff_path])
        self.check_parameter([self._args.proteinID_strains,
                              self._args.species_STRING],
                             ["--proteinID_strains", "--species_STRING"])
        project_creator.create_subfolders(
            self._paths.required_folders("ppi_network"))
        args_ppi = self.args_container.container_ppi(
            self._args.gff_path, self._args.proteinID_strains,
            self._args.without_strain_pubmed, self._args.species_STRING,
            self._args.score, self._paths.ppi_output_folder,
            self._args.node_size, self._args.query)
        ppi = PPINetwork(self._paths.ppi_output_folder)
        ppi.retrieve_ppi_network(args_ppi)

    def sublocal(self):
        """Subcellular localization prediction."""
        print("Running subcellular localization prediction...")
        self.check_folder([self._args.gff_path, self._args.fasta_path])
        self.check_no_require_folder([self._args.transcript_path])
        if (self._args.bacteria_type != "positive") and (
                self._args.bacteria_type != "negative"):
            print("Error: please assign \"positive\" or"
                  " \"negative\" to --bacteria_type!!")
            sys.exit()
        project_creator.create_subfolders(
            self._paths.required_folders("subcellular_localization"))
        args_sub = self.args_container.container_sublocal(
            self._args.Psortb_path, self._args.gff_path,
            self._args.fasta_path, self._args.bacteria_type,
            self._args.difference_multi, self._args.merge_to_gff,
            self._paths.sublocal_output_folder, self._args.transcript_path)
        sublocal = SubLocal(args_sub)
        sublocal.run_sub_local(args_sub)

    def ribos(self):
        """Riboswitch prediction."""
        print("Running riboswitch prediction...")
        self.check_folder([self._args.gff_path, self._args.fasta_path,
                           self._args.tss_path, self._args.transcript_path])
        self.check_file([self._args.riboswitch_ID, self._args.Rfam],
                        ["--riboswitch_ID", "--Rfam"], True)
        project_creator.create_subfolders(
            self._paths.required_folders("riboswitch"))
        args_ribo = self.args_container.container_ribos(
            self._args.infernal_path, self._args.riboswitch_ID,
            self._args.gff_path, self._args.fasta_path,
            self._args.tss_path, self._args.transcript_path,
            self._args.Rfam, self._paths.ribos_output_folder,
            self._args.e_value,
            self._args.output_all, self._paths.database_folder,
            self._args.fuzzy, self._args.start_codon,
            self._args.min_dist_rbs, self._args.max_dist_rbs,
            self._args.fuzzy_rbs, self._args.UTR_length)
        ribos = Ribos(args_ribo)
        ribos.run_ribos(args_ribo)

    def screen(self):
        """Generate screenshots."""
        print("Running screenshot generating...")
        self.check_file([self._args.main_gff, self._args.fasta],
                        ["--main_gff", "--fasta"], True)
        if self._args.side_gffs is not None:
            for gff in (self._args.side_gffs.split(",")):
                gff = gff.strip()
                if not os.path.isfile(gff):
                    print("Error: The --side_gffs no exist!!")
                    sys.exit()
        if self._args.output_folder is None:
            print("Error: please assign --output_folder!!")
            sys.exit()
        if self._args.present not in ("expand", "collapse", "squish"):
            print("Error: please assign \"expand\" or "
                  "\"collapse\" or \"squish\" to --present!!")
            sys.exit()
        args_sc = self.args_container.container_screen(
            self._args.main_gff, self._args.side_gffs,
            self._args.fasta, self._args.frag_wig_folder,
            self._args.tex_wig_folder, self._args.height,
            self._args.tex_libs, self._args.frag_libs,
            self._args.present, self._args.output_folder)
        screen = Screen(args_sc)
        screen.screenshot(args_sc)
# ---- Example #10 (score: 0) ----
class SubLocal(object):
    '''detection of subcellular localization'''
    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS'''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (cds.seq_id == ta.seq_id):
                    if ((cds.end < ta.end) and (cds.end > ta.start) and
                        (cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (cds.start < ta.end) and
                            (cds.end >= ta.end)) or (
                                (cds.end >= ta.end) and
                                (cds.start <= ta.start)) or (
                                    (cds.end <= ta.end) and
                                    (cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()
        log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa", prefix,
                                             None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path, "_".join([prefix, "transcript.gff"])),
                log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff), fasta,
                                    dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(tmp_path, "_".join([prefix,
                                                         "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file, out_raw,
                out_err, log):
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw,
             stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result,
                    log):
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(
            os.path.join(tmp_result, "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path, "_".join([prefix,
                                                         "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file, out_raw,
                         out_err, log)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file, out_raw,
                         out_err, log)
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacteria type! "
                  "Please assign positive or negative.".format(args_sub.gram))
            sys.exit()
        log.write(
            "\t" +
            os.path.join(tmp_result, "_".join([prefix, self.endfix_raw])) +
            " is temporary generated.\n")
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file,
                        log):
        '''extract the result of psortb'''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        extract_psortb(
            os.path.join(tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
            os.path.join(tmp_psortb_path,
                         "_".join([prefix, self.endfix_table])), None, None,
            args_sub.fuzzy)
        log.write("\t" + os.path.join(tmp_psortb_path, "_".join(
            [prefix, self.endfix_table])) + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        out = open(out_all + "_tmp", "w")
        fh = open(out_all, "r")
        out.write("\t".join([
            "#Genome", "Protein", "Strand", "Start", "End", "Location", "Score"
        ]) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[0] != "#Genome":
                out.write("\t".join(row) + "\n")
        out.close()
        fh.close()
        shutil.move(out_all + "_tmp", out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result,
                        log):
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                log.write("\t" + merge_table + "\n")
                self._remove_header(merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_folder = os.path.join(stat_path, prefix)
                stat_file = os.path.join(
                    stat_folder, "_".join(["stat", prefix, "sublocal.csv"]))
                stat_sublocal(merge_table, os.path.join(stat_folder, prefix),
                              stat_file)
                for file_ in os.listdir(stat_folder):
                    log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff), log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all, self.all_tmp_path,
                             self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff), log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result,
                                 log)
        self._remove_tmps(args_sub)
Beispiel #11
0
class GoTermFinding(object):
    '''Retrieval of GO terms from UniProt for all / expressed CDSs,
    plus merged tables and GOslim statistics.'''

    def __init__(self, args_go):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Parallel output trees: one for every CDS, one restricted to
        # CDSs overlapping transcripts (when transcripts are supplied).
        self.out_all = os.path.join(args_go.out_folder, "all_CDS")
        self.out_express = os.path.join(args_go.out_folder, "expressed_CDS")
        self.result_all_path = os.path.join(self.out_all, "Go_term_results")
        self.result_express_path = os.path.join(self.out_express,
                                                "Go_term_results")
        self.gff_path = os.path.join(args_go.gffs, "tmp")
        if args_go.trans is not None:
            self.tran_path = os.path.join(args_go.trans, "tmp")
        else:
            self.tran_path = None
        self.stat_all_path = os.path.join(self.out_all, "statistics")
        self.stat_express_path = os.path.join(self.out_express,
                                              "statistics")
        # Name of the per-group merged UniProt table.
        self.all_strain = "all_strains_uniprot.csv"

    def _retrieve_go(self, uniprot, out_path, type_):
        '''Extract GO terms from the UniProt database for every genome
        gff; type_ selects "all" or "express" (transcript-filtered) mode.'''
        prefixs = []
        for gff in os.listdir(self.gff_path):
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(out_path, prefix))
            out_file = os.path.join(out_path, prefix,
                                    "_".join([prefix, "uniprot.csv"]))
            print("extracting Go terms of {0} from UniProt...".format(prefix))
            if self.tran_path is not None:
                tran_file = os.path.join(self.tran_path,
                                         "_".join([prefix, "transcript.gff"]))
            else:
                tran_file = None
            retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff),
                             out_file, tran_file, type_)

    def _merge_files(self, gffs, out_path, out_folder):
        '''Merge per-genome uniprot.csv files of each gff_folder group
        into one table per group, then replace out_path's contents with
        the merged group folders.'''
        folders = []
        for folder in os.listdir(gffs):
            if folder.endswith("gff_folder"):
                folder_prefix = folder.replace(".gff_folder", "")
                folder_path = os.path.join(out_folder, folder_prefix)
                self.helper.check_make_folder(folder_path)
                folders.append(folder_path)
                filenames = []
                for gff in os.listdir(os.path.join(gffs, folder)):
                    if gff.endswith(".gff"):
                        filenames.append(gff.replace(".gff", ""))
                out_all = os.path.join(folder_path, self.all_strain)
                if len(filenames) > 1:
                    # Multiple genomes in the group: concatenate their
                    # tables (removing a stale merged table first) and
                    # keep a copy of each individual table.
                    if self.all_strain in os.listdir(folder_path):
                        os.remove(out_all)
                    for filename in filenames:
                        csv_file = "_".join([filename, "uniprot.csv"])
                        self.helper.merge_file(os.path.join(out_path,
                                               filename, csv_file), out_all)
                        shutil.copy(os.path.join(out_path, filename, csv_file),
                                    folder_path)
                else:
                    # Single genome: its table is the merged table.
                    shutil.copyfile(os.path.join(out_path, filenames[0],
                                    "_".join([filenames[0], "uniprot.csv"])),
                                    out_all)
        self.helper.remove_all_content(out_path, None, "dir")
        self.helper.remove_all_content(out_path, None, "file")
        for folder in folders:
            # os.path.basename is portable; splitting on "/" is not.
            folder_prefix = os.path.basename(folder)
            shutil.move(folder, os.path.join(out_path, folder_prefix))

    def _stat(self, out_path, stat_path, go, goslim, out_folder):
        '''Map GO terms to GOslim, compute statistics and move the
        generated figures into a per-genome figs folder.'''
        for folder in os.listdir(out_path):
            strain_stat_path = os.path.join(stat_path, folder)
            self.helper.check_make_folder(strain_stat_path)
            fig_path = os.path.join(strain_stat_path, "figs")
            # The folder is named "figs"; guard on the actual path so a
            # pre-existing folder does not make os.mkdir raise.
            if not os.path.exists(fig_path):
                os.mkdir(fig_path)
            print("Computing statistics of {0}".format(folder))
            map2goslim(goslim, go,
                       os.path.join(out_path, folder, self.all_strain),
                       os.path.join(strain_stat_path,
                                    "_".join(["stat", folder + ".csv"])),
                       out_folder)
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_three_roots.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_molecular_function.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_cellular_component.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_biological_process.png"])

    def run_go_term(self, args_go):
        '''Entry point: retrieve, merge and analyze GO terms for all
        CDSs, and additionally for expressed CDSs when transcripts are
        provided.'''
        for gff in os.listdir(args_go.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_go.gffs, gff))
        self.multiparser.parser_gff(args_go.gffs, None)
        if args_go.trans is not None:
            self.multiparser.parser_gff(args_go.trans, "transcript")
        print("Computing all CDS...")
        self._retrieve_go(args_go.uniprot, self.result_all_path, "all")
        self._merge_files(args_go.gffs, self.result_all_path, self.out_all)
        self._stat(self.result_all_path, self.stat_all_path, args_go.go,
                   args_go.goslim, self.out_all)
        if args_go.trans is not None:
            print("Computing express CDS...")
            self._retrieve_go(args_go.uniprot, self.result_express_path,
                              "express")
            self._merge_files(args_go.gffs, self.result_express_path,
                              self.out_express)
            self._stat(self.result_express_path, self.stat_express_path,
                       args_go.go, args_go.goslim, self.out_express)
        self.helper.remove_tmp(args_go.gffs)
        if args_go.trans is not None:
            self.helper.remove_tmp(args_go.trans)
Beispiel #12
0
class TranscriptAssembly(object):
    '''Transcript assembly from wig coverage files, with optional
    comparison against TSSs, genes and terminators.'''

    def __init__(self, args_tran):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # Names/paths of the various intermediate files used below.
        self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                     "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                     "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
                     "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
                     "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
                     "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
                     "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                     "overlap": os.path.join(
                         self.gff_outfolder, "tmp_overlap")}
        self.frag = "transcript_assembly_fragment.gff"
        self.tex = "transcript_assembly_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type, strain,
                            libs, args_tran):
        '''Assemble transcripts for one strain from its forward and
        reverse wig files.'''
        print("Computing transcript assembly for {0}...".format(strain))
        out = os.path.join(args_tran.out_folder, "_".join([strain, wig_type]))
        assembly(wig_f, wig_r, wig_folder, libs, out, wig_type, args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        '''Run the assembly for every strain found in the wig folder;
        returns the list of strain names.'''
        strains = []
        wig_folder = os.path.join(wigs, "tmp")
        for wig in os.listdir(wig_folder):
            if wig.endswith("_forward.wig"):
                strains.append(wig.replace("_forward.wig", ""))
        for strain in strains:
            f_file = os.path.join(wig_folder, "_".join(
                [strain, "forward.wig"]))
            r_file = os.path.join(wig_folder, "_".join(
                [strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran):
        '''Compare transcripts against TSSs; updates both files in place
        with the association information and writes statistics.'''
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
                self.gff_outfolder,
                os.path.join(args_tran.compare_tss, "tmp"),
                "transcript", "TSS")
        print("Comparing of Transcript assembly and TSS file...")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                    self.stat_path, "".join([
                        "stat_compare_Transcriptome_assembly_TSS_",
                        ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    # Replace the originals with the annotated, sorted
                    # versions produced by stat_ta_tss.
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                            self.tmps["tss_ta"], os.path.join(
                                args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])

    def _compare_cds(self, tas, args_tran):
        '''Compare transcripts against genes/CDSs; updates both files in
        place and writes statistics.'''
        self.multiparser.parser_gff(args_tran.compare_cds, None)
        self.multiparser.combine_gff(
            self.gff_outfolder, os.path.join(args_tran.compare_cds, "tmp"),
            "transcript", None)
        print("Comparing of Transcript assembly and gene...")
        cds_folder = os.path.join(args_tran.compare_cds, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(self.stat_path, "".join([
                "stat_compare_Transcriptome_assembly_gene_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    # NOTE(review): unlike _compare_tss, the removal and
                    # rewrite here target args_tran.compare_cds directly
                    # rather than its tmp sub-folder — confirm intended.
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.compare_cds, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(self.tmps["gff_ta"], os.path.join(
                        args_tran.compare_cds, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])

    def _compare_tss_cds(self, tas, args_tran):
        '''Dispatch the TSS and/or gene comparisons depending on which
        inputs were supplied.'''
        if (args_tran.compare_tss is not None) and (
                args_tran.compare_cds is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
            self._compare_tss(tas, args_tran)
        elif (args_tran.compare_cds is not None) and (
                args_tran.compare_tss is None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
        elif (args_tran.compare_cds is None) and (
                args_tran.compare_tss is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_tss(tas, args_tran)

    def _for_one_wig(self, type_, args_tran):
        '''Assemble transcripts for one library type ("tex_notex" or
        "fragment") and sort the resulting gff files; returns strains.'''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Computing {0} wig files....".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            out = os.path.join(self.gff_outfolder, "_".join([
                strain, "transcript_assembly", type_ + ".gff"]))
            self.helper.sort_gff(os.path.join(args_tran.out_folder,
                                 "_".join([strain, type_])), out)
            os.remove(os.path.join(args_tran.out_folder,
                                   "_".join([strain, type_])))
        return strains

    def _for_two_wigs(self, strains, args_tran):
        '''Combine fragment and tex_notex assemblies per strain when both
        exist; otherwise rename the single assembly to the final name.'''
        if (args_tran.frag_wigs is not None) and (
                args_tran.tex_wigs is not None):
            print("merge fragment and tex treat one ....")
            for strain in strains:
                frag_gff = os.path.join(self.gff_outfolder,
                                        "_".join([strain, self.frag]))
                tex_gff = os.path.join(self.gff_outfolder,
                                       "_".join([strain, self.tex]))
                for gff in os.listdir(self.gff_outfolder):
                    if "transcript_assembly" in gff:
                        filename = gff.split("_transcript_assembly_")
                        if (strain == filename[0]) and (
                                "tex_notex.gff" == filename[1]):
                            tex_file = gff
                        elif (strain == filename[0]) and (
                                "fragment.gff" == filename[1]):
                            frag_file = gff
                combine(os.path.join(self.gff_outfolder, frag_file),
                        os.path.join(self.gff_outfolder, tex_file),
                        args_tran.tolerance,
                        os.path.join(self.gff_outfolder,
                                     "_".join([strain, self.endfix_tran])))
                os.remove(frag_gff)
                os.remove(tex_gff)
        else:
            if args_tran.frag_wigs is not None:
                for strain in strains:
                    frag_gff = os.path.join(
                            self.gff_outfolder, "_".join([strain, self.frag]))
                    final_gff = os.path.join(
                            self.gff_outfolder,
                            "_".join([strain, self.endfix_tran]))
                    shutil.move(frag_gff, final_gff)
            elif args_tran.tex_wigs is not None:
                for strain in strains:
                    tex_gff = os.path.join(
                            self.gff_outfolder, "_".join([strain, self.tex]))
                    final_gff = os.path.join(
                            self.gff_outfolder,
                            "_".join([strain, self.endfix_tran]))
                    shutil.move(tex_gff, final_gff)

    def _post_modify(self, tas, args_tran):
        '''Fill gaps in each assembled transcript using the reference
        annotation and merge transcripts longer than the length cutoff.'''
        for ta in tas:
            # Find the annotation gff matching this transcript prefix;
            # gff keeps the matched name after the break.
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} referring to {1}...".format(ta, gff))
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                     "_".join([ta, self.endfix_tran])),
                     "overlap", self.tmps["overlap"])
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                     "_".join([ta, self.endfix_tran])),
                     "uni", self.tmps["uni"])
            tmp_merge = os.path.join(self.gff_outfolder, self.tmps["merge"])
            # Remove a stale merge file before appending; the original
            # substring test ("tmp_merge" in the folder *path*) never
            # matched, so a leftover file from a previous iteration was
            # merged into the next strain's output.
            if os.path.exists(tmp_merge):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder, "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(final_out,
                        os.path.join(self.tmps["tran"],
                                     "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        # The tmp_tran folder now holds the finished gffs; promote it.
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        '''Clean up all temporary wig and gff folders after the run.'''
        if args_tran.frag_wigs is not None:
            self.helper.remove_wigs(args_tran.frag_wigs)
        if args_tran.tex_wigs is not None:
            self.helper.remove_wigs(args_tran.tex_wigs)
        if args_tran.gffs is not None:
            self.helper.remove_tmp(args_tran.gffs)
        if args_tran.compare_cds is not None:
            self.helper.remove_tmp(args_tran.compare_cds)
        if args_tran.compare_tss is not None:
            self.helper.remove_tmp(args_tran.compare_tss)
        if args_tran.terms is not None:
            self.helper.remove_tmp(args_tran.terms)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran):
        '''Compare transcripts with terminators when terminators exist.'''
        if args_tran.terms is not None:
            print("comparing between terminators and transcripts...")
            self.multiparser.parser_gff(args_tran.terms, "term")
            self.multiparser.combine_gff(
                    args_tran.gffs,
                    os.path.join(args_tran.terms, "tmp"), None, "term")
            compare_term_tran(self.gff_outfolder,
                              os.path.join(args_tran.terms, "tmp"),
                              args_tran.fuzzy_term, args_tran.fuzzy_term,
                              args_tran.out_folder, "transcript")

    def run_transcript_assembly(self, args_tran):
        '''Entry point: assemble transcripts from the supplied wig
        libraries, post-process them against the annotation, run the
        requested comparisons and generate the final table.'''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            print("Error: there is no wigs files!!!!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran)
        tas = []
        if args_tran.gffs is not None:
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(os.path.join(args_tran.gffs, gff),
                                         self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(args_tran.gffs, os.path.join(
                args_tran.gffs, "tmp"), None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff"):
                    if os.path.getsize(os.path.join(self.tran_path, ta)) != 0:
                        tas.append(ta.replace("_" + self.endfix_tran, ""))
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran)
        self._compare_term_tran(args_tran)
        gen_table_transcript(self.gff_outfolder, args_tran)
        self._remove_file(args_tran)
Beispiel #13
0
class SNPCalling(object):
    '''detection of SNP

    Merges/sorts the BAM files per sample, runs samtools mpileup and
    bcftools call in one or more BAQ modes, splits the resulting vcf
    files per genome and post-processes them into SNP tables, potential
    sequences and statistics.
    '''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # The output sub-folder name reflects whether the mapped reads come
        # from a related genome or from the reference genome itself.
        if args_snp.types == "related_genome":
            file_type = "compare_related_and_reference_genomes"
        else:
            file_type = "mutations_of_reference_genomes"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        # "depth" is a file-name prefix: the sample name is appended to it
        # directly (no path separator) wherever depth files are handled.
        self.outputs = {"table": os.path.join(
                        args_snp.out_folder, file_type, "SNP_tables"),
                        "raw": os.path.join(
                        args_snp.out_folder, file_type, "SNP_raw_outputs"),
                        "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
                        "depth": os.path.join(args_snp.out_folder, "tmp_depth")}
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam"),
                     "bams": []}
        self.header = os.path.join(args_snp.out_folder, "header")
        # Folder/file-name parts for the three BAQ modes of samtools mpileup.
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _transcript_snp(self, fasta, out_table_prefix, type_,
                        prefix, bam_datas, table_path, args_snp):
        """Post-process the raw vcf of one genome/BAQ mode for every sample.

        snp_detect (external helper) consumes the raw vcf and the depth
        file and writes the SNP table, potential sequences and statistics;
        the generated .png figures are then moved to the figure folder.
        """
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(self.stat_path, "_".join([
                "stat", "_".join([prefix, self.baqs[type_], bam["sample"]]),
                "SNP"]))
            snp_file = os.path.join(self.outputs["raw"], prefix, "_".join(
                [prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            snp_detect(
                fasta, snp_file, self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
            self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        """Return the bcftools call option string: "-vcO" for the
        consensus caller ("c"), "-vmO" for the multiallelic caller."""
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bcf_para

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas, log):
        """Run samtools mpileup (in the requested BAQ mode) and bcftools
        call for every sample, producing one raw vcf per sample."""
        bcf_para = self._get_para(args_snp)
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            # "-B" disables BAQ computation, "-E" recomputes it (extended);
            # the default ("with") uses samtools' standard BAQ.
            if type_ == "with":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
            elif type_ == "without":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-B"]
            elif type_ == "extend":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-E"]
            if args_snp.rg:
                command = command + ["-ugf", fasta_file, bam_file]
            else:
                command = command + ["--ignore-RG", "-ugf", fasta_file, bam_file]
            # mpileup output is shell-redirected into the temporary bcf file.
            log.write(" ".join(command) + ">" + self.outputs["tmp"] + "\n")
            os.system(" ".join(command) + ">" + self.outputs["tmp"])
            bam["vcf"] = os.path.join(self.outputs["raw"], "_".join(
                [self.baqs[type_], bam["sample"] + ".vcf"]))
            # "--ploidy" is only passed for haploid calling (chrom == "1");
            # NOTE(review): for any other chrom value than "1"/"2" no
            # bcftools call is issued at all — confirm this is intended.
            if args_snp.chrom == "1":
                log.write(" ".join([
                      args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]]) + "\n")
                call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]])
            elif args_snp.chrom == "2":
                log.write(" ".join([args_snp.bcftools_path, "call",
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]]) + "\n")
                call([args_snp.bcftools_path, "call",
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]])
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(self.outputs["raw"]):
            log.write("\t" + os.path.join(self.outputs["raw"], file_) + "\n")

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog, log):
        """Split the raw vcf files per genome, based on the sequence
        headers found in every input fasta.

        Per-genome vcf files are written into per-genome sub-folders of
        the raw-output folder; the unsplit vcf files are removed at the
        end.  Returns the fasta prefixes (file names without extension).

        NOTE(review): ``seq_names`` accumulates across fastas, so a later
        genome's split vcf also keeps lines belonging to genomes seen
        earlier in the listing order — verify this is intended.
        """
        seq_names = []
        fa_prefixs = []
        log.write("Parsing Vcf files by comparing fasta information.\n")
        for fa in os.listdir(args_snp.fastas):
            if (fa != "all.fa") and (not fa.endswith(".fai")):
                with open(os.path.join(args_snp.fastas, fa)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            seq_names.append(line[1:])
                fa_prefix = ".".join(fa.split(".")[:-1])
                fa_prefixs.append(fa_prefix)
                vcf_folder = os.path.join(
                    self.outputs["raw"], fa_prefix)
                # The per-genome folders only need to be created on the
                # first BAQ-mode pass (num_prog == 0).
                if num_prog == 0:
                    self.helper.check_make_folder(vcf_folder)
                    self.helper.check_make_folder(os.path.join(
                        self.outputs["table"], fa_prefix))
                self.helper.check_make_folder(
                    os.path.join(self.seq_path, self.baqs[type_], fa_prefix))
                for vcf in os.listdir(self.outputs["raw"]):
                    if vcf.endswith(".vcf"):
                        out = open(os.path.join(vcf_folder, "_".join(
                            [fa_prefix, vcf])), "w")
                        with open(os.path.join(self.outputs["raw"],
                                  vcf)) as vh:
                            for line in vh:
                                line = line.strip()
                                # Keep all header lines, and data lines
                                # whose chromosome is a known sequence.
                                if line.startswith("#"):
                                    out.write(line + "\n")
                                else:
                                    if line.split("\t")[0] in seq_names:
                                        out.write(line + "\n")
                        out.close()
                        log.write("\t" + os.path.join(vcf_folder, "_".join(
                            [fa_prefix, vcf])) + " is generated.\n")
        for vcf in os.listdir(self.outputs["raw"]):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(self.outputs["raw"], vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog, log):
        """Run the full pipeline for one BAQ mode: call SNPs on the merged
        fasta, split the vcf per genome, then filter and tabulate."""
        self._run_tools(all_fasta, type_, args_snp, bam_datas, log)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog, log)
        log.write("Running transcript_SNP.py to do statistics, filter SNPs, "
                  "and generate potential sequences.\n")
        log.write("The following files are generated:\n")
        for fa_prefix in fa_prefixs:
            # Pick the fasta file whose name contains the prefix.
            for fasta in os.listdir(args_snp.fastas):
                if fa_prefix in fasta:
                    fasta_file = os.path.join(args_snp.fastas, fasta)
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(table_path, "_".join(
                [fa_prefix, self.baqs[type_]]))
            self._transcript_snp(
                fasta_file, table_prefix,
                type_, fa_prefix, bam_datas, table_path, args_snp)
            seq_path = os.path.join(self.seq_path, self.baqs[type_], fa_prefix)
            for folder in (table_path, self.stat_path, seq_path, self.fig_path):
                for file_ in os.listdir(folder):
                    if os.path.isfile(os.path.join(folder, file_)):
                        log.write("\t" + os.path.join(folder, file_) + "\n")

    def _run_program(self, all_fasta, bam_datas, args_snp, log):
        """Run the pipeline once for every requested BAQ mode; abort on an
        unknown program name."""
        num_prog = 0
        log.write("Running Samtools to mpileup, and using Bcftools to "
                  "call snp.\n")
        log.write("Please make sure the version of Samtools and Bcftools "
                  "are both at least 1.3.1.\n")
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
                log.write("Running SNP calling with BAQ.\n")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
                log.write("Running SNP calling without BAQ.\n")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                log.write("Running SNP calling extend BAQ.\n")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                log.write("No valid program can be found, please assign"
                          "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".\n")
                sys.exit()
            self._run_sub(args_snp, all_fasta, type_, bam_datas, num_prog, log)
            num_prog += 1

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file, log):
        """Build and run one samtools merge/sort command.

        For "merge", ``bam_file`` is a space-separated list of inputs and
        the result goes to the shared whole-reads BAM; for "sort",
        ``bam_file`` is the output and the input is either the whole-reads
        BAM (type_file == "all") or ``type_file`` itself.
        """
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            if type_file == "all":
                command = (" ".join([samtools_path, sub_command,
                                     "-o", bam_file, self.bams["whole"]]))
            else:
                command = (" ".join([samtools_path, sub_command,
                                     "-o",
                                     bam_file, type_file]))
        log.write(command + "\n")
        os.system(command)

    def _merge_bams(self, args_snp, bam_datas, log):
        """Merge (if needed) and sort the BAM files of every sample, then
        index them and write a per-sample depth file."""
        bams = []
        num_normal = 0
        num_frag = 0
        log.write("Using Samtools to merge and sort BAM files.\n")
        log.write("Please make sure the version of Samtools is at least 1.3.1.\n")
        for bam in bam_datas:
            bam["bam_number"] = 0
            out_bam = os.path.join(args_snp.out_folder, bam["sample"] + ".bam")
            # A single BAM only needs sorting; multiple BAMs are merged
            # into the whole-reads BAM first and then sorted.
            if len(bam["bams"]) == 1:
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(
                    args_snp.samtools_path, "sort",
                    out_bam, bam["bams"][0], log)
                bam["bam_number"] = 1
            else:
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all", log)
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(
                    args_snp.samtools_path, "sort",
                    out_bam, "all", log)
                bam["bam_number"] += 1
            if os.path.exists(self.bams["whole"]):
                os.remove(self.bams["whole"])
            # Depth file name is the "tmp_depth" prefix plus the sample name.
            out_depth = open(self.outputs["depth"] + bam["sample"], "w")
            log.write(" ".join([args_snp.samtools_path, "index",  out_bam]) + "\n")
            call([args_snp.samtools_path, "index",  out_bam])
            log.write(" ".join([args_snp.samtools_path, "depth",  out_bam]) + "\n")
            call([args_snp.samtools_path, "depth",  out_bam],
                 stdout=out_depth)
            out_depth.close()
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        log.write("\t" + self.bams["whole"] + " is temporary generated "
                  "(be deleted afterward).\n")
        for file_ in os.listdir(args_snp.out_folder):
            if os.path.isfile(os.path.join(args_snp.out_folder, file_)):
                log.write("\t" + os.path.join(args_snp.out_folder, file_) + "\n")

    def _modify_header(self, fastas):
        """Normalize the header lines of all fasta files in ``fastas``."""
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        """Append the reference sequence names found in a BAM header
        (the SN:<name> field of @SQ lines) to ``seq_names``."""
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        fh = open(self.header, "r")
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                if row[1].split(":")[1] not in seq_names:
                    seq_names.append(row[1].split(":")[1])
        fh.close()

    def _get_genome_name(self, args_snp, bam_datas):
        """Collect the distinct reference names of all sample BAMs."""
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path,
                             bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        """Delete the per-sample BAM/index/depth files and the header dump."""
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if os.path.exists(bam_file):
                os.remove(bam_file)
            if os.path.exists(bam_file + ".bai"):
                os.remove(bam_file + ".bai")
            if os.path.exists(self.header):
                os.remove(self.header)
            os.remove(self.outputs["depth"] + bam["sample"])

    def _extract_bams(self, bams, log):
        """Parse the --bam_files strings ("sample:bam1,bam2,...") into
        per-sample records; abort on a malformed entry or missing file."""
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                log.write("the format of --bam_files is wrong!\n")
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    log.write(file_ + " is not found.\n")
                    sys.exit()
            bam_datas.append({"sample": datas[0],
                              "rep": len(datas[-1].split(",")),
                              "bams": datas[-1].split(",")})
        return bam_datas

    def _merge_fasta(self, fastas, log):
        """Concatenate all fasta files into all.fa, skipping entries whose
        header was already seen; return the merged file path."""
        all_fasta = os.path.join(fastas, "all.fa")
        names = []
        out = open(all_fasta, "w")
        print_ = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fa")) or (
                    fasta.endswith(".fasta")) or (
                    fasta.endswith(".fna")):
                with open(os.path.join(fastas, fasta)) as fh:
                    for line in fh:
                        line = line.strip()
                        # print_ stays set for the sequence lines that
                        # follow an accepted (unseen) header.
                        if line.startswith(">"):
                            if line not in names:
                                print_ = True
                                names.append(line)
                            else:
                                print_ = False
                        if print_:
                            out.write(line + "\n")
                log.write(os.path.join(fastas, fasta) + " is loaded.\n")
        out.close()
        return all_fasta

    def run_snp_calling(self, args_snp, log):
        """Entry point of the SNP calling pipeline."""
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas, log)
        bam_datas = self._extract_bams(args_snp.bams, log)
        self._merge_bams(args_snp, bam_datas, log)
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        else:
            print("Detecting mutations now")
            self._run_program(all_fasta, bam_datas, args_snp, log)
            os.remove(self.outputs["tmp"])
            os.remove(all_fasta)
            os.remove(all_fasta + ".fai")
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
        log.write("Remove all the temporary files.\n")
# Beispiel #14 (scraped example marker; score: 0)
class Ribos(object):
    """Riboswitch detection by scanning candidate sequences against the
    Rfam riboswitch covariance models with Infernal (cmscan)."""

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" sub-folders hold the per-strain files produced by the
        # multiparser from the (possibly multi-entry) input files.
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        self.stat_folder = os.path.join(args_ribo.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs")
        self.table_folder = os.path.join(args_ribo.out_folder, "tables")
        self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam")
        # Covariance-model file extracted from Rfam for the riboswitches.
        self.ribos_rfam = os.path.join(args_ribo.database,
                                       "Rfam_riboswitch.cm")
        self.tmp_files = {"fasta": os.path.join(
                                   args_ribo.out_folder, "tmp_fasta"),
                          "scan": os.path.join(
                                  args_ribo.out_folder, "tmp_scan"),
                          "table": os.path.join(
                                   args_ribo.out_folder, "tmp_table")}
        # File-name suffixes of the intermediate and final outputs.
        self.suffixs = {"csv": "riboswitch.csv",
                        "txt": "riboswitch_prescan.txt",
                        "re_txt": "riboswitch_scan.txt",
                        "re_csv": "riboswitch_scan.csv"}

    def _run_infernal(self, args_ribo, seq, type_, prefix):
        """Scan ``seq`` against the riboswitch models with cmscan and
        return the path of the scan report file."""
        scan_file = os.path.join(self.tmp_files["scan"],
                                 "_".join([prefix, self.suffixs[type_]]))
        scan = open(scan_file, "w")
        call([os.path.join(args_ribo.infernal_path, "cmscan"), "--incE",
              str(args_ribo.e_value), "--acc", self.ribos_rfam, seq],
             stdout=scan)
        scan.close()
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo):
        """Run the two-pass scan for every strain annotation.

        Per strain: extract candidate sequences, pre-scan them with
        cmscan, regenerate refined sequences from the pre-scan hits,
        scan those again, and rebuild the result table.  Returns the
        strain prefixes (``prefixs`` extended in place).
        """
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(self.tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("extracting seq of riboswitch candidates of {0}".format(
                      prefix))
                extract_potential_rbs(
                      os.path.join(self.fasta_path, prefix + ".fa"),
                      os.path.join(self.gff_path, gff),
                      os.path.join(self.tss_path, prefix + "_TSS.gff"),
                      os.path.join(self.tran_path, prefix + "_transcript.gff"),
                      first_seq, args_ribo)
                print("pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_infernal(args_ribo, first_seq,
                                                     "txt", prefix)
                sec_seq = os.path.join(self.tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                        self.tmp_files["table"],
                        "_".join([prefix, self.suffixs["csv"]]))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                print("scanning of {0}".format(prefix))
                sec_scan_file = self._run_infernal(args_ribo, sec_seq,
                                                   "re_txt", prefix)
                sec_table = os.path.join(
                        self.tmp_files["table"],
                        "_".join([prefix, self.suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                # The refined table replaces the first-pass table.
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo):
        """Merge the per-sequence results into per-file outputs and
        compute statistics / final gff files.

        For every seq_id of an input gff: the first one starts the
        merged table (copy), later ones are appended; the scan reports
        are copied into the per-file scan folder.
        """
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merge results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                                              self.scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                for entry in self.gff_parser.entries(fh):
                    # Only act when a new seq_id starts.
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(os.path.join(
                                self.tmp_files["table"],
                                "_".join([entry.seq_id, self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix, self.suffixs["csv"]])))
                        else:
                            self.helper.merge_file(os.path.join(
                                self.tmp_files["table"],
                                "_".join([entry.seq_id, self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix, self.suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            self.tmp_files["scan"],
                            "_".join([entry.seq_id, self.suffixs["txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        shutil.copy(os.path.join(
                            self.tmp_files["scan"],
                            "_".join([entry.seq_id, self.suffixs["re_txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                        self.stat_folder,
                        "_".join(["stat", prefix, "riboswitch.txt"]))
                print("compute statistics of {0}".format(prefix))
                stat_and_covert2gff(os.path.join(
                    self.table_folder,
                    "_".join([prefix, self.suffixs["csv"]])),
                    args_ribo.ribos_id, os.path.join(
                        self.gff_outfolder,
                        "_".join([prefix, "riboswitch.gff"])),
                    args_ribo.fuzzy, out_stat)
                fh.close()

    def _remove_tmp(self, args_ribo):
        """Delete the temporary folders created during the run."""
        self.helper.remove_tmp(args_ribo.gffs)
        self.helper.remove_tmp(args_ribo.fastas)
        self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir")

    def _remove_overlap(self, gff_path):
        """Drop overlapping riboswitch hits from every strain table."""
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(
                    os.path.join(os.path.join(
                        self.tmp_files["table"],
                        "_".join([gff.replace(".gff", ""),
                                  self.suffixs["csv"]]))),
                    os.path.join(gff_path, gff))

    def run_ribos(self, args_ribo):
        """Entry point of the riboswitch detection pipeline."""
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_ribo.gffs, gff))
        # Build and index the riboswitch covariance-model database.
        rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam, self.ribos_rfam)
        print("compressing Rfam...")
        call([os.path.join(args_ribo.infernal_path, "cmpress"),
              "-F", self.ribos_rfam])
        prefixs = []
        self.helper.check_make_folder(self.tmp_files["fasta"])
        self.helper.check_make_folder(self.tmp_files["scan"])
        self.helper.check_make_folder(self.tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo)
        self._remove_overlap(self.gff_path)
        self._merge_results(args_ribo)
        mapping_ribos(self.table_folder, args_ribo.ribos_id)
        self._remove_tmp(args_ribo)
# Beispiel #15 (scraped example marker; score: 0)
class TSSpredator(object):

    def __init__(self, args_tss):
        """Set up the helper objects and all output/work paths."""
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS",
                     "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss",
                     "tmp": "tmp"}
        # The transcript tmp folder only exists when transcript files
        # were supplied.
        self.tmps["ta"] = (os.path.join(args_tss.ta_files, "tmp")
                           if args_tss.ta_files is not None else None)
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        """Launch the TSSpredator jar, capturing stdout and stderr into
        log.txt / err.txt inside ``out_path``."""
        print("Running TSSpredator for " + prefix)
        with open(os.path.join(out_path, "log.txt"), "w") as out, \
                open(os.path.join(out_path, "err.txt"), "w") as err:
            call(["java", "-jar", tsspredator_path,
                  config_file], stdout=out, stderr=err)

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus")
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file):
        """Write the TSSpredator configuration file for one strain.

        Creates the strain's MasterTable output folder, then emits all
        parameter lines (cluster/cliff/height settings, the library
        entries produced by _import_lib, the normal-coverage libraries
        and the output prefixes), all derived from ``args_tss``.
        """
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # TSS mode uses the notex libraries as the "normal" background;
        # processing-site mode uses the tex libraries instead.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus")
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus")
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                      out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", args_tss.program, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''Merge the manually curated TSSs into the predicted ones and
        write a comparison statistics file per genome.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # locate the annotation gff that belongs to this genome
            for gff in os.listdir(args_tss.gffs):
                if (".gff" in gff) and (gff[:-4] == tss):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict_file = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict_file, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''Validate the gene annotation against the detected TSSs or
        processing sites and update the annotation gff in place.'''
        print("Running validation of annotation....")
        program = args_tss.program.lower()
        for tss in tsss:
            # find the annotation gff matching this genome
            for gff in os.listdir(args_tss.gffs):
                if (".gff" in gff) and (gff[:-4] == tss):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if program == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif program == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         program)
            # the validated annotation replaces the original gff
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''Compare transcripts with TSSs and re-sort both gff files
        with the comparison results.'''
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_Transcriptome_assembly_",
                        tss, ".csv"]))
            # look for the transcript file that belongs to this genome
            detect = False
            for ta in os.listdir(self.tmps["ta"]):
                parts = ta.split("_transcript")
                if (parts[0] == tss) and (parts[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])

    def _stat_tss(self, tsss, feature):
        '''Generate the statistics files and plots of the prediction
        and move them into the per-genome statistics folder.'''
        print("Running statistaics.....")
        for tss in tsss:
            stat_dir = os.path.join(self.stat_outfolder, tss)
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(stat_dir, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(stat_dir, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            # class plots are written to the cwd; collect them
            self.helper.move_all_content(os.getcwd(), stat_dir,
                                         ["_class", ".png"])
            stat_tsv = os.path.join(self.stat_outfolder, "TSSstatistics.tsv")
            if os.path.exists(stat_tsv):
                shutil.move(stat_tsv,
                            os.path.join(stat_dir, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            # venn plots are also written to the cwd; collect them too
            self.helper.move_all_content(os.getcwd(), stat_dir,
                                         ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                                input_folder,
                                "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''Merge all wig files of one genome into a single forward and a
        single reverse wig file below tmp/.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        # the last field of a lib string is its strand
        targets = {"+": os.path.join("tmp", "merge_forward.wig"),
                   "-": os.path.join("tmp", "merge_reverse.wig")}
        for wig_file in os.listdir(wig_folder):
            wig_path = os.path.join(wig_folder, wig_file)
            for lib in libs:
                info = lib.split(":")
                strand = info[-1]
                if (strand in targets) and (info[0][:-4] in wig_file) and (
                        prefix in wig_file) and (os.path.isfile(wig_path)):
                    Helper().merge_file(wig_path, targets[strand])

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''Re-examine orphan TSSs against the merged wig coverage and
        replace the prediction gff files with the checked ones.'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            gff_name = "_".join([prefix, args_tss.program + ".gff"])
            tmp_tss = os.path.join(self.tmps["tmp"], gff_name)
            pre_tss = os.path.join(self.gff_outfolder, gff_name)
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.references, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.references, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss):
        '''Entry point: run TSSpredator for every genome and all
        post-processing steps (conversion, orphan check, filtering,
        manual merge, statistics, validation, transcript comparison).'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        # check that the annotation attributes are unique before starting
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        # split the inputs into per-genome files (written below "tmp")
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        # one config file (and later one MasterTable) per genome prefix
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # downstream naming uses "processing" instead of "processing_site"
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        # collect the genome names and create their statistics folders
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
# Beispiel #16 (example separator from source aggregation; score: 0)
class MEME(object):
    '''Detection of promoter motifs upstream of TSSs with MEME/GLAM2.'''

    def __init__(self, args_pro):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Multiparser writes the per-genome split files below "tmp"
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # Upstream-sequence fasta files for every TSS class. "all" and
        # "all_no_orph" are bare file names (joined with tmp_folder on
        # use); all the other entries are full paths.
        self.fastas = {
            "pri": os.path.join(self.tmp_folder, "primary.fa"),
            "sec": os.path.join(self.tmp_folder, "secondary.fa"),
            "inter": os.path.join(self.tmp_folder, "internal.fa"),
            "anti": os.path.join(self.tmp_folder, "antisense.fa"),
            "orph": os.path.join(self.tmp_folder, "orphan.fa"),
            "all_no_orph": "without_orphan.fa",
            "all": "all_type.fa",
            "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
            "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")
        }
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        '''Return the program sub-folder (e.g. "MEME"/"GLAM2") of
        out_path, removing a stale result folder of the same run first.'''
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename, fasta, width,
                          args_pro):
        '''Run MEME and/or GLAM2 with one specific motif width.'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (args_pro.program.lower()
                                                    == "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder, "MEME")
            command = [
                args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs",
                str(args_pro.num_motif), "-w",
                str(width), "-maxiter", "100", "-evt",
                str(args_pro.e_value)
            ]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            call(command + [
                "-oc",
                os.path.join(meme_folder, folder),
                os.path.join(input_path, fasta)
            ])
        if (args_pro.program.lower() == "glam2") or (args_pro.program.lower()
                                                     == "both"):
            glam_folder = self._gen_and_check_folder(out_path, folder, "GLAM2")
            # trailing "n" is the GLAM2 alphabet argument (nucleotide)
            call([
                args_pro.glam2_path, "-O",
                os.path.join(glam_folder, folder), "-w",
                str(width), "-b",
                str(width), "-r",
                str(args_pro.num_motif), "-n",
                str(args_pro.end_run), "n",
                os.path.join(input_path, fasta)
            ])

    def _run_small_motif(self, input_path, out_path, filename, fasta, width,
                         args_pro):
        '''Run MEME and/or GLAM2 with a range of motif widths
        ("min-max").'''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join([
            "promoter_motifs", filename,
            "-".join([str(min_width), str(max_width)]), "nt"
        ])
        if (args_pro.program.lower() == "meme") or (args_pro.program.lower()
                                                    == "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder, "MEME")
            command = [
                args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs",
                str(args_pro.num_motif), "-minsites", "0", "-maxsites", "2",
                "-minw",
                str(min_width), "-maxw",
                str(max_width), "-maxiter", "100", "-evt",
                str(args_pro.e_value)
            ]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            call(command + [
                "-oc",
                os.path.join(meme_folder, folder),
                os.path.join(input_path, fasta)
            ])
        # "if" (not "elif") so that program "both" also runs GLAM2,
        # consistent with _run_normal_motif; with "elif" the GLAM2 run
        # was silently skipped whenever "both" was requested.
        if (args_pro.program.lower()
                == "glam2") or (args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(out_path, folder, "GLAM2")
            call([
                args_pro.glam2_path, "-O",
                os.path.join(glam_folder, folder), "-a",
                str(min_width), "-b",
                str(max_width), "-r",
                str(args_pro.num_motif), "-n",
                str(args_pro.end_run), "n",
                os.path.join(input_path, fasta)
            ])

    def _get_fasta_file(self, fasta_path, prefix):
        '''Return the fasta file name of fasta_path matching prefix.

        NOTE(review): when no file matches, the last file listed is
        returned unchanged -- callers appear to rely on a match existing.
        '''
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        '''Check that every gff file of the folder has unique
        attributes.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Merge the per-class fasta files into "all types" and
        "without orphan" files and move everything into input_path.'''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # pri + sec + inter + anti -> tmp_fa; tmp_fa + orph -> tmp_all
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        # remove duplicated sequences from the merged files
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"],
                    "_".join([out_prefix, "allstrain_primary.fa"]))
        shutil.move(self.fastas["sec"],
                    "_".join([out_prefix, "allstrain_secondary.fa"]))
        shutil.move(self.fastas["inter"],
                    "_".join([out_prefix, "allstrain_internal.fa"]))
        shutil.move(self.fastas["anti"],
                    "_".join([out_prefix, "allstrain_antisense.fa"]))
        shutil.move(self.fastas["orph"],
                    "_".join([out_prefix, "allstrain_orphan.fa"]))
        shutil.move(all_type, "_".join([out_prefix, "allstrain_all_types.fa"]))
        shutil.move(all_no_orph,
                    "_".join([out_prefix, "allstrain_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        '''Split the merged "allstrain" fasta files into one file per
        strain; files covering only one strain are removed again.'''
        for fasta in os.listdir(input_path):
            if "allstrain" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                filename = None
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            # header: ">name_pos_strain..." -- everything
                            # from the third "_" field on is the strain
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allstrain")
                                if out is not None:
                                    out.close()
                                out = open(
                                    os.path.join(
                                        input_path, "".join([
                                            filename[0], strain, filename[-1]
                                        ])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                # a file holding a single strain duplicates the
                # "allstrain" file; remove it. num_strain == 0 means the
                # fasta was empty and nothing was written (the previous
                # code crashed or removed a stale file in that case).
                if num_strain == 1:
                    os.remove(
                        os.path.join(
                            input_path,
                            "".join([filename[0], strain, filename[-1]])))
        # guard against no ".fa" file at all (out never opened); the
        # previous unconditional close raised AttributeError on None
        if out is not None:
            out.close()

    def _run_program(self, prefixs, args_pro):
        '''Run the requested motif-detection program(s) for every genome
        and every requested motif width.'''
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                        fasta, width))
                    # "min-max" widths run the ranged variant
                    if "-" in width:
                        self._run_small_motif(input_path, out_path, filename,
                                              fasta, width, args_pro)
                    else:
                        self._run_normal_motif(input_path, out_path, filename,
                                               fasta, width, args_pro)

    def _merge_all_fastas(self, fasta_folder):
        '''Merge every fasta file of fasta_folder into self.all_fasta.'''
        for fasta in os.listdir(fasta_folder):
            if (fasta.endswith(".fa")) or (fasta.endswith(".fna")) or (
                    fasta.endswith(".fasta")):
                self.helper.merge_file(
                    os.path.join(fasta_folder, fasta), self.all_fasta)

    def _combine_file(self, prefixs, args_pro):
        '''Combine all TSS files in the input folder to generate the
        global TSS file for detecting the global promoter.'''
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(self.tss_path, tss),
                                           self.all_tss)
        else:
            # non-source mode uses the classified TSS file names from
            # TSS_class, while the merged copies are read from tss_path
            for tss in os.listdir(
                    os.path.join(args_pro.output_folder, "TSS_class")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(self.tss_path, tss),
                                           self.all_tss)
        # identical in both branches, hence factored out
        self._merge_all_fastas(args_pro.fastas)
        print("Generating fasta file of all fasta files")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(
            os.path.join(args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        '''Delete the temporary folders and files of the run.'''
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program):
        '''Generate the promoter table (csv) from every MEME/GLAM2
        result folder.'''
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (program.lower() == "meme"):
                for folder in os.listdir(
                        os.path.join(output_folder, strain, "MEME")):
                    gen_promoter_table(
                        os.path.join(output_folder, strain, "MEME", folder,
                                     "meme.txt"),
                        os.path.join(output_folder, strain, "MEME", folder,
                                     "meme.csv"), tss_file, "meme")
            if (program.lower() == "both") or (program.lower() == "glam2"):
                for folder in os.listdir(
                        os.path.join(output_folder, strain, "GLAM2")):
                    gen_promoter_table(
                        os.path.join(output_folder, strain, "GLAM2", folder,
                                     "glam2.txt"),
                        os.path.join(output_folder, strain, "GLAM2", folder,
                                     "glam2.csv"), tss_file, "glam2")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''Extract the upstream sequences of the TSSs; when the TSS file
        does not come from ANNOgesic (non-source), classify it first.'''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta), None, None,
                     args_pro, prefix)
        else:
            if (args_pro.gffs is None) or (args_pro.tex_wigs is None) or (
                    args_pro.input_libs is None):
                print("Error: Please assign proper annotation, tex +/- "
                      "wig folder and tex treated libs!!!")
                sys.exit()
            if "TSS_class" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))

            print("Classifying TSS and extracting fasta {0}".format(prefix))
            upstream(
                os.path.join(self.tss_path, tss),
                os.path.join(args_pro.fastas, fasta),
                os.path.join(self.gff_path, prefix + ".gff"),
                os.path.join(args_pro.output_folder, "TSS_class",
                             "_".join([prefix, "TSS.gff"])), args_pro, prefix)

    def run_meme(self, args_pro):
        '''Entry point: detect promoter motifs for every genome.'''
        # remove leftovers of a previous combined run
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(
                    os.path.join(args_pro.fastas, "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path, "fasta",
                                     "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(
                os.path.join(args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("Generating the table")
        self._gen_table(args_pro.output_folder, prefixs, args_pro.combine,
                        args_pro.program)
        self._remove_files(args_pro)
# Beispiel #17 (example separator from source aggregation; score: 0)
class Screen(object):
    '''Prepare wig-library information and produce the screenshot scripts
    (forward/reverse strand) for one strain via gen_screenshot.'''

    def __init__(self, args_sc):
        self.multiparser = Multiparser()
        self.helper = Helper()
        out_folder = os.path.join(args_sc.output_folder, "screenshots")
        # Never overwrite the results of a previous run.
        if os.path.exists(out_folder):
            print("Error: The {0} already exist!!!".format(
                  out_folder))
            sys.exit()
        else:
            os.mkdir(out_folder)
        args_sc.output_folder = out_folder
        # Strain name = fasta filename without its extension.
        filename = args_sc.fasta.split("/")[-1]
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(os.path.join(args_sc.output_folder,
                                                   self.strain))
        self.forward_file = os.path.join(args_sc.output_folder,
                                         self.strain, "forward")
        self.reverse_file = os.path.join(args_sc.output_folder,
                                         self.strain, "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, wig_path, lib_dict):
        '''Append the TEX+ wig files of one strand to lib_dict and pair each
        with the TEX- wig files of the same condition (index 2) and
        replicate (index 3).'''
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(os.path.join(wig_path, flib[0]))
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(os.path.join(wig_path, nlib[0]))

    def screenshot(self, args_sc):
        '''Collect the wig libraries and generate the screenshot scripts.'''
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [], "ff": [], "rf": []}
        f_texs = []
        r_texs = []
        # Bug fix: validate the input BEFORE doing any work; previously this
        # check ran only after gen_screenshot had already been called.
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            print("Error: There are no wig file assigned!!!")
            sys.exit()
        if args_sc.tlibs is not None:
            for lib in args_sc.tlibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    print("Error:Exist a not proper wig files!!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        f_texs.append(lib_datas)
                    else:
                        r_texs.append(lib_datas)
            # Sort by type/condition/replicate so +/- libraries pair stably.
            f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
            r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
            self._import_libs(f_texs, "+", args_sc.tex_wigs, lib_dict)
            self._import_libs(r_texs, "-", args_sc.tex_wigs, lib_dict)
        if args_sc.flibs is not None:
            for lib in args_sc.flibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    print("Error:Exist a not proper wig files!!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        lib_dict["ff"].append(os.path.join(
                                       args_sc.frag_wigs, lib_datas[0]))
                    else:
                        lib_dict["rf"].append(os.path.join(
                                       args_sc.frag_wigs, lib_datas[0]))
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
# Beispiel #18
# 0
class Ribos(object):
    '''detection of riboswitch and RNA thermometer'''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        if args_ribo.tsss is not None:
            self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # Bug fix: compare the program name case-insensitively, consistent
        # with run_ribos() which uses args_ribo.program.lower(). Otherwise a
        # value such as "Both" would skip the folder setup here but still be
        # accepted by run_ribos and crash on the missing attributes.
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                 args_ribo.ribos_out_folder, "riboswitch", args_ribo.database)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                 args_ribo.thermo_out_folder, "RNA_thermometer",
                 args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        '''Build the output/tmp paths, Rfam model path and file suffixes
        for one feature ("riboswitch" or "RNA_thermometer").'''
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {
            "fasta": os.path.join(out_folder, "tmp_fasta"),
            "scan": os.path.join(out_folder, "tmp_scan"),
            "table": os.path.join(out_folder, "tmp_table")
        }
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {
            "csv": feature + ".csv",
            "txt": feature + "_prescan.txt",
            "re_txt": feature + "_scan.txt",
            "re_csv": feature + "_scan.csv"
        }
        return (stat_folder, gff_outfolder, table_folder, scan_folder,
                tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files, suffixs,
                    rfam, log):
        '''Run Infernal cmscan on seq with an e-value ("e_<v>") or score
        ("s_<v>") cutoff and return the path of the scan output file.'''
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        scan = open(scan_file, "w")
        if args_ribo.cutoff.split("_")[0] == "e":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join(
                [args_ribo.cmscan_path, "--incE", value, "--acc", rfam, seq]) +
                      "\n")
            call([args_ribo.cmscan_path, "--incE", value, "--acc", rfam, seq],
                 stdout=scan)
        elif args_ribo.cutoff.split("_")[0] == "s":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join(
                [args_ribo.cmscan_path, "--incT", value, "--acc", rfam, seq]) +
                      "\n")
            call([args_ribo.cmscan_path, "--incT", value, "--acc", rfam, seq],
                 stdout=scan)
        else:
            print("Error: the --cutoff needs to start from 'e' "
                  "(e value) or 's' (score)!")
            log.write("the --cutoff needs to start from 'e' "
                      "(e value) or 's' (score).\n")
            sys.exit()
        scan.close()
        log.write("Done!\n")
        log.write("\t" + scan_file + " is temporary generated.\n")
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam, log):
        '''extract the seq of candidates and scanning the candidates'''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"], prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for {0}".format(
                    prefix))
                if self.tss_path is not None:
                    tss_file = os.path.join(self.tss_path, prefix + "_TSS.gff")
                else:
                    tss_file = None
                log.write("Running extract_RBS.py to extract potential "
                          "sequences of riboswitches/RNA thermometers for "
                          "{0}.\n".format(prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff), tss_file,
                    os.path.join(self.tran_path, prefix + "_transcript.gff"),
                    first_seq, args_ribo, feature)
                log.write("\t" + first_seq + " is temporary generated.\n")
                print("Pre-scanning of {0}".format(prefix))
                log.write("Using Infernal to pre-scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write(
                    "Please make sure the version of Infernal is at least 1.1.1.\n"
                )
                # First pass: pre-scan the raw candidates.
                first_scan_file = self._run_cmscan(args_ribo, first_seq, "txt",
                                                   prefix, tmp_files, suffixs,
                                                   rfam, log)
                sec_seq = os.path.join(tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(tmp_files["table"],
                                           "_".join([prefix, suffixs["csv"]]))
                log.write(
                    "Running recompute_RBS.py to update the potential "
                    "sequences of riboswitches/RNA thermometers for {0} "
                    "based on the pre-scanning results.\n".format(prefix))
                regenerate_seq(first_scan_file, first_seq, first_table,
                               sec_seq)
                log.write("\t" + sec_seq + " is temporary generated.\n")
                print("Scanning of {0}".format(prefix))
                log.write("Using Infernal to scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is at "
                          "least 1.1.1.\n")
                # Second pass: re-scan the refined candidate sequences.
                sec_scan_file = self._run_cmscan(args_ribo, sec_seq, "re_txt",
                                                 prefix, tmp_files, suffixs,
                                                 rfam, log)
                sec_table = os.path.join(tmp_files["table"],
                                         "_".join([prefix, suffixs["re_csv"]]))
                log.write("Running recompute_RBS.py and modify_rbs_table.py "
                          "to generate tables for {0} "
                          "based on the scanning results.\n".format(prefix))
                reextract_rbs(sec_scan_file, first_table, sec_table,
                              args_ribo.cutoff)
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id, gff_outfolder,
                       feature, log):
        '''merge the results from the results of two searching'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                log.write("Merging the results from Infernal to generate "
                          "tables for {0}.\n".format(prefix))
                # Collect the per-sequence tables/scan files of every
                # sequence id appearing in this gff file.
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(
                                os.path.join(
                                    tmp_files["table"],
                                    "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            self.helper.merge_file(
                                os.path.join(
                                    tmp_files["table"],
                                    "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(
                            os.path.join(
                                tmp_files["scan"],
                                "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(
                            os.path.join(
                                tmp_files["scan"],
                                "_".join([entry.seq_id, suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                log.write("The following files are generated.\n")
                for folder in (table_folder, scan_folder):
                    for file_ in os.listdir(folder):
                        log.write("\t" + os.path.join(folder, file_) + "\n")
                out_stat = os.path.join(
                    stat_folder, "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                log.write("Running ribo_gff.py to do statistics and generate "
                          "gff files for {0}.\n".format(prefix))
                log.write("The following files are generated:\n")
                out_gff = os.path.join(gff_outfolder,
                                       "_".join([prefix, feature + ".gff"]))
                stat_and_covert2gff(
                    os.path.join(table_folder,
                                 "_".join([prefix, suffixs["csv"]])),
                    feature_id, out_gff, args_ribo.fuzzy, out_stat, feature)
                log.write("\t" + out_gff + "\n")
                log.write("\t" + out_stat + "\n")
                fh.close()

    def _remove_tmp(self, args_ribo):
        '''Clean up the tmp folders of all parsed inputs.'''
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs, type_, fuzzy, log):
        '''Drop overlapping candidates from the per-genome tables.'''
        log.write("Running rbs_overlap.py to remove the overlapping "
                  "riboswitches/RNA thermometers.\n")
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                tmp_table = os.path.join(
                    os.path.join(
                        tmp_files["table"],
                        "_".join([gff.replace(".gff", ""), suffixs["csv"]])))
                rbs_overlap(tmp_table, os.path.join(gff_path, gff), type_,
                            fuzzy)
                log.write("\t" + tmp_table + " is updated.\n")

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder, type_, log):
        '''main part of detection'''
        log.write("Running get_Rfam_ribo.py to get the information of "
                  "riboswitches/RNA thermometers from Rfam.\n")
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        log.write("Using Infernal to compress the Rfam data of "
                  "riboswitches/RNA thermometers.\n")
        log.write(
            "Please make sure the version of Infernal is at least 1.1.1.\n")
        print("Compressing Rfam of " + feature)
        log.write(" ".join([args_ribo.cmpress_path, "-F", rfam]) + "\n")
        call([args_ribo.cmpress_path, "-F", rfam])
        log.write("Done!\n")
        prefixs = []
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo, tmp_files,
                                          suffixs, feature, rfam, log)
        self._remove_overlap(self.gff_path, tmp_files, suffixs, type_,
                             args_ribo.fuzzy, log)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature, log)
        log.write(
            "Running map_ribos.py to extract all the details from Rfam.\n")
        mapping_ribos(table_folder, feature_id, feature)
        log.write("The following files are updated:\n")
        for file_ in os.listdir(table_folder):
            log.write("\t" + os.path.join(table_folder, file_) + "\n")
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo, log_t, log_r):
        '''Entry point: run riboswitch and/or RNA thermometer detection.'''
        if args_ribo.fuzzy_rbs > 6:
            if log_t is not None:
                log_t.write("--fuzzy_rbs should be equal or less than 6!\n")
            if log_r is not None:
                log_r.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        if args_ribo.tsss is not None:
            self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_ribo.gffs, gff))
        if (args_ribo.program.lower() == "both") or (args_ribo.program.lower()
                                                     == "riboswitch"):
            print("Detecting riboswtiches now")
            self._core_prediction(
                args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                self.ribos_tmp_files, self.ribos_table_folder, "riboswitch",
                self.ribos_scan_folder, self.ribos_suffixs,
                self.ribos_stat_folder, self.ribos_gff_outfolder,
                args_ribo.ribos_out_folder, "riboswitch", log_r)
        if (args_ribo.program.lower() == "both") or (args_ribo.program.lower()
                                                     == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(args_ribo, args_ribo.thermo_id,
                                  self.thermo_rfam, self.thermo_tmp_files,
                                  self.thermo_table_folder, "RNA_thermometer",
                                  self.thermo_scan_folder, self.thermo_suffixs,
                                  self.thermo_stat_folder,
                                  self.thermo_gff_outfolder,
                                  args_ribo.thermo_out_folder, "thermometer",
                                  log_t)
        self._remove_tmp(args_ribo)
# Beispiel #19
# 0
class TargetFasta(object):
    '''detection of sRNA target interaction'''

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Updated sequences are first written to this temporary folder.
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Set up the tmp_reference, fasta_files and tmp_tar working
        folders and return the path of tmp_reference.'''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Bug fix: always (re)create fasta_files. Previously os.mkdir ran
        # only inside the "exists" branch, so with a fresh output folder
        # fasta_files was missing and get_target_fasta failed on open().
        fasta_folder = os.path.join(out_folder, "fasta_files")
        if os.path.exists(fasta_folder):
            shutil.rmtree(fasta_folder)
        os.mkdir(fasta_folder)
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files,
                         out_name, out_folder, log):
        '''Apply the mutation table to the reference sequences and write
        the updated fasta file(s) to <out_folder>/fasta_files.

        mut_table: tab-separated table, first column = strain id, lines
        starting with "#" are comments.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        log.write("Running seq_editor.py for updating sequence.\n")
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"], out_name)
        print("Updating the reference sequences")
        pre_strain = None
        out = None
        strain_num = 0
        # One output fasta per strain mentioned in the mutation table.
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                if not row[0].startswith("#"):
                    if (pre_strain != row[0]):
                        strain_num = strain_num + 1
                        tmp_tar_name = "_".join([out_name, row[0]]) + ".fa"
                        fasta = os.path.join(out_folder, "fasta_files",
                                             tmp_tar_name)
                        if out is not None:
                            out.close()
                        out = open(fasta, "w")
                        if tmp_tar_name in os.listdir(
                                self.folders["tmp_tar"]):
                            with open(os.path.join(
                                      self.folders["tmp_tar"],
                                      tmp_tar_name)) as f_h:
                                for line in f_h:
                                    out.write(line)
                        else:
                            print("Error: No updated information of {0}.fa".format(
                                  row[0]))
                    pre_strain = row[0]
        # Bug fix: guard against a table with no data rows (out never opened).
        if out is not None:
            out.close()
        out_seq = out_name + ".fa"
        if os.path.exists(out_seq):
            os.remove(out_seq)
        if strain_num == 1:
            # Single strain: rewrite the fasta header to the output name.
            o_s = open(out_seq, "w")
            for seq in os.listdir(os.path.join(out_folder, "fasta_files")):
                if seq.endswith(".fa"):
                    with open(os.path.join(
                            out_folder, "fasta_files", seq)) as t_h:
                        for line in t_h:
                            if len(line) != 0:
                                if line.startswith(">"):
                                    o_s.write(">" + out_name + "\n")
                                else:
                                    o_s.write(line)
                    os.remove(os.path.join(out_folder, "fasta_files", seq))
            o_s.close()
        else:
            # Multiple strains: concatenate all updated fasta files.
            # Portable replacement for the previous os.system("cat ...").
            for seq in os.listdir(os.path.join(out_folder, "fasta_files")):
                if seq.endswith(".fa"):
                    seq_path = os.path.join(out_folder, "fasta_files", seq)
                    with open(out_seq, "a") as o_s, open(seq_path) as s_h:
                        shutil.copyfileobj(s_h, o_s)
                    os.remove(seq_path)
        shutil.move(out_seq, os.path.join(
            out_folder, "fasta_files", out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        log.write("\t" + os.path.join(out_folder, "fasta_files", out_seq) +
                  " is generated.\n")
        print("Please use the new fasta files to remapping again.")
# Beispiel #20
# 0
class sRNADetection(object):

    def __init__(self, args_srna):
        """Set up helpers and all output/temporary paths for sRNA detection.

        args_srna: argument container; out_folder, fastas, trans are
        required, the other input folders may be None.
        """
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.gff_output = os.path.join(args_srna.out_folder, "gffs")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        # Optional inputs resolve to <folder>/tmp, or None when not given.
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs")
        # Prefixes of the intermediate candidate files in out_folder.
        self.prefixs = {"merge": os.path.join(
                            args_srna.out_folder, "tmp_merge"),
                        "utr": os.path.join(
                            args_srna.out_folder, "tmp_utrsrna"),
                        "normal": os.path.join(
                            args_srna.out_folder, "tmp_normal"),
                        "in_cds": os.path.join(
                            args_srna.out_folder, "tmp_incds"),
                        "merge_table": os.path.join(
                            args_srna.out_folder, "tmp_merge_table"),
                        "utr_table": os.path.join(
                            args_srna.out_folder, "tmp_utrsrna_table"),
                        "normal_table": os.path.join(
                            args_srna.out_folder, "tmp_normal_table"),
                        "in_cds_table": os.path.join(
                            args_srna.out_folder, "tmp_incds_table"),
                        "basic": os.path.join(
                            args_srna.out_folder, "tmp_basic"),
                        "energy": os.path.join(
                            args_srna.out_folder, "tmp_energy")}
        self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"),
                     "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")}
        self.best_table = os.path.join(self.table_output, "best")
        # Fix: removed the redundant re-assignments of table_output and
        # stat_path (they duplicated the identical values set above).
        self.all_best = {"all_gff": os.path.join(
                             self.gff_output, "all_candidates"),
                         "best_gff": os.path.join(self.gff_output, "best"),
                         "all_table": os.path.join(
                             self.table_output, "all_candidates"),
                         "best_table": os.path.join(self.table_output, "best")}

    def _check_folder_exist(self, folder):
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        '''Build a BLAST database with makeblastdb; stderr goes to err.'''
        cmd = [os.path.join(blast_path, "makeblastdb"),
               "-in", database, "-dbtype", type_, "-out", db_file]
        call(cmd, stderr=err)

    def _formatdb(self, database, type_, out_folder,
                  blast_path, database_type):
        '''Prepare and format a BLAST database.

        When the given path lacks a fasta extension, the matching fasta
        file is searched in the same folder. sRNA databases are first
        reformatted in place with change_format. makeblastdb errors are
        written to <out_folder>/log.txt.
        '''
        # Fix: use a context manager so log.txt is closed even when one of
        # the steps below raises.
        with open(os.path.join(out_folder, "log.txt"), "w") as err:
            if not database.endswith((".fa", ".fna", ".fasta")):
                # The user supplied a name without extension; locate the
                # matching fasta file in the same folder.
                folder, filename = os.path.split(database)
                for fasta in os.listdir(folder):
                    if fasta.endswith((".fa", ".fna", ".fasta")):
                        if ".".join(fasta.split(".")[:-1]) == filename:
                            database = os.path.join(folder, fasta)
            if database_type == "sRNA":
                # Normalize the header format of the sRNA database in place.
                change_format(database, "tmp_srna_database")
                os.remove(database)
                shutil.move("tmp_srna_database", database)
            db_file = ".".join(database.split(".")[:-1])
            self._run_format(blast_path, database, type_, db_file, err)

    def _merge_frag_tex_file(self, files, args_srna):
        '''Merge the fragmented- and TEX+/- -library results into the final
        "merge" gff/csv files and delete the intermediates.

        files: dict with keys frag_gff/frag_csv/tex_gff/tex_csv/
        merge_gff/merge_csv holding the corresponding file paths.
        '''
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            # Both library types: append frag results to the tex files,
            # then sort/rename them into the merged outputs.
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif (args_srna.frag_wigs is not None):
            # Only fragmented libraries were provided.
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif (args_srna.tex_wigs is not None):
            # Only TEX+/- libraries were provided.
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        '''Detect intergenic sRNA candidates of one genome for the
        fragmented and/or TEX+/- libraries, then merge the results.

        Returns the TSS file to use in later steps (the re-classified one
        from TSS_class when that folder exists).
        '''
        # Remove the leftover cutoff file of a previous run.
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if ("tss" in args_srna.import_info):
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(
                    self.pro_path, "_processing.gff", prefix, None, None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            # Detection based on the fragmented libraries.
            files["frag_gff"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))

            args_srna = self.args_container.container_intersrna(
                             "frag", files, args_srna, prefix,
                             os.path.join(args_srna.gffs, gff), tran, tss,
                             pro, fuzzy_tss)
            intergenic_srna(args_srna)
        if args_srna.tex_wigs is not None:
            # Detection based on the TEX+/- libraries.
            files["tex_gff"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                           "tex", files, args_srna, prefix,
                           os.path.join(args_srna.gffs, gff), tran, tss,
                           pro, fuzzy_tss)
            intergenic_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        # Prefer the re-classified TSS file when TSS classification ran.
        if "TSS_class" in os.listdir(args_srna.out_folder):
            tss = os.path.join(args_srna.out_folder,
                               "TSS_class", prefix + "_TSS.gff")
        return tss

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna):
        '''Detect UTR-derived sRNA candidates of one genome for the TEX+/-
        and/or fragmented libraries, merge the results and filter them by
        the minimal UTR length.'''
        # Remove the leftover median file of a previous run.
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if args_srna.tex_wigs is not None:
            # Detection based on the TEX+/- libraries.
            files["tex_gff"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                    os.path.join(args_srna.gffs, gff), tran, tss, files,
                    pro, os.path.join(self.fasta_path, prefix + ".fa"),
                    "tex", prefix, args_srna)
            utr_derived_srna(args_srna)
        if args_srna.frag_wigs is not None:
            # Detection based on the fragmented libraries.
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                    os.path.join(args_srna.gffs, gff), tran, tss, files,
                    pro, os.path.join(self.fasta_path, prefix + ".fa"),
                    "frag", prefix, args_srna)
            utr_derived_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        # Drop candidates whose UTR is shorter than min_utr.
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        """Validate the inputs and pre-parse them per genome.

        Exits the program when mandatory inputs (annotation gffs,
        transcripts, and at least one wig set) are missing; TSS files
        are additionally required for UTR-derived sRNA detection.
        """
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                # Processing sites are optional but improve UTR results.
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        # Split every multi-record gff/fasta into one file per genome so
        # downstream steps can work genome by genome.
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        # Fasta files are only needed by the steps that read sequences
        # (UTR detection, 2D folding, BLAST searches).
        if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
           "blast_nr" in args_srna.import_info) or (
           "blast_srna" in args_srna.import_info):
            if args_srna.fastas is None:
                print("Error: lack required fasta files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            self.multiparser.parser_fasta(args_srna.fastas)
            self.multiparser.combine_fasta(args_srna.gffs,
                                           self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            # No terminator input: later steps test term_path for None.
            self.term_path = None

    def _run_program(self, args_srna):
        """Run intergenic (and optionally UTR-derived) sRNA detection
        for every genome found in the annotation folder.

        Returns the list of genome prefixes that were processed.
        """
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                        self.tran_path, "_transcript.gff", prefix, None, None)
                # Output paths for the merged, UTR-derived and intergenic
                # candidate files of this genome.
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"], prefix])}
                csvs = {"merge": "_".join([
                            self.prefixs["merge_table"], prefix]),
                        "utr": "_".join([self.prefixs["utr_table"], prefix]),
                        "normal": "_".join([
                            self.prefixs["normal_table"], prefix])}
                tss = self._run_normal(
                        prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                        args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of {0}".format(
                          prefix))
                    # Fall back to the parsed TSS folder when _run_normal
                    # did not already resolve a TSS file.
                    if tss is None:
                        tss = self.helper.get_correct_file(
                                self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                                self.pro_path, "_processing.gff",
                                prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix,
                                          tss, pro, args_srna)
                # Merge intergenic + UTR candidates and sort the result
                # into the "basic" gff used by all later filters.
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss)
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"], prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss):
        """Combine intergenic and UTR-derived sRNA candidates into one
        merged gff and one merged table for *prefix*."""
        print("merging data of intergenic and UTR_derived sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        fwd_wig = os.path.join(args_srna.wig_path,
                               "_".join([prefix, "forward.wig"]))
        rev_wig = os.path.join(args_srna.wig_path,
                               "_".join([prefix, "reverse.wig"]))
        merge_srna_table(gffs["merge"], csvs, fwd_wig, rev_wig,
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        """Pipe the sRNA fasta through RNAfold (-p) and capture the
        secondary-structure output in *sec_file*."""
        rnafold = os.path.join(vienna_path, "RNAfold")
        command = " ".join(["cat", seq_file, "|", rnafold, "-p",
                            ">", sec_file])
        os.system(command)

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        """Extract the sRNA sequences of *prefix*, fold them with
        RNAfold and compute the folding free energies.

        Returns a dict of absolute paths ("sec"/"dot"/"main"/"tmp") for
        the plotting steps.  NOTE: the working directory is switched to
        the tmp folder; the caller restores it later.
        """
        detect = False
        # Make sure a fasta file matching this genome prefix exists.
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
               fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        # RNAfold writes its .ps plots into the current working
        # directory, so run it from the tmp folder and keep absolute
        # paths to everything else.
        os.chdir(tmp_path)
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(os.path.join(main_path,
                       "_".join([self.prefixs["basic"], prefix])),
                       sec_file, os.path.join(main_path,
                       "_".join([self.prefixs["energy"], prefix])))
        # "|" in sequence ids is not filesystem friendly; rename plots.
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path, "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file):
        """Run Vienna's relplot.pl to annotate a structure plot with
        base-pair probabilities."""
        relplot = os.path.join(vienna_util, "relplot.pl")
        structure_ps = os.path.join(tmp_paths["tmp"], file_)
        dot_ps = os.path.join(tmp_paths["tmp"], dot_file)
        annotated_ps = os.path.join(tmp_paths["tmp"], rel_file)
        os.system(" ".join([relplot, structure_ps, dot_ps,
                            ">", annotated_ps]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        """Convert one PostScript plot in the tmp folder to PDF."""
        ps_file = os.path.join(tmp_paths["tmp"], file_)
        call([ps2pdf14_path, ps_file, pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths,
                           ps2pdf14_path, prefix):
        """Re-plot the RNAfold structures with pair probabilities, then
        convert the resulting PostScript plots to PDF and sort them into
        per-genome output folders."""
        # Pass 1: relplot every *ss.ps with its matching *dp.ps.
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        # Pass 2: convert the annotated and dot plots to PDF.
        for file_ in os.listdir(tmp_paths["tmp"]):
            if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths,
                                  file_, pdf_file)
        # Move the PDFs into the per-genome sec/dot output folders.
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
                tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix),
                ["rss.pdf"])
        self.helper.move_all_content(
                tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix),
                ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        """Run Vienna's mountain.pl on a dot plot, streaming its stdout
        into the open file object *out*."""
        script = os.path.join(vienna_util, "mountain.pl")
        dot_path = os.path.join(tmp_paths["tmp"], dot_file)
        call([script, dot_path], stdout=out)

    def _plot_mountain(self, mountain, moun_path,
                       tmp_paths, prefix, vienna_util):
        """Generate a mountain plot for every dot plot of *prefix* when
        the *mountain* option is enabled."""
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    # mountain.pl output goes to a temporary text file
                    # which is then rendered to a PDF and moved into the
                    # per-genome mountain-plot folder.
                    moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths, dot_file, out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix, moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        """Compute secondary structures and folding energies for the
        candidates of every genome and produce the structure plots."""
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        # Start from empty output folders.
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            # _get_seq_sec chdirs into the tmp folder; we chdir back to
            # the main path below once the plots are done.
            tmp_paths = self._get_seq_sec(
                    self.fasta_path, args_srna.out_folder, prefix, sec_path,
                    dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path, tmp_paths,
                                prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            os.chdir(tmp_paths["main"])
            # The energy-annotated gff replaces the basic candidate gff.
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        """Invoke a BLAST+ *program* against *database* for one strand
        ("plus", "minus" or "both") with e-value cutoff *e*."""
        cmd = [os.path.join(blast_path, program),
               "-db", database,
               "-evalue", str(e),
               "-strand", strand,
               "-query", seq_file,
               "-out", blast_file]
        call(cmd)

    def _get_strand_fasta(self, seq_file, out_folder):
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        out_p = open(tmp_plus, "w")
        out_m = open(tmp_minus, "w")
        strand = ""
        with open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        out_p.close()
        out_m.close()
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        """BLAST the candidate sRNAs of every genome against *database*.

        database_format -- if True, (re)format the database first.
        data_type       -- "prot" or "nucl" for the formatter.
        program         -- BLAST+ executable name (blastx / blastn).
        database_type   -- "nr" or "sRNA"; nr searches run per strand.
        e               -- e-value cutoff.
        """
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                        args_srna.out_folder, "blast_result_and_misc",
                        "_".join([database_type, "blast", prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                        args_srna.out_folder,
                        "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0}".format(prefix))
                seq_file = os.path.join(
                        args_srna.out_folder, "_".join(["sRNA_seq", prefix]))
                # BUG FIX: seq_file is a full path while os.listdir()
                # yields bare names, so the old "seq_file not in listdir"
                # test was always true and the fasta was re-extracted for
                # every database; compare the basename instead.
                if os.path.basename(seq_file) not in os.listdir(
                        args_srna.out_folder):
                    self.helper.get_seq(
                            srna_file,
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            seq_file)
                if database_type == "nr":
                    # nr needs strand-specific searches; split the fasta
                    # by the +/- suffix of each header first, then merge
                    # the two result files.
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                            seq_file, args_srna.out_folder)
                    tmp_blast = os.path.join("tmp_blast.txt")
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    tmp_plus, tmp_blast, "plus")
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    tmp_minus, blast_file, "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    seq_file, blast_file, "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        """Classify the detected sRNAs of every genome and write one gff
        and one csv per class into the for_class output folders."""
        # BUG FIX: the original condition used "or", which is a
        # tautology (no length equals both 0 and 1 at once), so
        # classification always ran even with nothing to classify.
        # "and" expresses the intended "more than one kind of imported
        # information".
        if (len(args_srna.import_info) != 1) and (
                len(args_srna.import_info) != 0):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(class_table,
                                                           prefix))
                self.helper.check_make_folder(os.path.join(class_gff, prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                        self.stat_path, "_".join([
                            "stat_sRNA_class", prefix + ".csv"]))
                classify_srna(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])), class_gff,
                              out_stat, args_srna)
                # One table per class gff produced by classify_srna.
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                            class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table)

    def _get_best_result(self, prefixs, args_srna):
        """Select the best sRNA candidates of every genome and export
        them as gff plus table."""
        for prefix in prefixs:
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            merge_table = "_".join([self.prefixs["merge_table"], prefix])
            nr_csv = "_".join([self.tmps["nr"], prefix + ".csv"])
            srna_csv = "_".join([self.tmps["srna"], prefix + ".csv"])
            gen_best_srna(all_gff, best_gff, args_srna)
            gen_srna_table(best_gff, merge_table, nr_csv, srna_csv,
                           args_srna, best_table)

    def _remove_file(self, args_srna):
        """Clean up every temporary file and folder the pipeline made."""
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        for wigs in (args_srna.frag_wigs, args_srna.tex_wigs):
            if wigs is not None:
                self.helper.remove_tmp(wigs)
        # merge_wigs only exists when both wig sets were supplied.
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        for optional in (args_srna.tss_folder, args_srna.pro_folder,
                         args_srna.sorf_file):
            if optional is not None:
                self.helper.remove_tmp(optional)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        """Apply the optional filters selected via import_info:
        secondary structure/energy, nr BLAST, sRNA-database BLAST and
        sORF overlap comparison."""
        if "sec_str" in args_srna.import_info:
            self._compute_2d_and_energy(args_srna, prefixs)
        if "blast_nr" in args_srna.import_info:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        if "blast_srna" in args_srna.import_info:
            self._blast(args_srna.srna_database, args_srna.srna_format, "nucl",
                        args_srna, prefixs, "blastn", "sRNA", args_srna.e_srna)
        if "sorf" in args_srna.import_info:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_sorf_srna", prefix]))
                    # Compare candidates with sORFs; keep only the sRNA
                    # side and swap it in as the new basic gff.
                    srna_sorf_comparison(
                            "_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.sorf_path,
                                         "_".join([prefix, "sORF.gff"])),
                            tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))

    def _import_info_format(self, import_info):
        new_info = []
        for info in import_info:
            info = info.lower()
            new_info.append(info)
        return new_info

    def _gen_table(self, prefixs, args_srna):
        """Generate the complete sRNA candidate table of every genome."""
        for prefix in prefixs:
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(all_gff,
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table)

    def _print_rank_all(self, prefixs):
        """Propagate the best-candidate ranking into the full tables."""
        for prefix in prefixs:
            csv_name = "_".join([prefix, "sRNA.csv"])
            all_table = os.path.join(self.all_best["all_table"], csv_name)
            best_table = os.path.join(self.all_best["best_table"], csv_name)
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        """Drop UTR-derived sRNAs shorter than *min_utr* from the
        complete result set of every genome."""
        for prefix in prefixs:
            gff = os.path.join(self.all_best["all_gff"],
                               "_".join([prefix, "sRNA.gff"]))
            table = os.path.join(self.all_best["all_table"],
                                 "_".join([prefix, "sRNA.csv"]))
            filter_utr(gff, table, min_utr)

    def _antisense(self, gffs, prefixs):
        """Flag antisense sRNAs in both the complete ("all") and the
        best result sets of every genome."""
        for prefix in prefixs:
            genome_gff = os.path.join(gffs, prefix + ".gff")
            for level in ("all", "best"):
                srna_gff = os.path.join(self.all_best[level + "_gff"],
                                        "_".join([prefix, "sRNA.gff"]))
                srna_table = os.path.join(self.all_best[level + "_table"],
                                          "_".join([prefix, "sRNA.csv"]))
                srna_antisense(srna_gff, srna_table, genome_gff)

    def _blast_stat(self, stat_path, srna_tables):
        """Generate a BLAST-hit classification statistic for every best
        sRNA table.

        BUG FIX: blast_class() was previously called outside the loop,
        so only the last table listed by os.listdir() was processed (and
        the call raised UnboundLocalError when the folder was empty).
        The call now runs once per table, inside the loop.
        """
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                    stat_path, "stat_" +
                    srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        """Associate the sRNAs of *prefix* with terminators and
        promoters when the corresponding options were imported."""
        info = args_srna.import_info
        all_gff = os.path.join(self.all_best["all_gff"],
                               "_".join([prefix, "sRNA.gff"]))
        if ("term" in info) and (self.term_path is not None):
            term_file = os.path.join(self.term_path,
                                     "_".join([prefix, "term.gff"]))
            compare_srna_term(all_gff, out_table, term_file,
                              args_srna.fuzzy_b, args_srna.fuzzy_a)
        if ("promoter" in info) and (
                args_srna.promoter_table is not None) and ("tss" in info):
            compare_srna_promoter(all_gff, out_table, args_srna)

    def run_srna_detection(self, args_srna):
        """Entry point: run the whole sRNA detection pipeline."""
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path,
                                     None, "transcript")
        # import_info entries are matched case-insensitively downstream.
        args_srna.import_info = self._import_info_format(args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            # Publish the filtered candidates as the "all" gff, then
            # link them with terminators/promoters if requested.
            shutil.copyfile("_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.all_best["all_gff"],
                            "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter("_".join([self.prefixs["merge_table"],
                                        prefix]), prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if "blast_srna" in args_srna.import_info:
            self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
Beispiel #21
0
class TSSpredator(object):
    '''Wrapper that drives the external TSSpredator Java tool.

    It writes a TSSpredator configuration file for every genome, runs the
    predictor, converts the resulting MasterTables to GFF3 and performs
    the downstream steps: merging manually-curated TSSs, validating
    against the genome annotation, comparing with transcripts and
    collecting statistics.
    '''

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Folder that will hold the per-genome TSSpredator MasterTables.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Fixed names of the temporary files/folders used during the run.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        # NOTE: manual_path only exists when manual TSS files are given;
        # every caller guards on args_tss.manual before using it.
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''Convert one split library entry into a library dictionary.'''
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        '''Write the per-condition wiggle assignments of one library type.

        Each replicate of each condition gets one
        "<prefix>_<condition><replicate> = <wig path>" line; replicates
        without a wiggle file get an empty assignment so that TSSpredator
        still sees a complete condition/replicate matrix.
        '''
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    # Use num_id (the current condition) rather than the
                    # loop leftover "cond": "cond" is undefined when this
                    # condition has no library at all, which previously
                    # raised NameError.
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, num_id, rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        '''Run the TSSpredator jar for one genome; capture stdout/stderr
        into log.txt/err.txt inside the MasterTable output folder.'''
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        '''Parse the library strings and write the annotation, five-prime
        library and genome sections of the config file.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id); num_id
        is kept in the return tuple for backward compatibility with the
        caller's unpacking.
        '''
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        num_id = 0  # avoid NameError at return when libs is empty
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # Replace the plain wiggle name with the strain-specific file
            # produced by the multiparser ("<name>_STRAIN_<strain>.wig").
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if len(filename) != 2:
                    # Skip files without the expected _STRAIN_ marker
                    # instead of crashing with IndexError.
                    continue
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # f* = TEX-treated (+/-), n* = untreated (+/-) libraries.
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # For TSS prediction the TEX-treated libraries are the five-prime
        # signal; for processing sites it is the untreated ones.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assign tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''Write the minNumRepMatches settings (replicate match cutoffs).

        "all_<n>" applies one cutoff to every library; otherwise the most
        common cutoff becomes the global default and deviating libraries
        get their own minNumRepMatches_<lib> entry.
        '''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''Generate the TSSpredator config file for one genome and create
        its MasterTable output folder.'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" libraries are the complement of the five-prime
        # ones written by _import_lib.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        '''Convert every genome's MasterTable.tsv into a GFF3 file.'''
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            # Opening with "w" creates/truncates the output even when the
            # conversion is skipped because the MasterTable is missing.
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                # Fixed missing space ("…gffis generated").
                log.write("\t" + out_file + " is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''Merge manually-detected TSSs with the TSSpredator predictions
        and classify them, one genome at a time.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # NOTE(review): relies on the loop variable "gff" after the
            # loop; assumes a matching annotation file always exists.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''Validate TSSs/PSs against the genome annotation.'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # The annotation is replaced by the updated version that
            # validate_gff wrote to the temporary file.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''Compare TSSs/PSs with transcripts.'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        '''Generate classification/library statistics and Venn plots
        for every genome.'''
        print("Running statistics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            # The plotting helpers write into the working directory, so
            # collect their output afterwards.
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        '''Generate a config file for every genome that has matching
        fasta, annotation and wiggle files; return their prefixes.'''
        prefixs = []
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            # Reset per fasta: the previous sticky flag wrongly generated
            # configs for genomes without any wiggle data once one
            # genome had matched.
            detect = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if len(filename) != 2:
                            # Ignore files without the _STRAIN_ marker.
                            continue
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                                input_folder,
                                "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config, log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''Concatenate all forward/reverse wiggle files of the given
        genome into tmp/merge_forward.wig and tmp/merge_reverse.wig.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''Re-classify orphan TSSs using the merged coverage wiggles
        (useful when the genome annotation has no locus tags).'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''Delete the temporary files and folders created by the run.'''
        print("Remove temporary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''Resolve TSSs and processing sites that share a position.'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''Filter out low-expressed TSSs/PSs and record the coverage
        cutoff per genome.'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # Close the per-genome file here: the previous single
                # close after the loop leaked one handle per genome and
                # raised NameError when no GFF matched at all.
                out.close()
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None

    def run_tsspredator(self, args_tss, log):
        '''Main entry point: run the whole TSS/PS prediction workflow.'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        # Split the inputs per genome before generating the configs.
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                           None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
Beispiel #22
0
class Crispr(object):
    '''Detection of CRISPR arrays.

    Runs CRT (CRISPR Recognition Tool) on each genome fasta, converts the
    CRT text output to GFF3, removes candidates that overlap annotated
    genes/CDS/tRNA/rRNA, and writes per-genome statistics files.
    '''
    def __init__(self, args_cris):
        # Helper objects shared across the pipeline.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" sub-folders are populated by Multiparser when the input
        # gff/fasta files are split per strain (see run_crispr).
        self.gff_path = os.path.join(args_cris.gffs, "tmp")
        self.fasta_path = os.path.join(args_cris.fastas, "tmp")
        self.stat_folder = os.path.join(args_cris.out_folder, "statistics")
        self.gff_out = os.path.join(args_cris.out_folder, "gffs")
        # "all_candidates": every CRT hit; "best": hits that do not overlap
        # the genome annotation.
        self.all_out = os.path.join(args_cris.out_folder, "gffs",
                                    "all_candidates")
        self.best_out = os.path.join(args_cris.out_folder, "gffs", "best")
        self.helper.check_make_folder(self.all_out)
        self.helper.check_make_folder(self.best_out)
        # Raw CRT text output goes here, one .txt per genome.
        self.data_folder = os.path.join(args_cris.out_folder, "CRT_output")
        self.helper.check_make_folder(self.data_folder)
        self.helper.check_make_folder(self.stat_folder)

    def _run_crt(self, args_cris):
        '''Run CRT on every fasta file, writing <prefix>.txt to data_folder.'''
        print("Running CRT")
        for seq in os.listdir(self.fasta_path):
            # Strip the final extension to get the genome prefix.
            prefix = ".".join(seq.split(".")[:-1])
            # CRT parameters: min number of repeats, min/max repeat length,
            # min/max spacer length, and search window size.
            call([
                "java", "-cp", args_cris.crt_path, "crt", "-minNR",
                str(args_cris.min_num_r), "-minRL",
                str(args_cris.min_len_r), "-maxRL",
                str(args_cris.max_len_r), "-minSL",
                str(args_cris.min_len_s), "-maxSL",
                str(args_cris.max_len_s), "-searchWL",
                str(args_cris.win_size),
                os.path.join(self.fasta_path, seq),
                os.path.join(self.data_folder, prefix + ".txt")
            ])

    def _read_gff(self, txt):
        '''Load gene/CDS/tRNA/rRNA entries from the gff matching a CRT txt.'''
        gffs = []
        gh = open(os.path.join(self.gff_path, txt.replace(".txt", ".gff")),
                  "r")
        for entry in Gff3Parser().entries(gh):
            # Only coding/structural features are used for overlap filtering.
            if (entry.feature == "gene") or (entry.feature == "CDS") or (
                    entry.feature == "tRNA") or (entry.feature == "rRNA"):
                gffs.append(entry)
        gh.close()
        return gffs

    def _compare_gff(self, strain, start, end, gffs, bh, indexs, ignore_hypo):
        '''Compare CRISPR and genome annotation to 
        remove the false positives.

        Writes the candidate to the "best" gff handle (bh) only when it does
        not overlap an annotated feature. Returns (overlap, id_) where id_ is
        None when an overlap was found.
        '''
        overlap = False
        id_ = None
        for gff in gffs:
            if (gff.seq_id == strain):
                # Four cases: annotation contains candidate, candidate
                # contains annotation, or either one-sided overlap.
                if ((gff.start <= start) and (gff.end >= end)) or (
                    (gff.start >= start) and (gff.end <= end)) or (
                        (gff.start <= start) and (gff.end > start) and
                        (gff.end <= end)) or ((gff.start >= start) and
                                              (gff.start < end) and
                                              (gff.end >= end)):
                    # Overlaps with "hypothetical protein" entries are only
                    # counted when ignore_hypo is False.
                    if "product" in gff.attributes.keys():
                        if ((not ignore_hypo) and
                            ("hypothetical protein"
                             in gff.attributes["product"])) or (
                                 "hypothetical protein"
                                 not in gff.attributes["product"]):
                            overlap = True
        if not overlap:
            id_ = "CRISPR_" + str(indexs["best"])
            attribute = ";".join(["ID=" + strain + "_" + id_, "method=CRT"])
            bh.write("\t".join([
                strain, "ANNOgesic", "CRISPR",
                str(start),
                str(end), ".", ".", ".", attribute
            ]) + "\n")
            indexs["best"] += 1
        return overlap, id_

    def _print_repeat(self, row, strain, file_h, indexs, id_, best):
        '''Print the repeat units.

        Emits a repeat_unit gff line for a CRT table row and returns the
        updated repeat counter (re_best when best is True, else re_all).
        '''
        if best:
            num = indexs["re_best"]
        else:
            num = indexs["re_all"]
        # Skip CRT header/separator rows; real rows start with a position.
        if (not row[0].startswith("-")) and (
                not row[0].startswith("Repeats:")) and (
                    not row[0].startswith("CRISPR")) and (
                        not row[0].startswith("POSITION")):
            start = row[0].strip()
            # Repeat unit end = start + repeat sequence length - 1.
            end = str(int(start) + len(row[2].strip()) - 1)
            attribute = ";".join([
                "ID=" + strain + "_Repeat_" + str(num), "method=CRT",
                "Parent=" + id_
            ])
            file_h.write("\t".join([
                strain, "ANNOgesic", "repeat_unit", start, end, ".", ".", ".",
                attribute
            ]) + "\n")
            num += 1
        # "Repeats:" summary line marks the end of one CRISPR's repeat table.
        if row[0].startswith("Repeats:"):
            indexs["run"] = False
        return num

    def _convert_gff(self, ignore_hypo):
        '''Convert the final CRT output to gff format.

        For each CRT txt, writes an "all candidates" gff and a "best"
        (non-overlapping) gff plus their repeat_unit children.
        '''
        for txt in os.listdir(self.data_folder):
            gffs = self._read_gff(txt)
            fh = open(os.path.join(self.data_folder, txt), "r")
            oh = open(
                os.path.join(self.all_out, txt.replace(".txt", "_CRISPR.gff")),
                "w")
            bh = open(
                os.path.join(self.best_out, txt.replace(".txt",
                                                        "_CRISPR.gff")), "w")
            # Counters: CRISPR ids and repeat-unit ids for the "all" and
            # "best" outputs; "run" is True while inside a repeat table.
            indexs = {
                "all": 0,
                "re_all": 0,
                "best": 0,
                "re_best": 0,
                "run": False
            }
            # NOTE(review): strain/id_/overlap/over_id are assigned only when
            # the corresponding CRT header rows appear; this relies on CRT's
            # output ordering (ORGANISM before CRISPR before repeat rows).
            for row in csv.reader(fh, delimiter='\t'):
                if len(row) != 0:
                    if row[0].startswith("ORGANISM:"):
                        strain = row[0].split(" ")[-1]
                    elif row[0].startswith("CRISPR"):
                        # Header looks like "CRISPR <n> Range: <start> - <end>".
                        end = row[0].split("-")[-1].strip()
                        start = row[0].split("-")[0].split(":")[-1].strip()
                        id_ = "CRISPR_" + str(indexs["all"])
                        attribute = ";".join(
                            ["ID=" + strain + "_" + id_, "method=CRT"])
                        oh.write("\t".join([
                            strain, "ANNOgesic", "CRISPR", start, end, ".",
                            ".", ".", attribute
                        ]) + "\n")
                        overlap, over_id = self._compare_gff(
                            strain, int(start), int(end), gffs, bh, indexs,
                            ignore_hypo)
                        indexs["all"] += 1
                        indexs["run"] = True
                    if indexs["run"]:
                        indexs["re_all"] = self._print_repeat(
                            row, strain, oh, indexs, id_, False)
                        # Only candidates kept in "best" get repeat units there.
                        if not overlap:
                            indexs["re_best"] = self._print_repeat(
                                row, strain, bh, indexs, over_id, True)
            fh.close()
            oh.close()
            bh.close()

    def _stat_and_correct(self, stats, folder):
        '''do statistics and print the final gff file.

        Re-numbers CRISPR/repeat_unit IDs sequentially per file, rewrites the
        gff in place (via a temporary file), and fills `stats` with per-strain
        CRISPR counts and a histogram of repeat-units-per-CRISPR.
        '''
        for gff in os.listdir(folder):
            prefix = gff.replace("_CRISPR.gff", "")
            stats[prefix] = {"all": {"cri": 0, "re": {}}}
            gh = open(os.path.join(folder, gff), "r")
            oh = open("tmp_cri.gff", "w")
            oh.write("##gff-version 3\n")
            cr_num = 0
            re_num = 0
            first = True
            for entry in Gff3Parser().entries(gh):
                if entry.seq_id not in stats[prefix].keys():
                    stats[prefix][entry.seq_id] = {"cri": 0, "re": {}}
                if entry.feature == "CRISPR":
                    id_ = "CRISPR_" + str(cr_num)
                    attribute = ";".join(
                        ["ID=" + entry.seq_id + "_" + id_, "method=CRT"])
                    cr_num += 1
                    # The repeat count of the *previous* CRISPR is recorded
                    # when the next CRISPR entry starts (deferred flush).
                    if first:
                        first = False
                    else:
                        if repeat not in stats[prefix][
                                entry.seq_id]["re"].keys():
                            stats[prefix][entry.seq_id]["re"][repeat] = 1
                        else:
                            stats[prefix][entry.seq_id]["re"][repeat] += 1
                        if repeat not in stats[prefix]["all"]["re"].keys():
                            stats[prefix]["all"]["re"][repeat] = 1
                        else:
                            stats[prefix]["all"]["re"][repeat] += 1
                    repeat = 0
                    stats[prefix][entry.seq_id]["cri"] += 1
                    stats[prefix]["all"]["cri"] += 1
                elif entry.feature == "repeat_unit":
                    attribute = ";".join([
                        "ID=" + entry.seq_id + "_Repeat_" + str(re_num),
                        "method=CRT", "Parent=" + id_
                    ])
                    re_num += 1
                    repeat += 1
                oh.write(
                    "\t".join([entry.info_without_attributes, attribute]) +
                    "\n")
            # Flush the repeat count of the last CRISPR in the file.
            if not first:
                if repeat not in stats[prefix][entry.seq_id]["re"].keys():
                    stats[prefix][entry.seq_id]["re"][repeat] = 1
                else:
                    stats[prefix][entry.seq_id]["re"][repeat] += 1
                if repeat not in stats[prefix]["all"]["re"].keys():
                    stats[prefix]["all"]["re"][repeat] = 1
                else:
                    stats[prefix]["all"]["re"][repeat] += 1
            gh.close()
            oh.close()
            # Replace the original gff with the re-numbered version.
            os.remove(os.path.join(folder, gff))
            shutil.move("tmp_cri.gff", os.path.join(folder, gff))

    def _print_file(self, sh, cri_res_all, cri_res_best):
        # Write one statistics section: total CRISPRs, repeat-unit histogram,
        # then the same for candidates not overlapping the annotation.
        sh.write("\tthe number of CRISPR - {0}\n".format(cri_res_all["cri"]))
        for index, num in cri_res_all["re"].items():
            sh.write("\t\tCRISPR with {0} repeat units - {1}\n".format(
                index, num))
        sh.write("\tthe number of CRISPR which not overlap "
                 "with genome annotation - {0}\n".format(cri_res_best["cri"]))
        for index, num in cri_res_best["re"].items():
            sh.write("\t\tCRISPR with {0} repeat units - {1}\n".format(
                index, num))

    def _print_stat(self, stats):
        '''print the statistics file.

        One csv per prefix. With a single strain only that strain is printed;
        with multiple strains an aggregated "All strains" section comes first.
        '''
        for prefix, strains in stats["all"].items():
            sh = open(os.path.join(self.stat_folder, prefix + ".csv"), "w")
            # strains always contains the "all" aggregate key, so len == 1
            # means no real strain had any CRISPR.
            if len(strains) == 1:
                sh.write("No CRISPR can be detected")
            elif len(strains) <= 2:
                for strain, cri_res in strains.items():
                    if strain != "all":
                        sh.write(strain + ":\n")
                        self._print_file(sh, cri_res,
                                         stats["best"][prefix][strain])
            else:
                sh.write("All strains:\n")
                self._print_file(sh, stats["all"][prefix]["all"],
                                 stats["best"][prefix]["all"])
                for strain, cri_res in strains.items():
                    if strain != "all":
                        sh.write(strain + ":\n")
                        # A strain may have candidates in "all" but none in
                        # "best"; give it an empty entry so printing works.
                        if strain not in stats["best"][prefix].keys():
                            stats["best"][prefix][strain] = {
                                "cri": 0,
                                "re": {}
                            }
                        self._print_file(sh, cri_res,
                                         stats["best"][prefix][strain])
            sh.close()

    def run_crispr(self, args_cris):
        '''detection of CRISPR: full pipeline entry point.'''
        # Split multi-record fasta/gff inputs into per-strain tmp files.
        self.multiparser.parser_fasta(args_cris.fastas)
        self.multiparser.parser_gff(args_cris.gffs, None)
        self._run_crt(args_cris)
        self._convert_gff(args_cris.ignore_hypo)
        print("All candidates:")
        self.multiparser.combine_gff(args_cris.gffs, self.all_out, None,
                                     "CRISPR")
        print("Best candidates:")
        self.multiparser.combine_gff(args_cris.gffs, self.best_out, None,
                                     "CRISPR")
        stats = {"all": {}, "best": {}}
        self._stat_and_correct(stats["all"], self.all_out)
        self._stat_and_correct(stats["best"], self.best_out)
        self._print_stat(stats)
        # Clean up the per-strain tmp folders.
        self.helper.remove_tmp_dir(args_cris.gffs)
        self.helper.remove_tmp_dir(args_cris.fastas)
Beispiel #23
0
class CircRNADetection(object):
    '''Detection of circular RNAs.

    Optionally aligns reads with segemehl, merges/sorts the alignments with
    samtools, detects splice sites with testrealign.x, and converts the
    resulting candidates to tables and gff files.
    '''

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output layout under args_circ.output_folder.
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # File-name conventions produced by segemehl/testrealign.
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all", "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all", "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        # NOTE(review): both branches assign the same fasta_path; only the
        # missing-fasta check differs depending on whether alignment runs.
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        # Wait for all child processes, close their pipes, and make sure
        # nothing is left running (kill is best-effort after wait).
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        '''Decompress .bz2/.gz read files in place.

        Returns the list of temporary decompressed files so they can be
        removed later. Files without a fasta-like extension get ".fa".
        '''
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                # Same as the .bz2 branch but decompressed with zcat.
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        # Build the segemehl index (.idx) for one genome fasta.
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        # Launch one segemehl alignment asynchronously; stdout -> sam file,
        # stderr -> log file. The caller collects the Popen handles.
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read), "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        '''Align every read file against every genome with segemehl.

        Runs up to args_circ.cores alignments in parallel. Returns the list
        of "<read>_<genome>" alignment names and the genome prefixes.
        '''
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                # NOTE(review): num_process is incremented for every entry,
                # even non-fasta files that are skipped below — confirm this
                # throttling behavior is intended.
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    # Throttle: wait for the batch once the core limit is hit.
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        # sam -> bam conversion for a single file.
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        '''Convert .sam files to .bam and classify them for later cleanup.

        Returns (bam_files, convert_ones, remove_ones): all bam paths, bams
        created here that should be deleted afterwards, and sams that can be
        deleted because their bam is kept.
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        # NOTE(review): the ".bam" branch consults convert_ones/remove_ones
        # while they are still being filled, so the result depends on
        # os.listdir ordering — verify whether pre-existing bams can collide
        # with freshly converted ones.
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        # Merge all bams into whole_reads.bam, sort it, then drop the
        # unsorted merge result.
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            # Single input: merging is a plain copy.
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o", os.path.join(sub_alignment_path,
              self.bams["sort"] + ".bam"), whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        # testrealign.x needs a sam with header (-h) as input.
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        '''Merge+sort bams into one sam, then delete intermediate files.'''
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        # Drop reads that were only decompressed temporarily.
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        '''Run segemehl's testrealign.x to detect splice sites for one genome.'''
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
                  os.path.join(segemehl_path, "testrealign.x"),
                  "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                  "-q", os.path.join(sub_alignment_path,
                                     self.bams["sort"] + ".sam"),
                  "-n"])
        os.system(command + " 2>" + err_log)
        # testrealign writes its .bed outputs into the working directory;
        # collect them, then remove the sorted alignment files.
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        '''Combine per-sequence bed outputs into per-fasta merged beds.

        For each input fasta file, gathers the splicesites/transrealigned
        beds of all its sequence headers into a temp folder named after the
        fasta prefix and merges them into *_all.bed files. Returns the list
        of those prefixes.
        '''
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                # Collect every ">" header name in this fasta.
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(os.path.join(
                                              os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(os.path.join(splice_path, header,
                                    self.splices["file"]),
                                    os.path.join(fasta_prefix,
                                    "_".join([self.splices["splice"],
                                              header + ".bed"])))
                    shutil.copyfile(os.path.join(splice_path, header,
                                    self.trans["file"]),
                                    os.path.join(fasta_prefix,
                                    "_".join([self.trans["trans"],
                                              header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix,
                                         self.trans["all_file"])
                if len(headers) > 1:
                    # Multiple sequences: concatenate their beds.
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"] in file_) and (
                                self.splices["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                    fasta_prefix, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                self.trans["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                    fasta_prefix, file_), out_trans)
                else:
                    # Single sequence: just rename its beds.
                    shutil.move(os.path.join(
                                fasta_prefix,
                                "_".join([self.splices["splice"],
                                         headers[0] + ".bed"])),
                                out_splice)
                    shutil.move(os.path.join(
                                fasta_prefix,
                                "_".join([self.trans["trans"],
                                          headers[0] + ".bed"])),
                                out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        '''Compare candidates with the annotation and emit tables/gffs.'''
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            # Move the merged beds from the cwd temp folder into splice_path.
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                                          self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(os.path.join(
                                           self.splice_path, prefix)):
                detect_circrna(os.path.join(self.splice_path, prefix,
                               self.splices["all_file"]), os.path.join(
                               self.gff_path, prefix + ".gff"),
                               os.path.join(self.candidate_path, prefix,
                               "_".join(["circRNA", prefix + "_all.csv"])),
                               args_circ, os.path.join(args_circ.stat_folder,
                               "_".join(["stat_circRNA", prefix + ".csv"])))
                # Produce "all" and "best" gff files from the csv table.
                self.converter.convert_circ2gff(
                     os.path.join(self.candidate_path, prefix,
                                  "_".join(["circRNA",
                                            prefix + "_all.csv"])),
                     args_circ, os.path.join(
                                self.gff_folder, prefix,
                                "_".join([prefix, "circRNA_all.gff"])),
                     os.path.join(self.gff_folder, prefix,
                                  "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        '''Pick the folder of pre-aligned bams when alignment is skipped.

        Fragmented-library bams are copied into the normal-bam folder when
        both are given (and remembered for later removal). Returns
        (merge_folder, remove_frags, bam_files).
        '''
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        '''Full circRNA detection pipeline entry point.'''
        # Validate gff attributes before anything else.
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.align:
            # Align reads here with segemehl.
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            # Use pre-aligned bams supplied by the user.
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                                                   args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        # Clean up tmp folders and the cwd temp prefix folders.
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        # Remove fragment bams copied into the normal-bam folder earlier.
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
Beispiel #24
0
class Terminator(object):
    """Detection of transcription terminators.

    Pipeline: convert gff annotation to ptt/rnt for TransTermHP, run
    TransTermHP, merge the predictions with sRNA/transcript data,
    compute RNA secondary structure with RNAfold, detect poly-T tails
    and expression coverage, then classify the candidates into
    all_candidates / express / best / non_express gff and csv outputs.
    """

    def __init__(self, args_term):
        """Set up helper objects and all working/output paths.

        args_term: argument container with the input folders
        (gffs, fastas, trans, srnas) and the output folder.
        """
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # "tmp" sub-folders that Multiparser creates inside each input folder
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        # final outputs: gff files under "gffs", tables under "tables"
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # one sub-folder per candidate class for the gff results
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "express"),
                      "best": os.path.join(self.outfolder["term"], "best"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_express")}
        # matching sub-folders for the csv tables
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"], "express"),
                     "best": os.path.join(self.outfolder["csv"], "best"),
                     "non": os.path.join(self.outfolder["csv"], "non_express")}
        # folder holding the combined ptt+rnt files fed to TransTermHP
        self.combine_path = os.path.join(self.gff_path, "combine")
        # scratch files/folders, all created under the current working dir
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        # filename suffixes for the generated result files
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        """Concatenate annotation files into combine_file, keeping only
        the lines after each file's "Location" header line.

        NOTE(review): if an input file is empty, `line` is never bound
        and the trailing `"\\n" not in line` check raises NameError —
        presumably the inputs are never empty; confirm upstream.
        """
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    # data rows start after the "Location" header line
                    if "Location" in line:
                        check_start = True
                # ensure each file's content ends with a newline
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        """Create (or reset) all gff and csv output sub-folders."""
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        """Convert every .gff in gff_path to .rnt/.ptt files.

        If sRNAs is given, sRNA gff files are converted alongside and
        the strain is tagged "srna" in file_types, otherwise "normal".
        Returns (file_types dict keyed by strain prefix, list of prefixes).
        Exits if no matching fasta file is found for a gff.
        """
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                # strip the ".gff" suffix to get the strain prefix
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                             fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                            self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        """Merge each strain's .ptt and .rnt (and sRNA .rnt if present)
        into one combined .ptt file for TransTermHP."""
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        """Invoke the TransTermHP binary for one strain.

        Writes the t2t and bag output files into out_path and sends
        stdout to the already-open file handle `out`.
        """
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        """Run TransTermHP on every combined .ptt file, one strain at a
        time, then remove the combined-annotation folder.

        Exits if a strain has no matching fasta file.
        """
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                             self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        """Convert each strain's TransTermHP .bag output to gff and
        combine the per-strain gffs with the genome annotation."""
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                    self.tmps["transterm"],
                                    "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                 os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        """Return the folder of wiggle files to use.

        If both TEX and fragmented libraries exist, copy their wig files
        into a new sibling "merge_wigs" folder; otherwise use whichever
        folder is available. Exits if neither is given.
        """
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            # place merge_wigs next to the tex_wigs folder
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: no proper wig files!!!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        """Merge each strain's annotation gff with its sRNA gff into a
        sorted per-strain gff under tmps["merge"].

        Returns the folder containing the (possibly merged) gffs; when
        no sRNAs are given this is just gff_path unchanged.
        """
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                # start from a clean scratch file for every strain
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        """Sort the raw *_term.gff files, move them to the
        all_candidates gff folder and build the matching csv tables
        from the per-strain raw coverage tables.

        NOTE(review): csv_outfolder is unused; tables are written via
        self.csvs["all"] instead — confirm this is intentional.
        """
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                # sort in place via the scratch tmp.gff file
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                        os.path.join(self.csvs["all"], "_".join([
                            prefix, self.suffixs["csv"]])))
                # write the gff header, then append the sorted entries
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                        os.path.join(term_outfolder, gff),
                        os.path.join(
                            self.terms["all"], "_".join([
                                prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                # rebuild the csv table from scratch
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["strain", "name", "start", "end",
                              "strand", "detect", "coverage_detail"]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                # append each strain's raw table once, on first occurrence
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        """Run RNAfold on tmp_seq, writing structures to tmp_sec.

        Executes inside a scratch folder (RNAfold drops .ps files into
        the current directory) and relies on shell redirection, so the
        command goes through os.system rather than subprocess.
        """
        print("Computing secondray structure of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        """For each strain: extract intergenic sequence, fold it, find
        poly-T candidates and intersect them with coverage data, then
        collect the results via _move_file."""
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            print("Extracting seq of {0}".format(prefix))
            intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                           tran_file, gff_file, tmp_seq)
            self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix)
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term)
            print("detection of terminator")
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete all temporary folders and intermediate files created
        during the run (input tmp folders, scratch folders, and the
        inter_seq/inter_sec/term_candidates files)."""
        self.helper.remove_tmp(args_term.gffs)
        self.helper.remove_tmp(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        # merge_wigs was created by _combine_wigs only in this case
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp(args_term.trans)
        self.helper.remove_tmp(args_term.tex_wigs)
        self.helper.remove_tmp(args_term.frag_wigs)
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        """Renumber terminator IDs/Names in the all-candidate gffs and,
        if requested, generate statistics and the classified
        best/express/non_express outputs via stat_term."""
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # zero-pad the running number to 5 digits for Name
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = "term" + str(num)
                    entry.attributes["Name"] = "_".join(["Terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                  entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"], os.path.join(self.terms["all"],
                            "_".join([new_prefix, self.suffixs["gff"]])))
        if args_term.stat:
            stat_path = os.path.join(args_term.out_folder, "statistics")
            for prefix in new_prefixs:
                stat_term(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["gff"]])),
                          os.path.join(self.csvs["all"],
                          "_".join([prefix, self.suffixs["csv"]])),
                          os.path.join(stat_path,
                          "_".join(["stat", prefix + ".csv"])),
                          os.path.join(self.terms["best"],
                          "_".join([prefix, "term"])),
                          os.path.join(self.terms["express"],
                          "_".join([prefix, "term"])),
                          os.path.join(self.terms["non"],
                          "_".join([prefix, "term"])))
                # stat_term writes csvs next to the gffs; move them to
                # the table folders
                shutil.move(os.path.join(self.terms["best"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["best"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["express"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["express"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["non"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["non"],
                            "_".join([prefix, self.suffixs["csv"]])))
                os.remove(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        """Validate attribute uniqueness of every .gff file in folder."""
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term):
        """Compare terminators against transcripts for each candidate
        class and store one statistics csv per class."""
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        for type_ in ("best", "express", "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator")
            # rename the generic stat file so each class keeps its own
            shutil.move(
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript.csv"),
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript_" + type_ + ".csv"))

    def run_terminator(self, args_term):
        """Entry point: run the full terminator-detection pipeline.

        Exits early if the gff or fasta input folder is missing.
        """
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: please assign gff annotation folder "
                  "and fasta folder!!!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
                self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
                prefixs, merge_path, args_term.wig_path,
                args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
Beispiel #25
0
class MEME(object):
    """Promoter-motif detection around TSSs using the MEME suite.

    Extracts upstream sequences of classified TSSs (primary, secondary,
    internal, antisense, orphan), optionally merges all strains, runs
    `meme` per fasta/width combination and generates csv summary tables.
    """

    def __init__(self, args_pro):
        """Set up helpers and all working paths for the promoter run."""
        self.multiparser = Multiparser()
        self.helper = Helper()
        # "tmp" sub-folders created by Multiparser for the inputs
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # per-TSS-class fasta files; "all"/"all_no_orph" are bare names
        # joined with tmp_folder later in _move_and_merge_fasta
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")}
        # concatenations over all strains, used when --combine is set
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro):
        """Run MEME with a single fixed motif width.

        Skips the run if the output folder already exists.
        """
        print(os.path.join(input_path, fasta))
        folder = "_".join(["promoter_motifs", filename,
                           str(width), "nt"])
        if folder not in os.listdir(out_path):
            call([args_pro.meme_path, "-maxsize", "1000000",
                  "-dna", "-nmotifs", str(args_pro.num_motif),
                  "-w", str(width), "-maxiter", "100",
                  "-evt", str(args_pro.e_value),
                  "-oc", os.path.join(out_path, folder),
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro):
        """Run MEME with a motif-width range given as "min-max".

        Skips the run if the output folder already exists.
        """
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        if folder not in os.listdir(out_path):
            call([args_pro.meme_path, "-maxsize", "1000000",
                  "-dna", "-nmotifs", str(args_pro.num_motif),
                  "-minsites", "0", "-maxsites", "2",
                  "-minw", str(min_width), "-maxw", str(max_width),
                  "-maxiter", "100",
                  "-evt", str(args_pro.e_value),
                  "-oc", os.path.join(out_path, folder),
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        """Return the filename in fasta_path whose basename (without
        .fa/.fna/.fasta extension) equals prefix.

        NOTE(review): if nothing matches, the last file iterated is
        returned (or NameError on an empty folder) — callers presumably
        guarantee a match exists; confirm.
        """
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        """Validate attribute uniqueness of every .gff file in gffs."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        """Build the merged fastas (all types, all-without-orphan) from
        the per-class tmp fastas, de-duplicate them, and move every
        class fasta into input_path under allstrain_* names."""
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"])
        # start from a clean slate for the merged outputs
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa = pri + sec + inter + anti; tmp_all additionally + orph
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"], "_".join([
            out_prefix, "allstrain_primary.fa"]))
        shutil.move(self.fastas["sec"], "_".join([
            out_prefix, "allstrain_secondary.fa"]))
        shutil.move(self.fastas["inter"], "_".join([
            out_prefix, "allstrain_internal.fa"]))
        shutil.move(self.fastas["anti"], "_".join([
            out_prefix, "allstrain_antisense.fa"]))
        shutil.move(self.fastas["orph"], "_".join([
            out_prefix, "allstrain_orphan.fa"]))
        shutil.move(all_type, "_".join([
            out_prefix, "allstrain_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allstrain_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        """Split each allstrain_*.fa into one fasta per strain, based on
        the strain name embedded in the header lines; a split file that
        ends up covering only one strain is removed again.

        NOTE(review): assumes headers look like ">class_id_strain" so
        that "_".join(parts[2:]) is the strain name — confirm against
        the upstream() output format. If input_path holds no .fa file,
        `out` stays None and the final out.close() raises.
        """
        # keep only the merged allstrain files as split sources
        for fasta in os.listdir(input_path):
            if "allstrain" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            # open a new output file whenever the strain
                            # in the headers changes
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allstrain")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                           input_path, "".join([
                                               filename[0], strain,
                                               filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                # a single-strain file duplicates the allstrain file
                if num_strain <= 1:
                    os.remove(os.path.join(input_path,
                              "".join([filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro):
        """Run MEME for every strain, class fasta and requested width;
        widths containing "-" are treated as min-max ranges."""
        for prefix in prefixs:
            print(prefix)
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                          fasta, width))
                    if "-" in width:
                        self._run_small_motif(input_path, out_path, filename,
                                              fasta, width, args_pro)
                    else:
                        self._run_normal_motif(input_path, out_path, filename,
                                               fasta, width, args_pro)

    def _combine_file(self, prefixs, args_pro):
        """Concatenate all strains' TSS gffs and fastas into the
        allfasta files, then extract and merge upstream sequences for
        the combined "allfasta" pseudo-strain (appended to prefixs)."""
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                         self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                         args_pro.fastas, fasta), self.all_fasta)
        else:
            # TSSs were re-classified into TSS_class by _get_upstream
            for tss in os.listdir(os.path.join(
                                  args_pro.output_folder, "TSS_class")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                         self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                         args_pro.fastas, fasta), self.all_fasta)
        print("generating fasta file of all fasta files")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
                                      args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
                                      self.out_fasta, "allfasta"))
        # the merged TSS file is already classified, so treat as source
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None,
                 None, args_pro)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        """Remove tmp input folders, the merged allfasta artifacts and
        the scratch "tmp" folder."""
        self.helper.remove_tmp(args_pro.fastas)
        self.helper.remove_tmp(args_pro.tsss)
        self.helper.remove_tmp(args_pro.gffs)
        self.helper.remove_tmp(args_pro.wigs)
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine):
        """Convert every meme.txt result into a meme.csv table, per
        strain and per motif folder (including "allfasta" if combined).
        """
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            for folder in os.listdir(os.path.join(output_folder, strain)):
                tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
                gen_promoter_table(os.path.join(output_folder, strain,
                                   folder, "meme.txt"),
                                   os.path.join(output_folder, strain,
                                   folder, "meme.csv"), tss_file)

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        """Extract upstream sequences of the TSSs of one strain.

        If the TSSs are not pre-classified (source is False), they are
        first classified into TSS_class, which requires annotation,
        wig folder and tex libs; exits if those are missing.
        """
        if args_pro.source:
            print("generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro)
        else:
            if (args_pro.gffs is None) or (
                    args_pro.wigs is None) or (
                    args_pro.input_libs is None):
                print("Error:please assign proper annotation, tex +/- "
                      "wig folder and tex treated libs!!!")
                sys.exit()
            if "TSS_class" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))
            print("classifying TSS and extracting fasta {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_class",
                     "_".join([prefix, "TSS.gff"])), args_pro)

    def run_meme(self, args_pro):
        """Entry point: run the full promoter-motif pipeline."""
        # clear leftovers from a previous combined run
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                              "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(args_pro.output_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta,
                                                       prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("generating the table...")
        self._gen_table(args_pro.output_folder, prefixs, args_pro.combine)
        self._remove_files(args_pro)
Beispiel #26
0
class RATT(object):
    '''Annotation transfer: run the external RATT tool to project reference
    annotations (embl or genbank) onto target genomes, then convert the
    resulting embl output back to gff/ptt/rnt files.'''
    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # References may be given as genbank (converted to embl later) or
        # directly as embl files; the embl path is what RATT consumes.
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # Scratch locations used while merging/converting RATT output.
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
        }

    def _convert_to_pttrnt(self, gffs, files):
        """For every .gff in *files*, emit matching .ptt/.rnt files next to it,
        using the corresponding target fasta found in the tmp "tar" folder."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                # Same basename, swapping the "gff" suffix for "rnt"/"ptt".
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Replace the gff/ptt/rnt in the output folder with the merged
        versions from the tmp folder and delete all temporary directories."""
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT embl result file to gff, fix its format, copy it
        into the gff output folder, and record the filename in *files*."""
        # RATT result names are dot-separated; the middle parts form the
        # genome name used for the output gff.
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record genbank files into one .gbk per record (named
        from the LOCUS/VERSION accession) under self.gbk; return that folder."""
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            # NOTE(review): `out` is only bound once a LOCUS line is seen; a
            # file whose first line is not LOCUS would hit `if out:` below
            # with `out` unbound (UnboundLocalError) — confirm inputs always
            # start with a LOCUS record.
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        # NOTE(review): str.find returns 0 (falsy) on a match
                        # at index 0 and -1 (truthy) when absent — this likely
                        # intends an inequality test; verify intended behavior.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    # "//" terminates a genbank record: close and move the
                    # accumulated tmp file to its per-record name.
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''Convert genbank references (.gbk/.gbff/.gb) to embl files under
        self.embl; exit with an error message if no genbank file is found.'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Invoke the RATT executable for one target/reference genome pair,
        sending stdout to the log handle *out* and discarding stderr."""
        call([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ],
             stdout=out,
             stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for every "ref:tar" pair, moving "final" result files to
        the output path and cleaning RATT's working files from the CWD."""
        print("Running RATT")
        # NOTE(review): the log is reopened (mode "w+") on every pair but only
        # the last handle is closed after the loop — earlier handles leak and
        # each reopen truncates the log; confirm whether per-pair logs were
        # intended.
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or ("Reference" in filename) or (
                            "Query" in filename) or ("Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt):
        """Top-level driver: prepare fastas, convert genbank references if
        needed, run RATT, optionally convert results to gff/ptt/rnt and merge
        them per target genome, then clean up temporary files."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbk)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            # Re-merge per-sequence gff/ptt/rnt files back into one file per
            # original multi-fasta target ("*_folder" directories come from
            # the multiparser split).
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = ".".join(datas[0].split(".")[:-1])
                    for file_ in os.listdir(
                            os.path.join(args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["rnt"])
                    if os.path.exists(self.tmp_files["gff"]):
                        shutil.move(
                            self.tmp_files["gff"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".gff"))
                        shutil.move(
                            self.tmp_files["ptt"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".ptt"))
                        shutil.move(
                            self.tmp_files["rnt"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".rnt"))
                    else:
                        print("Error: Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).")
        self._remove_files(args_ratt, out_gbk)
Beispiel #27
0
class sRNATargetPrediction(object):
    '''Predict sRNA-target interactions with RNAplex and/or RNAup, then rank
    and merge the results per genome.'''

    def __init__(self, args_tar):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        # Output layout under the result folder.
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup")
        self.merge_path = os.path.join(args_tar.out_folder, "merge")
        # "tmp" subfolders created by the multiparser for the inputs.
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # Temporary filename prefixes/globs used throughout the run.
        self.tmps = {"tmp": "tmp", "rnaup": "tmp_rnaup", "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        """Validate attribute uniqueness for every .gff file in *gffs*."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, vienna_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path):
        """Run RNAplfold on the sRNA or target fasta of *prefix*, writing its
        profile files into *out_path* (RNAplfold writes to the CWD, hence the
        chdir dance)."""
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([os.path.join(vienna_path, "RNAplfold"),
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        # NOTE(review): os.system with an interpolated path goes through the
        # shell — paths with spaces/metacharacters would break; confirm inputs
        # are controlled.
        if file_type == "sRNA":
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([self.tmps["tmp"], prefix,
                                          file_type + ".fa"]))]))
        else:
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([prefix, file_type + ".fa"]))]))
        os.chdir(current)

    def _wait_process(self, processes):
        """Wait for every Popen in *processes*, close its pipes, and kill any
        leftover process (ignoring already-dead ones)."""
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        """Rewrite *fasta* as tmp_<prefix>_sRNA.fa with records sorted by
        sequence length (ascending) and headers truncated at the first '|'."""
        out = open(os.path.join(path,
                   "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w")
        srnas = []
        # Assumes one-line sequences per record (each non-header line is
        # treated as a full sequence) — TODO confirm upstream writers.
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        """Return the concatenated sequence of *fasta_file*, headers skipped."""
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        """Append to *srna_out* the sequences of the sRNAs matching each
        query ("seq_id:strand:start:end") found in the sRNA gff file."""
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[1],
                    "start": int(srna_datas[2]), "end": int(srna_datas[3])}
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                        entry.start == srna["start"]) and (
                        entry.end == srna["end"]):
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                              id_, entry.seq_id, entry.start,
                              entry.end, entry.strand, gene))
                    num += 1
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        """Generate the sRNA fasta files (all sRNAs or only queried ones) and
        the target fasta files, splitting targets into chunks of 100 records;
        appends each genome prefix to *prefixs* (mutated in place)."""
        print("Generating sRNA fasta files...")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out)
                else:
                    # Specific queries append to the file, so remove any
                    # leftover output from a previous run first.
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                       self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files...")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]),
                               "w")
                # Split <prefix>_target.fa into <prefix>_target_<n>.fa chunks
                # of at most 100 records to bound per-process work.
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
                            num += 1
                        if (num == 100):
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open("_".join([sub_prefix,
                                           str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_path, args_tar):
        """Launch one RNAplex process per target chunk of *prefix*, waiting in
        batches of core_plex; return the number of processes started."""
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"])), "w")
                num_process += 1
                p = Popen([os.path.join(args_tar.vienna_path, "RNAplex"),
                           "-q", os.path.join(
                               self.srna_seq_path, "_".join([
                                   self.tmps["tmp"], prefix, "sRNA.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_path], stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        return num_process

    def _rna_plex(self, prefixs, args_tar):
        """Per genome: run RNAplfold for sRNAs and targets, run RNAplex over
        all chunks, merge the chunk outputs into one file, and fix its format."""
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                        os.path.join(self.rnaplex_path, prefix))
            rnaplfold_path = os.path.join(self.rnaplex_path, prefix,
                                          "RNAplfold")
            os.mkdir(rnaplfold_path)
            self._run_rnaplfold(
                args_tar.vienna_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_path)
            self._run_rnaplfold(
                args_tar.vienna_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_path)
            num_process = self._run_rnaplex(prefix, rnaplfold_path, args_tar)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            self.helper.remove_all_content(os.path.join(
                 self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)

    def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar):
        """Start RNAup for tmp input files 1..num_up, then wait and merge the
        per-process outputs/logs, removing the tmp fasta/txt files."""
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            p = Popen([os.path.join(args_tar.vienna_path, "RNAup"),
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            # Shell glob removal of the tmp*.fa / tmp*.txt scratch files.
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        """Concatenate the per-process RNAup result and log files into the
        final out_rnaup / out_log files."""
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        """Support --continue_rnaup: keep all completed sRNA blocks from a
        previous RNAup output (dropping the last, presumably unfinished, one)
        and return the list of sRNA names already done."""
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        # NOTE(review): assumes the file starts with a ">srna..." header line;
        # otherwise `matchs[srna]` raises NameError. Also assumes sRNA headers
        # literally contain "srna" — verify against the header format written
        # upstream.
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar):
        """Per genome: feed each sRNA plus the target fasta to RNAup in
        batches of core_up parallel processes, optionally resuming a previous
        partial run."""
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            # NOTE(review): "_".join on a single-element list is a no-op —
            # probably meant "_".join([prefix, "RNAup.txt"]); harmless as is.
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    srnas = self._get_continue(out_rnaup)
            # Walk the sorted sRNA fasta; each header/sequence pair becomes a
            # tmp<N>.fa file (sRNA first, then all targets appended) which is
            # RNAup's stdin.
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(args_tar.out_folder,
                                      "".join([self.tmps["tmp"],
                                               str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        # NOTE(review): `start` is unbound if the fasta does
                        # not begin with a header line — confirm the input
                        # always starts with ">".
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(args_tar.out_folder,
                                             "".join([self.tmps["tmp"],
                                                      str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes,
                                                out_rnaup, out_log, args_tar)
                                processes = []
                                num_up = 0
            self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar)

    def _merge_rnaplex_rnaup(self, prefixs, args_tar):
        """Rank RNAplex/RNAup results per genome and merge them into the
        combined merge.csv / overlap.csv tables."""
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            self.helper.check_make_folder(os.path.join(
                                          self.merge_path, prefix))
            print("Ranking {0} now...".format(prefix))
            if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex_rank.csv"]))
            if (args_tar.program == "both") or (args_tar.program == "RNAup"):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
            merge_srna_target(rnaplex_file, rnaup_file, args_tar,
                              out_rnaplex, out_rnaup,
                              os.path.join(self.merge_path, prefix,
                                           "_".join([prefix, "merge.csv"])),
                              os.path.join(self.merge_path, prefix,
                                           "_".join([prefix, "overlap.csv"])),
                              os.path.join(self.srna_path,
                                           "_".join([prefix, "sRNA.gff"])),
                              os.path.join(self.gff_path, prefix + ".gff"))

    def run_srna_target_prediction(self, args_tar):
        """Top-level driver: validate inputs, build sequences, run the chosen
        predictor(s), merge/rank results, and clean up temporary files."""
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if (args_tar.program == "both") or (
                args_tar.program == "RNAplex"):
            self._rna_plex(prefixs, args_tar)
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        if (args_tar.program == "both") or (
                args_tar.program == "RNAup"):
            self._rnaup(prefixs, args_tar)
        self._merge_rnaplex_rnaup(prefixs, args_tar)
        if (args_tar.program == "RNAplex") or (
                args_tar.program == "both"):
            for strain in os.listdir(os.path.join(
                          args_tar.out_folder, "RNAplex")):
                shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex",
                                           strain, "RNAplfold"))
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp(args_tar.gffs)
        self.helper.remove_tmp(args_tar.srnas)
        self.helper.remove_tmp(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
Beispiel #28
0
class TSSpredator(object):
    def __init__(self, args_tss):
        """Store helper objects and derive the working paths of the run.

        args_tss: argument container from the ANNOgesic controller; only
        its path attributes are read here.
        """
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Folder that will hold one TSSpredator MasterTable per strain.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of temporary files/folders shared by the methods below.
        self.tmps = {
            "tss": "tmp_TSS",
            "ta_tss": "tmp_ta_tss",
            "tss_ta": "tmp_tss",
            "tmp": "tmp"
        }
        # Transcript files are optional; "ta" stays None when absent.
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        # The multiparser splits inputs into "<folder>/tmp" per strain.
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {
            "wig": lib_datas[0],
            "tex": lib_datas[1],
            "condition": int(lib_datas[2]),
            "replicate": lib_datas[3],
            "strand": lib_datas[4]
        }

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(prefix,
                                                       cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        """Launch the TSSpredator jar for one strain, capturing stdout and
        stderr in log.txt / err.txt inside out_path."""
        print("Running TSSpredator for " + prefix)
        log_name = os.path.join(out_path, "log.txt")
        err_name = os.path.join(out_path, "err.txt")
        with open(log_name, "w") as out, open(err_name, "w") as err:
            call(["java", "-jar", tsspredator_path, config_file],
                 stdout=out,
                 stderr=err)

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0]
                        == lib_datas[0][:-4]) and (filename[1][:-4]
                                                   == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''Write a complete TSSpredator config file for one strain.

        Creates the MasterTable output folder, then emits the fixed keys,
        the library sections (via _import_lib/_print_lib) and the
        user-tunable parameters from args_tss into config_file.
        '''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        # Library/annotation/genome sections plus the grouped libraries
        # needed again for the "normal" sections below.
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(args_tss.cluster +
                                                           1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" (background) libraries are the untreated ones for
        # TSS prediction and the TEX-treated ones otherwise.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        """Convert each strain's MasterTable.tsv into
        "<prefix>_<program>.gff" inside gff_outfolder."""
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            # NOTE(review): out_file is opened (and thus truncated) here
            # while convert_mastertable2gff writes to the same path by
            # name; the handle itself is never written to -- presumably
            # intended only to create/empty the file. Verify before
            # changing.
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                # The gff feature name depends on the prediction mode.
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic",
                    feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''Merge manually detected TSSs with the TSSpredator predictions
        and write a comparison statistic per strain.'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            # Find the genome annotation gff matching this strain prefix;
            # "gff" keeps the matching name after the break.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(predict, stat_file,
                                     os.path.join(self.tmps["tss"], filename),
                                     os.path.join(args_tss.gffs, gff),
                                     args_tss)
            # The statistic is produced in the cwd and moved into place.
            shutil.move(
                stat_file,
                os.path.join(args_tss.out_folder, "statistics", tss,
                             stat_file))
        self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder,
                                     [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''Validate the predicted TSSs/processing sites against the genome
        annotation and update the annotation gff in place.'''
        print("Running validation of annotation....")
        for tss in tsss:
            # Locate the annotation gff matching this strain prefix.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(self.stat_outfolder, tss,
                                     "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            # The predicted-feature file name depends on the program.
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            # The validated annotation replaces the original gff.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''Compare the predicted TSSs with the transcripts and update both
        gff files with the association results.'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None,
                                     "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            # Look for the transcript file of this strain; "ta" keeps the
            # matching name after the break.
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"],
                            args_tss.fuzzy)
                # Replace both inputs with the sorted, annotated versions.
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                # Reset so the next strain is not treated as detected.
                detect = False

    def _stat_tss(self, tsss, feature):
        """Generate classification/library statistics and venn plots for
        every strain and move the produced files into stat_outfolder."""
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(
                    self.stat_outfolder, tss,
                    "_".join(["stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss,
                             "_".join(["stat", feature, "libs", tss]) +
                             ".csv"))
            # Plots are written to the cwd and then collected per strain.
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(prefix, args_tss,
                                         os.path.join(self.gff_path,
                                                      gff), self.wig_path,
                                         os.path.join(self.fasta_path, fasta),
                                         config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate the wig files of the given strain prefix into
        tmp/merge_forward.wig and tmp/merge_reverse.wig, split by strand
        (the last ":"-field of each library string)."""
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                # Forward-strand libraries go into merge_forward.wig ...
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                # ... and reverse-strand libraries into merge_reverse.wig.
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''Re-classify orphan TSSs using the merged coverage wigs; useful
        when the genome annotation has no locus tags.'''
        for prefix in prefixs:
            # Builds tmp/merge_forward.wig and tmp/merge_reverse.wig.
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(
                self.tmps["tmp"], "_".join([prefix,
                                            args_tss.program + ".gff"]))
            pre_tss = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            # The re-classified file replaces the prediction in place.
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete the temporary folders and merged wig files of the run."""
        print("Remove temperary files and folders...")
        for folder in (args_tss.fastas, args_tss.gffs,
                       args_tss.wig_folder, args_tss.ta_files):
            self.helper.remove_tmp(folder)
        for wig_name in ("merge_forward.wig", "merge_reverse.wig"):
            if wig_name in os.listdir(os.getcwd()):
                os.remove(wig_name)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''Resolve TSSs and processing sites located at the same position.

        When overlap_feature is "both", nothing is filtered; otherwise the
        predictions of the current program are filtered against the
        reference predictions of the other program.
        '''
        if args_tss.overlap_feature.lower() == "both":
            return
        print("Comparing TSS and Processing site...")
        program = args_tss.program.lower()
        if program == "tss":
            suffix, ref_suffix = "_TSS.gff", "_processing.gff"
        elif program == "processing_site":
            suffix, ref_suffix = "_processing.gff", "_TSS.gff"
        else:
            return
        for tss in os.listdir(out_folder):
            if tss.endswith(suffix):
                ref = self.helper.get_correct_file(
                    args_tss.references, ref_suffix,
                    tss.replace(suffix, ""), None, None)
                filter_tss_pro(os.path.join(out_folder, tss), ref,
                               args_tss.overlap_feature,
                               args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''Remove low-expressed TSSs/processing sites and record the
        coverage cutoff that was applied for every strain.'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower()
                    == "tss") and (gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower()
                  == "processing") and (gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                stat_path = os.path.join(
                    self.stat_outfolder, prefix,
                    "_".join(["stat", prefix, "low_expression_cutoff.csv"]))
                # "with" closes every stat file; the old code closed only
                # the last handle after the loop and raised NameError when
                # no gff matched at all.
                with open(stat_path, "w") as out:
                    out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                    cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                    out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # Replace the gff with the filtered version.
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None

    def run_tsspredator(self, args_tss):
        """Run the whole TSS/processing-site prediction pipeline: generate
        configs, run TSSpredator per strain, convert/merge the results and
        produce the optional statistics, validation and comparisons.
        """
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        # Split all inputs per strain into the "tmp" sub-folders.
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            config_file = os.path.join(input_folder,
                                       "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        # From here on the shorter "processing" name is used in file names.
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs, os.path.join(args_tss.wig_folder,
                                                     "tmp"), args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder, None,
                                     args_tss.program)
        # Collect the strain prefixes and create their statistics folders.
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None,
                                         args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
Beispiel #29
0
class RATT(object):
    '''annotation transfer'''

    def __init__(self, args_ratt):
        """Store helper objects and the temporary paths of the transfer.

        args_ratt: argument container; either ref_gbk or ref_embls is
        expected to be set. NOTE(review): if neither is set, self.embl is
        never assigned and later methods would fail -- confirm callers
        always provide one of them.
        """
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            # GenBank input: records are split into gbk files and converted
            # to embl format under these folders.
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            # embl files were provided directly.
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # Temporary per-strain files/folders used while converting output.
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        """Generate .ptt/.rnt files next to every transferred .gff file
        for which a matching target fasta exists."""
        for entry in files:
            if not entry.endswith(".gff"):
                continue
            gff = os.path.join(gffs, entry)
            prefix = gff.split("/")[-1][:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                 ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                  None, None)
                log.write("\t" + ptt + " is generated.\n")
                log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        """Move the final output files into gff_outfolder and delete every
        temporary file/folder of the run (out_gbk is unused here but kept
        for the caller's signature)."""
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(args_ratt.gff_outfolder,
                                           suffix, "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        for key in ("out_gff", "tar", "ref"):
            shutil.rmtree(self.tmp_files[key])
        for folder in (args_ratt.tar_fastas, args_ratt.ref_fastas,
                       args_ratt.ref_embls, args_ratt.ref_gbk):
            self.helper.remove_tmp_dir(folder)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        """Convert one RATT embl result file to gff, fix its format and
        register the new file name in files."""
        name_parts = ratt_result.split(".")
        strain = ".".join(name_parts[1:-2])
        filename = strain + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
             os.path.join(args_ratt.output_path, ratt_result), output_file)
        # fix_ratt writes the repaired gff to "tmp_gff" in the cwd.
        self.format_fixer.fix_ratt(output_file, strain, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        final_gff = os.path.join(args_ratt.gff_outfolder, filename)
        shutil.copy(output_file, final_gff)
        log.write("\t" + final_gff + " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record GenBank files into one .gbk file per record.

        Records are delimited by "LOCUS" ... "//". Each record is written
        to a temporary file and moved to <gbk>/<name>.gbk, preferring the
        VERSION accession over the LOCUS name. Returns the gbk folder.
        """
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            # In the old code "out" stayed unbound until the first LOCUS
            # line, crashing on any leading line; track it explicitly.
            out = None
            filename = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        # NOTE(review): str.find is used as a truthiness
                        # test, so the VERSION name only replaces the LOCUS
                        # name when the latter is not a prefix of it --
                        # kept as-is to preserve the original behavior.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//") and (out is not None):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
                        # Reset so trailing lines are not written to the
                        # already-closed handle.
                        out = None
            if (not close) and (out is not None):
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''Convert the reference GenBank files found in ref_embls to embl
        format; exits when no GenBank file is present.'''
        gbks = [os.path.join(ref_embls, name)
                for name in os.listdir(ref_embls)
                if name.endswith((".gbk", ".gbff", ".gb"))]
        out_gbk = None
        if not gbks:
            log.write("--related_gbk_files is assigned, but not gbk files are detected.\n"
                      "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        log.write("Running converter.py to convert gbk file to embl format.\n")
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        log.write("\t" + self.embl + " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        """Invoke the RATT start script for one target/reference pair.

        Exits with an error when the embl folder or either strain fasta
        is missing (i.e. --compare_pair names do not match the inputs).
        stdout goes to "out"; stderr is discarded.
        """
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["tar"], tar + ".fa"))) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the same "
                      "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        # Log the exact command line before executing it.
        log.write(" ".join([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")]) + "\n")
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        """Run RATT for every reference:target pair and tidy its outputs.

        For each pair the RATT log is (re)written to ``self.ratt_log``,
        files whose names contain "final" are moved to the output folder,
        and RATT's other intermediate files/folders are removed from the
        current working directory.
        """
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # Bug fix: the log handle used to be opened once per pair but
            # closed only after the loop — leaking one handle per extra
            # pair and raising NameError when the pair list was empty.
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generated:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    # Keep RATT's final annotation files.
                    log.write("\t" + filename + "\n")
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                      "query" in filename) or (
                      "Reference" in filename) or (
                      "Query" in filename) or (
                      "Sequences" in filename):
                    # Everything else RATT produced is intermediate data.
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        """Transfer annotations from reference genomes to target genomes.

        Runs RATT for every compare pair, converts the resulting final
        embl files to gff/ptt/rnt, and merges the per-replicon outputs
        into one file set per target fasta.
        """
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # NOTE(review): "ref_gbki" looks like a typo for "ref_gbk" —
            # confirm against the args_ratt container definition.
            out_gbk = self._convert_embl(args_ratt.ref_gbki, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        # NOTE(review): "data" is the loop variable from the loop above and
        # is unbound (NameError) when output_path is empty — confirm.
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                # Folder name pattern: <name>.<ext>_folder — recover the
                # fasta prefix without the extension.
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    # Strip the ".fa" suffix to get the replicon name.
                    files.append(file_[:-3])
                # Merge each replicon's gff/ptt/rnt into per-genome tmp files.
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
# ---- Beispiel #30 (Example #30) — separator from the original code listing ----
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''

    def __init__(self, args_tar):
        """Set up helpers and the output/input folder layout.

        *args_tar* is the parsed argument container; only its folder
        attributes (out_folder, srnas, fastas, gffs) are read here.
        """
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        # Per-tool result folders inside the output folder.
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.intarna_path = os.path.join(args_tar.out_folder, "IntaRNA_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        # "tmp" subfolders created by Multiparser for the parsed inputs.
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # Temporary file name prefixes; "all_*" entries are shell globs
        # used for bulk deletion.
        self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path, log):
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([rnaplfold_path,
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        if file_type == "sRNA":
            log.write("<".join([command, os.path.join(current, seq_path,
                                "_".join([self.tmps["tmp"], prefix,
                                          file_type + ".fa"]))]) + "\n")
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([self.tmps["tmp"], prefix,
                                          file_type + ".fa"]))]))
        else:
            log.write("<".join([command, os.path.join(current, seq_path,
                                "_".join([prefix, file_type + ".fa"]))]) + "\n")
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([prefix, file_type + ".fa"]))]))
        os.chdir(current)

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        out = open(os.path.join(path,
                   "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        """Extract the sequences of user-queried sRNAs into *srna_out*.

        Each query is a "seq_id:start:end:strand" string; the matching
        entry is looked up in the sRNA gff file and its sequence cut out
        of the genome fasta. Exits the program when a query matches no
        gff entry.
        """
        for query in querys:
            srna_datas = query.split(":")
            # Query layout: seq_id, start, end, strand.
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            gff_f = open(srna_file, "r")
            # Append mode: successive queries accumulate in the same file.
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            for entry in self.gff_parser.entries(gff_f):
                # A match requires identical replicon, strand, and both
                # coordinates.
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                        entry.start == srna["start"]) and (
                        entry.end == srna["end"]):
                    detect = True
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        # Fall back to feature type plus a running index.
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                              id_, entry.seq_id, entry.start,
                              entry.end, entry.strand, gene))
                    num += 1
            if not detect:
                # NOTE(review): the handles opened above are not closed on
                # this exit path; the interpreter releases them at exit.
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        """Generate the sRNA and target fasta files for every genome.

        Fills *prefixs* in place with the genome prefixes found in the
        parsed sRNA gff folder, writes one length-sorted sRNA fasta per
        genome, then splits each potential-target fasta into numbered
        chunks for the prediction tools.
        """
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    # Extract every sRNA of the genome.
                    self.helper.get_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out)
                else:
                    # Only the queried sRNAs; delete a stale output first
                    # because _get_specific_seq opens it in append mode.
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                       self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                # Split <prefix>_target.fa into <prefix>_target_<n>.fa chunks.
                sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]),
                               "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
#                            line = line.replace("|", "_")
                            num += 1
                        if (num == 100):
                            # NOTE(review): the rollover fires on the 100th
                            # header before it is written, so each chunk
                            # holds 99 records — confirm whether 100 was
                            # intended.
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open("_".join([sub_prefix,
                                           str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        """Run RNAplex against every split target file of *prefix*.

        Jobs are launched in batches of ``core_plex`` processes. Returns
        the number of jobs started, which the caller uses to merge the
        numbered result files.
        """
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        out_files = []  # keep handles so they can be closed after the runs
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"])), "w")
                out_files.append(out_rnaplex)
                num_process += 1
                cmd = [args_tar.rnaplex_path,
                       "-q", os.path.join(
                           self.srna_seq_path, "_".join([
                               self.tmps["tmp"], prefix, "sRNA.fa"])),
                       "-t", os.path.join(self.target_seq_path, seq),
                       "-l", str(args_tar.inter_length),
                       "-e", str(args_tar.energy),
                       "-z", str(args_tar.duplex_dist),
                       "-a", rnaplfold_folder]
                log.write(" ".join(cmd) + "\n")
                processes.append(Popen(cmd, stdout=out_rnaplex))
                # Throttle: wait whenever a full batch has been launched.
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        # Bug fix: the per-job output files were opened but never closed.
        for out_fh in out_files:
            out_fh.close()
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write("The following temporary files for storing results of {0} are "
                  "generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(self.rnaplex_path, prefix, file_) + "\n")
        return num_process

    def _rna_plex(self, prefixs, args_tar, log):
        """Run the full RNAplfold + RNAplex pipeline for every genome.

        For each prefix, accessibility profiles are computed with
        RNAplfold, RNAplex is run per target chunk, and the numbered
        result files are merged into one <prefix>_RNAplex.txt.
        """
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                        os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                          "RNAplfold")
            os.mkdir(rnaplfold_folder)
            # Accessibility profiles for the sRNAs and for the targets.
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_folder, log)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_folder, log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar, log)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            # Start from a clean merged file if one is left over.
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary files.\n")
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(os.path.join(
                 self.rnaplex_path, prefix), "_RNAplex_", "file")
            # Normalize the merged file, then replace it with the fixed copy.
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        """Launch one RNAup child per prepared tmp fasta and collect results.

        Starts jobs for chunks 1..num_up, waits for the whole batch, then
        merges the numbered outputs into *out_rnaup*/*out_log* and removes
        the temporary files.
        """
        for index in range(1, num_up + 1):
            # NOTE(review): these three handles are handed to Popen and not
            # explicitly closed here — _wait_process only closes the Popen
            # pipe attributes, which are None for plain files; confirm
            # before changing.
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            log.write(" ".join([args_tar.rnaup_path,
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"]) + "\n")
            p = Popen([args_tar.rnaup_path,
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write("The following temporary files for storing results of {0} are "
                      "generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(args_tar.out_folder, file_) + "\n")
            # Shell-glob removal of the tmp*.fa inputs and tmp*.txt outputs.
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar, log):
        """Run RNAup for every genome prefix, optionally resuming a run.

        Builds one tmp fasta per sRNA (the sRNA sequence followed by all
        targets), launches RNAup in batches of ``core_up`` jobs, and
        appends the merged results to <prefix>_RNAup.txt.
        """
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            # "_".join on a one-element list is a no-op; the names are
            # simply "<prefix>_RNAup.txt" and "<prefix>_RNAup.log".
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    # Resume: already-finished sRNAs are skipped below.
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            # Done in the previous run.
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(args_tar.out_folder,
                                      "".join([self.tmps["tmp"],
                                               str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            # RNAup reads the sRNA first, then every target.
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(args_tar.out_folder,
                                             "".join([self.tmps["tmp"],
                                                      str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, prefix,
                                                out_rnaup, out_log, args_tar, log)
                                processes = []
                                num_up = 0
            # Flush the last (partial) batch.
            self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                            args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup + " is complete generated and updated.\n")

    def _intarna(self, prefixs, args_tar, log):
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write("Please make sure the version of IntaRNA is at least 2.0.4.\n")
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                        os.path.join(self.intarna_path, prefix))
            call([args_tar.intarna_path,
                  "-q", os.path.join(
                      self.srna_seq_path, "_".join([
                          self.tmps["tmp"], prefix, "sRNA.fa"])),
                  "-t", os.path.join(self.target_seq_path,
                                     prefix + "_target.fa"),
                  "--qAccW", str(args_tar.slide_win_srna),
                  "--qAccL", str(args_tar.max_loop_srna),
                  "--tAccW", str(args_tar.slide_win_target),
                  "--tAccL", str(args_tar.max_loop_target),
                  "--outMode", "C", "-m", args_tar.mode_intarna,
                  "--threads", str(args_tar.core_inta),
                  "--out", intarna_file])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, args_tar, log):
        """Merge and rank the results of RNAplex, RNAup, and IntaRNA.

        For every prefix the per-tool outputs are deduplicated and handed
        to merge_srna_target, which writes the overlap and merged csv
        files under merged_results/.
        """
        log.write("Running merge_rnaplex_rnaup.py to merge the results from "
                  "RNAplex, RNAup, and IntaRNA for generating finanl output.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            # Tool-specific paths stay None when the tool was not selected.
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(
                                          self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(self.intarna_path, prefix,
                                            "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(self.intarna_path, prefix,
                                           "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            # merge_srna_target handles None inputs for unselected tools.
            merge_srna_target(rnaplex_file, rnaup_file, intarna_file, args_tar,
                              out_rnaplex, out_rnaup, out_intarna,
                              os.path.join(self.fasta_path, prefix + ".fa"),
                              merge_file, overlap_file,
                              os.path.join(self.srna_path,
                                           "_".join([prefix, "sRNA.gff"])),
                              os.path.join(self.gff_path, prefix + ".gff"))
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks,
                        out_tmp, print_):
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info,
                      checks, out_tmp, print_, tar):
        """Deduplicate one line of RNAup output (state machine step).

        *checks* maps an sRNA header to the target headers already seen
        for it; data lines are echoed only for unseen pairs. All state
        values are threaded through successive calls by the caller.
        """
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                # Two headers in a row: *pre* is the sRNA, *line* a target.
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                # A header following data lines is another target of the
                # current sRNA (except on the very first input line).
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    # First data line under a new sRNA: emit both headers.
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    # Same sRNA: emit the target header once per target.
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        if (line.startswith(".")) or (
                line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        out_tmp = open(interact_file + "tmp", "w")
        with open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                            line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                            self._remove_rnaup(
                                line, pre, num, pre_num,
                                srna_info, checks, out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                            line, checks, tar, srna_info, seq, out_tmp)
        out_tmp.close()
        shutil.move(interact_file + "tmp", interact_file)


    def run_srna_target_prediction(self, args_tar, log):
        """Entry point: predict sRNA targets with the selected programs.

        Validates and parses the input folders, generates sRNA/target
        fasta files, runs RNAplex, RNAup, and/or IntaRNA as requested,
        merges their results, and removes every temporary file.
        """
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        # _gen_seq fills prefixs in place with the genome prefixes.
        self._gen_seq(prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, args_tar, log)
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        log.write("The temporary files for running RNAplex are deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, args_tar, log)
        # Clean up every temporary folder and file created above.
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
# ---- Beispiel #31 (Example #31) — separator from the original code listing ----
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        # The multiparser splits the inputs per genome into "tmp"
        # sub-folders; every later step reads from those.
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        # Two parallel output trees: one for all CDSs and one restricted
        # to CDSs overlapping a detected transcript ("expressed").
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        # Filename suffixes of the parsed table and the raw psortb output.
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        '''create the output folder skeleton'''
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS'''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    # The four clauses cover every way the CDS can overlap
                    # the transcript: overlap at either end, transcript
                    # contained in CDS, or CDS contained in transcript.
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()
        log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        '''extract the CDS sequences of one genome and translate them'''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            # Expressed mode: keep only CDSs that overlap a transcript.
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])), log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(
                tmp_path, "_".join([prefix, "protein.fa"]))
        # EMBOSS output needs header fixing before psortb accepts it.
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        '''invoke the external psortb binary'''
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result, log):
        '''run psortb for one genome with the configured gram type'''
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err, log)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err, log)
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacteria type! "
                  "Please assign positive or negative.".format(
                  args_sub.gram))
            # Close the handles before aborting; the original leaked them.
            out_err.close()
            out_raw.close()
            sys.exit()
        log.write("\t" + os.path.join(tmp_result, "_".join([
            prefix, self.endfix_raw])) + " is temporary generated.\n")
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file, log):
        '''extract the result of psortb'''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        extract_psortb(os.path.join(
            tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
            os.path.join(tmp_psortb_path, "_".join([
                prefix, self.endfix_table])),
            None, None, args_sub.fuzzy)
        # Typo fixed: "tempoaray" -> "temporary" (matches sibling messages).
        log.write("\t" + os.path.join(tmp_psortb_path, "_".join([
            prefix, self.endfix_table])) + " is temporary generated.\n")

    def _remove_header(self, out_all):
        '''rewrite a merged table so the column header appears only once'''
        tmp_file = out_all + "_tmp"
        # Context managers guarantee the handles are closed even if a
        # malformed row raises inside csv parsing.
        with open(tmp_file, "w") as out, open(out_all, "r") as fh:
            out.write("\t".join(["#Genome", "Protein", "Strand", "Start",
                                 "End", "Location", "Score"]) + "\n")
            for row in csv.reader(fh, delimiter='\t'):
                if row[0] != "#Genome":
                    out.write("\t".join(row) + "\n")
        shutil.move(tmp_file, out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result,
                        log):
        '''merge per-genome psortb tables and compute statistics/figures'''
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                     os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                        psortb_result, prefix,
                        "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_raw,
                            gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_table,
                            gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                log.write("\t" + merge_table + "\n")
                # Merging concatenates headers; keep a single one.
                self._remove_header(merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_folder = os.path.join(stat_path, prefix)
                stat_file = os.path.join(stat_folder, "_".join([
                                      "stat", prefix, "sublocal.csv"]))
                stat_sublocal(merge_table,
                              os.path.join(stat_folder, prefix),
                              stat_file)
                for file_ in os.listdir(stat_folder):
                    log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        '''delete every temporary folder/file created during the run'''
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        '''main entry: predict subcellular localization for every genome'''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff), log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff), log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result, log)
        self._remove_tmps(args_sub)
Beispiel #32
0
class GoTermFinding(object):
    '''Retrieving the GO term'''

    def __init__(self, args_go):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Two parallel output trees: all CDSs vs. expressed CDSs only.
        self.out_all = os.path.join(args_go.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_go.out_folder, "expressed_CDSs")
        self.result_all_path = os.path.join(self.out_all, "GO_term_results")
        self.result_express_path = os.path.join(self.out_express,
                                                "GO_term_results")
        # Per-genome split files produced by the multiparser.
        self.gff_path = os.path.join(args_go.gffs, "tmp")
        if args_go.trans is not None:
            self.tran_path = os.path.join(args_go.trans, "tmp")
        else:
            self.tran_path = None
        self.stat_all_path = os.path.join(self.out_all, "statistics")
        self.stat_express_path = os.path.join(self.out_express,
                                              "statistics")
        # Name of the merged per-genome-folder uniprot table.
        self.all_strain = "all_genomes_uniprot.csv"

    def _retrieve_go(self, uniprot, out_path, type_):
        '''extract GO terms for every genome from the UniProt id mapping'''
        prefixs = []
        for gff in os.listdir(self.gff_path):
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(out_path, prefix))
            out_file = os.path.join(out_path, prefix,
                                    "_".join([prefix, "uniprot.csv"]))
            print("Extracting GO terms of {0} from UniProt".format(prefix))
            if self.tran_path is not None:
                tran_file = os.path.join(self.tran_path,
                                         "_".join([prefix, "transcript.gff"]))
            else:
                tran_file = None
            retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff),
                             out_file, tran_file, type_)

    def _merge_files(self, gffs, out_path, out_folder):
        '''merge the files according to the input genome folder'''
        folders = []
        for folder in os.listdir(gffs):
            if folder.endswith("gff_folder"):
                folder_prefix = folder.replace(".gff_folder", "")
                folder_path = os.path.join(out_folder, folder_prefix)
                self.helper.check_make_folder(folder_path)
                folders.append(folder_path)
                filenames = []
                for gff in os.listdir(os.path.join(gffs, folder)):
                    if gff.endswith(".gff"):
                        filenames.append(gff.replace(".gff", ""))
                out_all = os.path.join(folder_path, self.all_strain)
                if len(filenames) > 1:
                    # Start the merged table from scratch on re-runs.
                    if self.all_strain in os.listdir(folder_path):
                        os.remove(out_all)
                    for filename in filenames:
                        csv_file = "_".join([filename, "uniprot.csv"])
                        self.helper.merge_file(os.path.join(out_path,
                                               filename, csv_file), out_all)
                        shutil.copy(os.path.join(out_path, filename, csv_file),
                                    folder_path)
                else:
                    # Single genome: the merged table is just a copy.
                    shutil.copyfile(os.path.join(out_path, filenames[0],
                                    "_".join([filenames[0], "uniprot.csv"])),
                                    out_all)
        self.helper.remove_all_content(out_path, None, "dir")
        self.helper.remove_all_content(out_path, None, "file")
        for folder in folders:
            # os.path.basename instead of split("/") — portable across OSes.
            folder_prefix = os.path.basename(folder)
            shutil.move(folder, os.path.join(out_path, folder_prefix))

    def _stat(self, out_path, stat_path, go, goslim, out_folder):
        '''map GO terms to GOslim, write stats and collect the figures'''
        for folder in os.listdir(out_path):
            strain_stat_path = os.path.join(stat_path, folder)
            self.helper.check_make_folder(strain_stat_path)
            fig_path = os.path.join(strain_stat_path, "figs")
            # Bug fix: the original tested for "fig" although the folder is
            # named "figs", so os.mkdir crashed on re-runs when "figs"
            # already existed.
            if "figs" not in os.listdir(strain_stat_path):
                os.mkdir(fig_path)
            map2goslim(goslim, go,
                       os.path.join(out_path, folder, self.all_strain),
                       os.path.join(strain_stat_path,
                                    "_".join(["stat", folder + ".csv"])),
                       out_folder)
            # map2goslim drops its plots in out_folder; sort them into figs/.
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_three_roots.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_molecular_function.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_cellular_component.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_biological_process.png"])

    def run_go_term(self, args_go):
        '''main entry: retrieve GO terms for all (and expressed) CDSs'''
        for gff in os.listdir(args_go.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_go.gffs, gff))
        self.multiparser.parser_gff(args_go.gffs, None)
        if args_go.trans is not None:
            self.multiparser.parser_gff(args_go.trans, "transcript")
        print("Computing all CDSs")
        self._retrieve_go(args_go.uniprot, self.result_all_path, "all")
        self._merge_files(args_go.gffs, self.result_all_path, self.out_all)
        self._stat(self.result_all_path, self.stat_all_path, args_go.go,
                   args_go.goslim, self.out_all)
        if args_go.trans is not None:
            print("Computing express CDSs")
            self._retrieve_go(args_go.uniprot, self.result_express_path,
                              "express")
            self._merge_files(args_go.gffs, self.result_express_path,
                              self.out_express)
            self._stat(self.result_express_path, self.stat_express_path,
                       args_go.go, args_go.goslim, self.out_express)
        self.helper.remove_tmp_dir(args_go.gffs)
        if args_go.trans is not None:
            self.helper.remove_tmp_dir(args_go.trans)
Beispiel #33
0
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        # Per-genome split files produced by the multiparser.
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        # Two parallel output trees: all CDSs vs. expressed CDSs only.
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        # Filename suffixes of the parsed table and the raw psortb output.
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        # Create the output folder skeleton.
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file):
        '''compare CDS and transcript to keep only the expressed CDSs'''
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    # The four clauses cover every way the CDS can overlap
                    # the transcript: overlap at either end, transcript
                    # contained in CDS, or CDS contained in transcript.
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        '''extract the CDS sequences of one genome and translate them'''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generate CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            # Expressed mode: keep only CDSs that overlap a transcript.
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])))
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("transfer DNA seq to protein seq of {0}".format(prefix))
        # NOTE(review): "tmp" is a cwd-relative scratch file — breaks if the
        # working directory is not writable; confirm this is intended.
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
                tmp_path, "_".join([prefix, "protein.fa"]))
        # EMBOSS output needs header fixing before psortb accepts it.
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        # Invoke the external psortb binary.
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result):
        '''run psortb for one genome with the configured gram type'''
        print("Running psortb of {0}".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err)
        else:
            # NOTE(review): out_err/out_raw are left open on this path.
            print("Error:It is not a proper bacteria type - {0}!!".format(
                  args_sub.gram))
            sys.exit()
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        '''parse the raw psortb output; optionally merge it into the gff'''
        if args_sub.merge:
            print("Merge to gff...")
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                gff_file, os.path.join(prefix + ".gff"),
                args_sub.fuzzy)
            # Replace the original gff with the annotated copy.
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result):
        '''merge per-genome psortb tables and compute statistics'''
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                     os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                        psortb_result, prefix,
                        "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_raw,
                            gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_table,
                            gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_sublocal(merge_table,
                              os.path.join(
                                  stat_path, prefix, prefix),
                              os.path.join(
                                  stat_path, prefix, "_".join([
                                      "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        # Delete every temporary folder/file created during the run.
        self.helper.remove_tmp(args_sub.fastas)
        self.helper.remove_tmp(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))

    def run_sub_local(self, args_sub):
        '''main entry: predict subcellular localization for every genome'''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed gene now...")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff))
            print("Running all gene now...")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff))
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result)
        self._remove_tmps(args_sub)
Beispiel #34
0
class TranscriptDetection(object):
    '''Detection of transcripts from RNA-Seq coverage (wiggle) files and
    comparison of the detected transcripts with TSSs, genome annotations
    and terminators.'''

    def __init__(self, args_tran):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output layout: gffs/ holds the per-strain transcript GFF files,
        # statistics/ the comparison tables and plots.
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # Names/paths of the temporary files produced while sorting and
        # merging intermediate results.
        self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                     "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                     "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
                     "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
                     "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
                     "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
                     "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                     "overlap": os.path.join(
                         self.gff_outfolder, "tmp_overlap")}
        # Filename suffixes of the per-strain transcript GFF files.
        self.frag = "transcript_fragment.gff"
        self.tex = "transcript_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type, strain,
                            libs, args_tran):
        '''Run transcript detection for one strain; the result is written
        to out_folder/<strain>_<wig_type>.'''
        print("Computing transcripts for {0}".format(strain))
        out = os.path.join(args_tran.out_folder, "_".join([strain, wig_type]))
        detect_transcript(wig_f, wig_r, wig_folder, libs, out, wig_type, args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        '''Detect transcripts for every strain found in the wig folder and
        return the list of strain names.'''
        strains = []
        wig_folder = os.path.join(wigs, "tmp")
        # Strain names are derived from the forward wiggle filenames.
        for wig in os.listdir(wig_folder):
            if wig.endswith("_forward.wig"):
                strains.append(wig.replace("_forward.wig", ""))
        for strain in strains:
            f_file = os.path.join(wig_folder, "_".join(
                [strain, "forward.wig"]))
            r_file = os.path.join(wig_folder, "_".join(
                [strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran, log):
        '''Compare the transcripts with TSSs and update both GFF files
        with the association information.'''
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
                self.gff_outfolder,
                os.path.join(args_tran.compare_tss, "tmp"),
                "transcript", "TSS")
        print("Comaring of transcripts and TSSs")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with TSSs.\n")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                    self.stat_path, "".join([
                        "stat_compare_transcript_TSS_",
                        ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    # Replace the inputs by the sorted, updated versions.
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                            self.tmps["tss_ta"], os.path.join(
                                args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])
            log.write("\t" + stat_tss_out + "\n")

    def _compare_cds(self, tas, args_tran, log):
        '''Compare the transcripts with the genome annotations and update
        both GFF files with the association information.'''
        self.multiparser.parser_gff(args_tran.gffs, None)
        self.multiparser.combine_gff(
            self.gff_outfolder, os.path.join(args_tran.gffs, "tmp"),
            "transcript", None)
        print("Comaring of transcripts and genome annotations")
        cds_folder = os.path.join(args_tran.gffs, "tmp")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with genome annotations.\n")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(self.stat_path, "".join([
                "stat_compare_transcript_genome_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                # gff[:-4] strips ".gff" to get the strain name.
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    # Replace the inputs by the sorted, updated versions.
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.gffs, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(self.tmps["gff_ta"], os.path.join(
                        args_tran.gffs, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])
            log.write("\t" + stat_gff_out + ".\n")

    def _compare_tss_cds(self, tas, args_tran, log):
        '''compare transcript with CDS and TSS'''
        if (args_tran.compare_tss is not None) and (
                args_tran.c_feature is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran, log)
            self._compare_tss(tas, args_tran, log)
        elif (args_tran.c_feature is not None) and (
                args_tran.compare_tss is None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran, log)
        elif (args_tran.c_feature is None) and (
                args_tran.compare_tss is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_tss(tas, args_tran, log)

    def _for_one_wig(self, type_, args_tran):
        '''running transcript detection to one type of wig files'''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Importing {0} wig files".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        # Sort every raw result into gffs/<strain>_transcript_<type_>.gff
        # and drop the unsorted intermediate.
        for strain in strains:
            out = os.path.join(self.gff_outfolder, "_".join([
                strain, "transcript", type_ + ".gff"]))
            print(os.path.join(args_tran.out_folder,
                                 "_".join([strain, type_])))
            self.helper.sort_gff(os.path.join(args_tran.out_folder,
                                 "_".join([strain, type_])), out)
            os.remove(os.path.join(args_tran.out_folder,
                                   "_".join([strain, type_])))
        return strains

    def _for_two_wigs(self, strains, args_tran, log):
        '''merge the results of fragmented and tex treated libs'''
        if (args_tran.frag_wigs is not None) and (
                args_tran.tex_wigs is not None):
            log.write("Running combine_frag_tex.py to merge the results from "
                      "fragmented libs and dRNA-Seq libs.\n")
            print("Merging fragmented and tex treated ones")
            for strain in strains:
                frag_gff = os.path.join(self.gff_outfolder,
                                        "_".join([strain, self.frag]))
                tex_gff = os.path.join(self.gff_outfolder,
                                       "_".join([strain, self.tex]))
                final_gff = os.path.join(self.gff_outfolder,
                                         "_".join([strain, self.endfix_tran]))
                # NOTE(review): assumes both a *_fragment.gff and a
                # *_tex_notex.gff exist for every strain; if one is missing,
                # frag_file/tex_file stays unbound below — confirm upstream
                # always produces both.
                for gff in os.listdir(self.gff_outfolder):
                    if "_transcript_" in gff:
                        filename = gff.split("_transcript_")
                        if (strain == filename[0]) and (
                                "tex_notex.gff" == filename[1]):
                            tex_file = gff
                        elif (strain == filename[0]) and (
                                "fragment.gff" == filename[1]):
                            frag_file = gff
                combine(os.path.join(self.gff_outfolder, frag_file),
                        os.path.join(self.gff_outfolder, tex_file),
                        args_tran.tolerance,
                        os.path.join(self.gff_outfolder,
                                     "_".join([strain, self.endfix_tran])))
                os.remove(frag_gff)
                os.remove(tex_gff)
                log.write("\t" + final_gff + " is generated.\n")
        else:
            # Only one library type: the single result simply becomes the
            # final transcript file.
            if args_tran.frag_wigs is not None:
                for strain in strains:
                    frag_gff = os.path.join(
                            self.gff_outfolder, "_".join([strain, self.frag]))
                    final_gff = os.path.join(
                            self.gff_outfolder,
                            "_".join([strain, self.endfix_tran]))
                    shutil.move(frag_gff, final_gff)
                    log.write("\t" + final_gff + " is generated.\n")
            elif args_tran.tex_wigs is not None:
                for strain in strains:
                    tex_gff = os.path.join(
                            self.gff_outfolder, "_".join([strain, self.tex]))
                    final_gff = os.path.join(
                            self.gff_outfolder,
                            "_".join([strain, self.endfix_tran]))
                    shutil.move(tex_gff, final_gff)
                    log.write("\t" + final_gff + " is generated.\n")

    def _post_modify(self, tas, args_tran):
        '''modify the transcript by comparing with genome annotation'''
        for ta in tas:
            # Find the annotation GFF matching this strain (assumes one
            # exists; the loop variable keeps its value after `break`).
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} by refering to {1}".format(ta, gff))
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                     "_".join([ta, self.endfix_tran])),
                     "overlap", self.tmps["overlap"], args_tran.modify)
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                     "_".join([ta, self.endfix_tran])),
                     "uni", self.tmps["uni"], args_tran.modify)
            tmp_merge = os.path.join(self.gff_outfolder, self.tmps["merge"])
            # BUGFIX: the original tested `self.tmps["merge"] in
            # self.gff_outfolder`, a substring check against the folder
            # *path*, so a stale tmp_merge left by an aborted run was never
            # removed and merge_file would append to it.
            if self.tmps["merge"] in os.listdir(self.gff_outfolder):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder, "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(final_out,
                        os.path.join(self.tmps["tran"],
                                     "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        # Replace the gff output folder by the modified transcripts.
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        '''Clean up all temporary folders/files created during the run.'''
        if "tmp_wig" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "tmp_wig"))
        if "merge_wigs" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "merge_wigs"))
        self.helper.remove_tmp_dir(args_tran.gffs)
        self.helper.remove_tmp_dir(args_tran.compare_tss)
        self.helper.remove_tmp_dir(args_tran.terms)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran, log):
        '''searching the associated terminator to transcript'''
        if args_tran.terms is not None:
            print("Comparing between terminators and transcripts")
            self.multiparser.parser_gff(args_tran.terms, "term")
            if args_tran.gffs is not None:
                self.multiparser.combine_gff(
                    args_tran.gffs,
                    os.path.join(args_tran.terms, "tmp"), None, "term")
            log.write("Running compare_tran_term.py to compare transcripts "
                      "with terminators.\n")
            compare_term_tran(self.gff_outfolder,
                              os.path.join(args_tran.terms, "tmp"),
                              args_tran.fuzzy_term, args_tran.fuzzy_term,
                              args_tran.out_folder, "transcript",
                              args_tran.terms, self.gff_outfolder)
            for file_ in os.listdir(os.path.join(args_tran.out_folder, "statistics")):
                if file_.startswith("stat_compare_transcript_terminator_"):
                    log.write("\t" + file_ + " is generated.\n")

    def _re_table(self, args_tran, log):
        '''Regenerate the coverage columns of the transcript tables.'''
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for gff in os.listdir(self.gff_outfolder):
            if os.path.isfile(os.path.join(self.gff_outfolder, gff)):
                tran_table = os.path.join(args_tran.out_folder, "tables",
                                          gff.replace(".gff", ".csv"))
                reorganize_table(args_tran.libs, args_tran.merge_wigs,
                                 "Coverage_details", tran_table)
                log.write("\t" + tran_table + "\n")

    def _list_files(self, folder, log, end):
        '''Log the generated files in folder; if end is given, only those
        with that suffix.'''
        log.write("The following files in {0} are generated:\n".format(folder))
        for file_ in os.listdir(folder):
            if (end is not None) and (file_.endswith(end)):
                log.write("\t" + file_ + "\n")
            elif end is None:
                log.write("\t" + file_ + "\n")

    def run_transcript(self, args_tran, log):
        '''Entry point: detect transcripts, merge library types, modify by
        annotation, run all comparisons, and generate tables/plots.'''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            log.write("No wig file is assigned.\n")
            print("Error: There is no wiggle file!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on fragmented libs.\n")
            strains = self._for_one_wig("fragment", args_tran)
        # When both library types are present, the strain lists coincide,
        # so overwriting `strains` here is harmless.
        if args_tran.tex_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on dRNA-Seq libs.\n")
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran, log)
        tas = []
        if "none" not in args_tran.modify:
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(os.path.join(args_tran.gffs, gff),
                                         self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(args_tran.gffs, os.path.join(
                args_tran.gffs, "tmp"), None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff"):
                    # Skip empty transcript files.
                    if os.path.getsize(os.path.join(self.tran_path, ta)) != 0:
                        tas.append(ta.replace("_" + self.endfix_tran, ""))
            log.write("Running fill_gap.py to modify transcripts "
                      "based on genome annotations.\n")
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran, log)
        self._compare_term_tran(args_tran, log)
        print("Generating tables for the details")
        log.write("Running gen_table_tran.py to generate the table of transcripts.\n")
        gen_table_transcript(self.gff_outfolder, args_tran)
        self._list_files(os.path.join(args_tran.out_folder, "tables"), log, None)
        log.write("Running plot_tran to plot the distribution of the length of "
                  "the transcripts.\n")
        plot_tran(self.gff_outfolder, self.stat_path, args_tran.max_dist)
        self._list_files(self.stat_path, log, ".png")
        self._re_table(args_tran, log)
        self._remove_file(args_tran)
Beispiel #35
0
class Ribos(object):
    '''detection of riboswitch and RNA thermometer'''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # Each input folder is pre-split into per-strain files under tmp/.
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # CONSISTENCY FIX: compare case-insensitively, as run_ribos does.
        # Otherwise e.g. program="Both" would skip folder setup here but
        # still dispatch in run_ribos and fail on missing attributes.
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                args_ribo.ribos_out_folder, "riboswitch",
                args_ribo.database)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                args_ribo.thermo_out_folder, "RNA_thermometer",
                args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        '''Build the output folder paths, temp-file paths, Rfam model path
        and filename suffixes for one feature (riboswitch/thermometer).'''
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {"fasta": os.path.join(
                              out_folder, "tmp_fasta"),
                     "scan": os.path.join(
                              out_folder, "tmp_scan"),
                     "table": os.path.join(
                              out_folder, "tmp_table")}
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {"csv": feature + ".csv",
                   "txt": feature + "_prescan.txt",
                   "re_txt": feature + "_scan.txt",
                   "re_csv": feature + "_scan.csv"}
        return (stat_folder, gff_outfolder, table_folder, scan_folder,
                tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files,
                    suffixs, rfam):
        '''Run Infernal cmscan on seq against the Rfam model and return the
        path of the captured output file.'''
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        # `with` guarantees the output file is closed even if cmscan fails.
        with open(scan_file, "w") as scan:
            call([args_ribo.cmscan_path, "--incE",
                  str(args_ribo.e_value), "--acc", rfam, seq], stdout=scan)
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam):
        '''extract the seq of candidates and scanning the candidates'''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for {0}".format(
                      prefix))
                extract_potential_rbs(
                      os.path.join(self.fasta_path, prefix + ".fa"),
                      os.path.join(self.gff_path, gff),
                      os.path.join(self.tss_path, prefix + "_TSS.gff"),
                      os.path.join(self.tran_path, prefix + "_transcript.gff"),
                      first_seq, args_ribo, feature)
                # First pass: pre-scan all candidate sequences.
                print("Pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_cmscan(
                        args_ribo, first_seq, "txt", prefix, tmp_files,
                        suffixs, rfam)
                sec_seq = os.path.join(tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                        tmp_files["table"],
                        "_".join([prefix, suffixs["csv"]]))
                # Regenerate a narrowed sequence set from the pre-scan hits
                # and scan again for the final result.
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                print("Scanning of {0}".format(prefix))
                sec_scan_file = self._run_cmscan(
                        args_ribo, sec_seq, "re_txt", prefix, tmp_files,
                        suffixs, rfam)
                sec_table = os.path.join(
                        tmp_files["table"],
                        "_".join([prefix, suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id, gff_outfolder,
                       feature):
        '''merge the results from the results of two searching'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                                              scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                # Walk the annotation once; each new seq_id contributes its
                # per-strain table and scan outputs to the merged result.
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            # First strain: start the merged table.
                            shutil.copyfile(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            # Later strains: append to the merged table.
                            self.helper.merge_file(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                        stat_folder,
                        "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                stat_and_covert2gff(os.path.join(
                    table_folder, "_".join([prefix, suffixs["csv"]])),
                    feature_id, os.path.join(gff_outfolder,
                        "_".join([prefix, feature + ".gff"])),
                    args_ribo.fuzzy, out_stat, feature)
                fh.close()

    def _remove_tmp(self, args_ribo):
        '''Clean up the per-strain tmp folders of all input folders.'''
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs):
        '''Drop overlapping candidates from each per-strain table.'''
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(
                    os.path.join(os.path.join(
                        tmp_files["table"],
                        "_".join([gff.replace(".gff", ""),
                                  suffixs["csv"]]))),
                    os.path.join(gff_path, gff))

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder):
        '''main part of detection'''
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        print("Compressing Rfam of " + feature)
        call([args_ribo.cmpress_path, "-F", rfam])
        prefixs = []
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(
                prefixs, args_ribo, tmp_files, suffixs, feature, rfam)
        self._remove_overlap(self.gff_path, tmp_files, suffixs)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature)
        mapping_ribos(table_folder, feature_id, feature)
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo):
        '''Entry point: validate arguments, split inputs per strain, and run
        the detection for riboswitches and/or RNA thermometers.'''
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_ribo.gffs, gff))
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            print("Detecting riboswtiches now")
            self._core_prediction(
                    args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                    self.ribos_tmp_files, self.ribos_table_folder,
                    "riboswitch", self.ribos_scan_folder, self.ribos_suffixs,
                    self.ribos_stat_folder, self.ribos_gff_outfolder,
                    args_ribo.ribos_out_folder)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(
                    args_ribo, args_ribo.thermo_id, self.thermo_rfam,
                    self.thermo_tmp_files, self.thermo_table_folder,
                    "RNA_thermometer", self.thermo_scan_folder,
                    self.thermo_suffixs, self.thermo_stat_folder,
                    self.thermo_gff_outfolder, args_ribo.thermo_out_folder)
        self._remove_tmp(args_ribo)
Beispiel #36
0
class TSSpredator(object):
    """Wrapper around the TSSpredator Java tool.

    Builds TSSpredator configuration files from ANNOgesic's library and
    parameter options, runs the tool once per genome, converts the
    resulting MasterTables to GFF, and post-processes the predictions
    (manual-TSS merging, TSS/processing-site overlap filtering,
    validation against genome annotations and transcripts, statistics).
    """

    def __init__(self, args_tss):
        # Helper objects shared by all steps.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output folder for the per-genome TSSpredator MasterTables.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of the temporary files/folders used during post-processing.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        # "tmp" subfolders are created later by the multiparser splitting.
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Turn a split library spec (wig:tex:cond:rep:strand) into a dict."""
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        """Write one config line per condition/replicate for a library group.

        Replicates missing from a condition still get an (empty) entry so
        TSSpredator sees the same replicate set for every condition.
        """
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): "cond" below is the last item of the previous
            # loop; if cond_sort_list is empty for a condition this raises
            # NameError (or reuses a stale value) — confirm inputs always
            # cover every condition.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        """Invoke the TSSpredator jar for one genome, capturing stdout/err."""
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        """Parse the library specs and write annotation/library/genome lines.

        Groups libraries into fp/fm (TEX+, plus/minus strand) and np/nm
        (TEX-, plus/minus strand), resolves each wig name to the per-genome
        split file in wig_folder, and writes the fivePrime* entries for the
        chosen program ("tss" uses TEX+ libs, "ps" uses TEX- libs).

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id); num_id
        is simply the last condition index.
        """
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # Replace the generic wig name by the per-genome split file
            # ("<name>_STRAIN_<genome>.wig") produced by the multiparser.
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''Write the minNumRepMatches config entries from --repmatch.

        "all_N" applies N to every library; otherwise the most common
        per-library value becomes the global default and the remaining
        libraries get per-library overrides.'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            # The most frequent replicate-match value becomes the default.
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        """Load optimized parameters from best_<prefix>.csv in auto_load.

        The last line's second column is expected to encode 7 parameter
        pairs ("he", "rh", "fa", "rf", "bh", "ef", "pf" each followed by a
        value). Exits the process if the file is missing or malformed.
        """
        detect = False
        for best_file in os.listdir(args_tss.auto_load):
            if best_file == "_".join(["best", prefix + ".csv"]):
                bh = open(os.path.join(args_tss.auto_load, best_file),"r" )
                lines = bh.readlines()
                bh.close()
                if len(lines[len(lines)-1].split("\t")) < 8:
                    print("Error: some information in {0} is missing. "
                          "It may be due to that \"optimize_tss_ps\" did "
                          "not finish successfully.".format(best_file))
                    log.write("Error: some information in {0} is missing. "
                              "It may be due to that \"optimize_tss_ps\" did "
                              "not finish successfully.\n".format(best_file))
                    sys.exit()
                else:
                    para_info = lines[len(lines)-1].split("\t")[1].split("_")
                    detect_all =  all(elem in para_info
                            for elem in ["he", "rh", "fa", "rf",
                                         "bh", "ef", "pf"])
                    if (not detect_all) or (len(para_info) != 14):
                        # NOTE(review): the message likely means
                        # "{0} is NOT complete" — confirm before rewording.
                        print("Error: {0} is complete. Some parameters are "
                              "missing!".format(best_file))
                        log.write("Error: {0} is complete. Some parameters "
                                  "are missing!\n".format(best_file))
                        sys.exit()
                    else:
                        detect = True
                        # Each keyword is followed by its value in para_info.
                        height = para_info[para_info.index("he") + 1]
                        height_reduction = para_info[
                            para_info.index("rh") + 1]
                        factor = para_info[para_info.index("fa") + 1]
                        factor_reduction = para_info[
                            para_info.index("rf") + 1]
                        base_height = para_info[
                            para_info.index("bh") + 1]
                        enrichment_factor = para_info[
                            para_info.index("ef") + 1]
                        processing_factor = para_info[
                            para_info.index("pf") + 1]
        if detect:
            return height, height_reduction, factor, factor_reduction, \
                   base_height, enrichment_factor, processing_factor
        else:
            print("Error: No best_{0}.csv can be found in {1}! ".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        """Pick the user-supplied parameter set for one genome.

        Without --genome_order the first value of each parameter list is
        used; otherwise the values at the genome's index are used. Exits
        if the genome is not listed in genome_order.
        """
        if args_tss.genome_order is None:
            height = args_tss.height[0]
            height_reduction = args_tss.height_reduction[0]
            factor = args_tss.factor[0]
            factor_reduction = args_tss.factor_reduction[0]
            base_height = args_tss.base_height[0]
            enrichment_factor = args_tss.enrichment_factor[0]
            processing_factor = args_tss.processing_factor[0]
        else:
            if prefix not in args_tss.genome_order:
                print("Error: the parameters for {0} were not assigned!".format(
                    prefix))
                log.write("Error: the parameters for {0} were not assigned!\n".format(
                    prefix))
                sys.exit()
            else:
                index = args_tss.genome_order.index(prefix)
                height = args_tss.height[index]
                height_reduction = args_tss.height_reduction[index]
                factor = args_tss.factor[index]
                factor_reduction = args_tss.factor_reduction[index]
                base_height = args_tss.base_height[index]
                enrichment_factor = args_tss.enrichment_factor[index]
                processing_factor = args_tss.processing_factor[index]
        return height, height_reduction, factor, factor_reduction, \
               base_height, enrichment_factor, processing_factor

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''Generate the TSSpredator config file for one genome.

        Parameters come either from an optimization run (auto_load) or
        from the command line; libraries are written via _import_lib and
        the normal* coverage entries use the opposite TEX set from the
        fivePrime* entries.'''
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._extract_best_para(args_tss, project_strain_name, log)
        else:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  factor_reduction))
        out.write("minCliffHeight = {0}\n".format(height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  height_reduction))
        out.write("minNormalHeight = {0}\n".format(base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" coverage libs are the TEX set not used as fivePrime.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        """Convert each genome's MasterTable.tsv into a GFF output file."""
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''Merge manually-detected TSSs with TSSpredator predictions.

        For each genome with a manual GFF, merge_manual_predict_tss writes
        the merged file into the tmp_TSS folder plus a comparison stat
        table; the merged GFFs then replace the predicted ones.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # Find the matching annotation GFF; "gff" keeps the value from
            # the break (assumes a match exists — TODO confirm).
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''Validate TSSs/PSs against the genome annotation.

        validate_gff also rewrites the annotation GFF (tmp.gff is moved
        back over the original).'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''Compare TSSs with transcripts and write the comparison stats.'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                # Sorted temp files replace both the TSS GFF and the
                # transcript GFF, then the temp files are removed.
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        """Run per-genome TSS statistics and Venn plots, moving the
        generated PNG/TSV outputs into each genome's statistics folder."""
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        """Collect genome prefixes that have matching FASTA, GFF and wig.

        NOTE(review): "detect" is never reset to False inside the loop, so
        once one genome matches a wig file, every later fasta/gff pair is
        appended regardless of wig coverage; "run" is set but never used.
        Confirm whether this is intended.
        """
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate the per-genome wig files into tmp/merge_forward.wig
        and tmp/merge_reverse.wig (by strand) for downstream checks."""
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''Re-classify orphan TSSs using merged coverage wigs; used when
        the genome annotation lacks locus tags for classification.'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete the temporary folders and merged wig files of a run."""
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''Filter TSSs and processing sites that sit at the same position
        (only when --overlap_feature is set); the opposite feature type's
        GFF serves as the reference.'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''Filter out low-expressed TSSs/PSs and record the coverage
        cutoff per genome.

        NOTE(review): "out" is opened per matching GFF but only closed once
        after the loop — earlier handles leak, and if no GFF matches the
        final out.close() raises NameError. Confirm before refactoring.'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # Replace the GFF with the filtered version.
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss, log):
        """Top-level pipeline: configure and run TSSpredator per genome,
        then convert, filter, merge, validate and summarize the results."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        # Split all inputs per genome into their "tmp" subfolders.
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # From here on "ps" is referred to as "processing" in file names.
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                         None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
# Beispiel #37
# 0
class TargetFasta(object):
    '''Generation of the updated (mutated) target fasta files.

    Applies the mutations listed in a tab-separated mutation table to
    the reference sequences and writes one updated fasta file per
    genome (optionally combined into a single file).
    '''

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Copy the reference fasta files into a temporary working
        folder and (re)create an empty temporary target folder.

        Returns the path of the new reference folder.
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Always start from a clean temporary target folder.
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files, combine,
                         out_folder):
        '''Apply mut_table to the references and write the updated
        genome fasta files into out_folder/fasta_files.

        If combine is truthy, all updated fasta files are merged into
        one "updated_genomes.fa" file.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Updating the reference sequences")
        pre_strain = None
        out = None
        # The mutation table is tab separated; column 1 holds the
        # genome (strain) name, rows starting with "#" are comments.
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                if row[0].startswith("#"):
                    continue
                strain = row[1]
                # Open a new output file only when the genome changes
                # (the old code never updated pre_strain and rewrote
                # the same file once per table row).
                if pre_strain != strain:
                    fasta = os.path.join(out_folder, "fasta_files",
                                         strain + ".fa")
                    if out is not None:
                        out.close()
                    out = open(fasta, "w")
                    if strain + ".fa" in os.listdir(self.folders["tmp_tar"]):
                        with open(
                                os.path.join(self.folders["tmp_tar"],
                                             strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        print("Error: No fasta information of {0}.fa".format(
                            strain))
                pre_strain = strain
        # Guard against a table that contains no data rows.
        if out is not None:
            out.close()
        if combine:
            # Concatenate all updated fasta files without invoking a
            # shell ("cat ... >> ..." broke on names with spaces).
            out_seq = "updated_genomes.fa"
            if os.path.exists(out_seq):
                os.remove(out_seq)
            fasta_folder = os.path.join(out_folder, "fasta_files")
            with open(out_seq, "w") as merged:
                for seq in os.listdir(fasta_folder):
                    if seq.endswith(".fa"):
                        with open(os.path.join(fasta_folder, seq)) as fh:
                            shutil.copyfileobj(fh, merged)
                        os.remove(os.path.join(fasta_folder, seq))
            shutil.move(out_seq,
                        os.path.join(fasta_folder, out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta files to remapping again.")
Beispiel #38
0
class RATT(object):
    '''Annotation transfer from reference genomes to target genomes
    via RATT (Rapid Annotation Transfer Tool).'''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # Temporary folders/files used while splitting and converting
        # the reference annotation files.
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        '''Generate .ptt/.rnt tables from every transferred .gff file
        for which the matching genome fasta exists.'''
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        '''Move the final gff/ptt/rnt files into place and delete all
        temporary folders of the transfer run.'''
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        '''Convert one RATT ".final.embl" result file to gff3 format
        and register the produced filename in files.'''
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
             os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split multi-record Genbank files into one .gbk file per
        record, named after the LOCUS/VERSION accession.

        Returns the folder that holds the splitted .gbk files.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            # Handle of the record currently being written; None while
            # no record is open. (The old code referenced "out" before
            # assignment when a file did not start with a LOCUS line,
            # and wrote to an already-closed handle between records.)
            out = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        # NOTE(review): str.find() is truthy unless the
                        # match is at index 0; this looks like it was
                        # meant to compare the two names. Kept as-is to
                        # preserve the original naming behavior.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    # "//" terminates a Genbank record.
                    if line.startswith("//") and (out is not None):
                        out.close()
                        out = None
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            # Close a trailing record that was not terminated by "//".
            if out is not None:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''Collect the .gbk files in ref_embls and convert them to embl
        format; exits when no Genbank file is found.'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if embl.endswith(".gbk"):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        '''Invoke the RATT executable for one reference/target pair,
        writing stdout to the given log handle.'''
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        '''Run RATT for every "ref:target" pair and tidy up the files
        RATT leaves in the current working directory.'''
        print("Running RATT...")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            print(tar)
            self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                      "query" in filename) or (
                      "Reference" in filename) or (
                      "Query" in filename) or (
                      "Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
            # Close the log inside the loop: the old code leaked one
            # handle per pair and raised NameError for an empty pairs
            # list.
            out.close()

    def annotation_transfer(self, args_ratt):
        '''Main entry point: run RATT for all pairs and convert the
        results to per-genome gff/ptt/rnt files.'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            # Merge the per-segment files of every target genome back
            # into one gff/ptt/rnt triple per genome.
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = datas[0][:-3]
                    for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                         folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["rnt"])
                    shutil.move(self.tmp_files["gff"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".rnt"))
        self._remove_files(args_ratt, out_gbk)
Beispiel #39
0
class Terminator(object):
    '''detection of terminator'''

    def __init__(self, args_term):
        '''Set up helpers, parsed-input sub-paths and all output and
        temporary locations used by the terminator detection.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # The parsed (splitted) inputs live in "tmp" sub-folders.
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # One gff folder and one table folder per candidate category.
        categories = (("all", "all_candidates"),
                      ("express", "expressed_candidates"),
                      ("best", "best_candidates"),
                      ("non", "non_expressed_candidates"))
        self.terms = {key: os.path.join(self.outfolder["term"], name)
                      for key, name in categories}
        self.csvs = {key: os.path.join(self.outfolder["csv"], name)
                     for key, name in categories}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Scratch locations in the current working directory.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        self.srna_path = (os.path.join(args_term.srnas, "tmp")
                          if args_term.srnas else None)
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        with open(combine_file, 'w') as result:
            for file_ in files:
                if (file_.endswith(".ptt")) and (os.stat(file_).st_size == 0):
                    print("Warning: No CDS information, "
                          "TransTermHP can not work!")
                    return "NO_CDS"
                if os.path.exists(file_) and (
                        os.stat(file_).st_size != 0):
                    check_start = False
                    fh = open(file_, 'r')
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                    if "\n" not in line:
                        result.write("\n")
                    fh.close()
        return "Normal"

    def _make_gff_folder(self):
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log):
        '''Generate .ptt/.rnt files (and sRNA .rnt files when sRNA gff
        data is given) from every genome gff in gff_path.

        Returns a (file_types, prefixs) tuple: file_types maps each
        genome prefix to "srna" or "normal", prefixs lists all genome
        prefixes that were found.
        '''
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                # Genome prefix = gff filename without the ".gff" suffix.
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                # The matching genome fasta is mandatory for conversion.
                fasta = self.helper.get_correct_file(
                             fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa can not be found.\n".format(prefix))
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                            self.srna_path, "_sRNA.gff", prefix, None, None)
                    # With sRNA data, additionally produce an sRNA .rnt.
                    if (srna) and (fasta):
                        log.write("Running converter.py to convert {0} and "
                                  "{1} to {2}, {3}, and {4}.\n".format(
                            gff_file, srna, ptt_file, rnt_file,
                            srna.replace(".gff", ".rnt")))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n\t{2}\n".format(
                            ptt_file, rnt_file, srna.replace(".gff", ".rnt")))
                    # No sRNA gff for this genome: plain conversion.
                    if (not srna) and (fasta):
                        log.write("Running converter.py to convert {0} "
                                  "to {1}, and {2}.\n".format(
                            gff_file, ptt_file, rnt_file))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
                else:
                    log.write("Running converter.py to convert {0} "
                              "to {1}, and {2}.\n".format(
                        gff_file, ptt_file, rnt_file))
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
                    log.write("The following files are generated:\n")
                    log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                check = self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                check = self._combine_annotation(combine_file, files)
        return check

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term, log):
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)
        log.write(" ".join([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))]) + "\n")

    def _run_TransTermHP(self, args_term, log):
        '''Run TransTermHP on every combined .ptt file and collect the
        per-genome output files under args_term.hp_folder.'''
        self.helper.check_make_folder(self.tmps["transterm"])
        log.write("Running TransTermHP.\n")
        log.write("Make sure the version is at least 2.09.\n")
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                # TransTermHP needs the matching genome fasta.
                fasta = self.helper.get_correct_file(
                             self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa can not be found!.\n".format(prefix))
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term, log)
                log.write("Done!\n")
                log.write("The following files are generated in {0}.\n".format(
                    out_path))
                # NOTE: this inner loop shadows the outer variable
                # "file_"; harmless here because the outer value is not
                # used after this point.
                for file_ in os.listdir(out_path):
                    log.write("\t" + file_ + "\n")
                out.close()
        # The combined .ptt tables are no longer needed.
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term, log):
        log.write("Running coverter.py to convert the results of TransTermHP "
                  "to gff3 format.\n")
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                    self.tmps["transterm"],
                                    "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                 os.path.join(out_path, file_), out_file)
                            log.write("\t" + out_file + " is generated.\n")
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information'''
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        '''Sort every *_term.gff result, move it into the
        "all_candidates" gff folder (prefixed with a gff-version
        header) and build the matching per-genome csv table from the
        raw term tables.'''
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                # Sort in place via the temporary gff file.
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                        os.path.join(self.csvs["all"], "_".join([
                            prefix, self.suffixs["csv"]])))
                # Start the combined gff with the version pragma, then
                # append the sorted terminator entries.
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                        os.path.join(term_outfolder, gff),
                        os.path.join(
                            self.terms["all"], "_".join([
                                prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                # Recreate the csv table from scratch for this prefix.
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                              "Strand", "Detect", "Coverage_decrease",
                              "Coverage_detail"]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                # Append the raw table of every distinct seq_id found
                # in the combined gff (entries are grouped by genome).
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log):
        '''Run RNAfold on tmp_seq and store the secondary structures in
        tmp_sec; both paths are relative to the current working
        directory.'''
        log.write("Computing secondray structures of {0}.\n".format(prefix))
        log.write("Make sure the version of Vienna RNA package is at least 2.3.2.\n")
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        # Run inside a scratch folder (RNAfold drops auxiliary files
        # into the working directory) and remove the folder afterwards.
        os.chdir(self.tmps["folder"])
        log.write(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]) + "\n")
        # NOTE(review): this command goes through a shell; RNAfold_path
        # and the file names must not contain shell metacharacters.
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]))
        log.write("Done!\n")
        log.write("\t" + tmp_sec + " is generated for storing secondary "
                  "structure.\n")
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term, log):
        '''the approach for searching gene converged region terminator'''
        log.write("Searching terminators which located in gene converged "
                  "region.\n")
        for prefix in prefixs:
            # Temporary per-genome files for the intergenic sequences,
            # their secondary structures and the terminator candidates.
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            # (fixed: the old code assigned tmp_cand twice on one line)
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                log.write("Running get_inter_seq.py to extract the potential "
                          "sequences from {0}.\n".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                               tran_file, gff_file, tmp_seq, tmp_index, args_term)
                log.write("\t" + tmp_seq + " is generated for storing the "
                          "potential sequences.\n")
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix, log)
                log.write("Running extract_sec_info.py to extract the "
                          "information of secondary structure from {0}.\n".format(
                          prefix))
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                log.write("Running get_polyT.py to detect the "
                          "terminator candidates for {0}.\n".format(prefix))
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term)
                log.write("\t" + tmp_cand + " which temporary stores terminator "
                          "candidates is generated.\n")
            print("Detecting terminators for " + prefix)
            log.write("Running detect_coverage_term.py to gain "
                      "high-confidence terminators for {0}.\n".format(prefix))
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete every intermediate folder/file created during the run.

        merge_wigs: path of the merged wiggle folder; only removed when
            both tex-treated and fragmented wig folders were supplied.
        args_term: argument container holding the input/output paths.
        """
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        # sRNA-related temporaries only exist when sRNA input was given.
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        # merge_wigs is a generated folder only when both wig sets exist;
        # otherwise it points at a user-supplied folder and must be kept.
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        # Remove the per-genome scratch files produced by the terminator
        # detection steps (sequences, secondary structures, candidates).
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term, log):
        """Renumber terminator entries, run the statistics step and sort
        the resulting gff/csv files into their final output folders.

        First pass: rewrite every ``*_term_all.gff`` so each entry gets a
        sequential ``ID``/``Name`` attribute. Second pass: call
        ``stat_term`` per genome and move the generated csv tables into
        the best/express/non result folders.
        """
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # zero-pad the running number to width 5 for the Name.
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                            entry.seq_id + "_terminator" + str(num))
                    # NOTE(review): single-element "_".join is a no-op;
                    # equivalent to "terminator_" + name.
                    entry.attributes["Name"] = "_".join(["terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                  entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                # Replace the raw gff with the renumbered version under
                # the final "<prefix>_<gff suffix>" name.
                shutil.move(self.tmps["gff"], os.path.join(self.terms["all"],
                            "_".join([new_prefix, self.suffixs["gff"]])))
        log.write("Running stat_term.py to do statistics.\n")
        stat_path = os.path.join(args_term.out_folder, "statistics")
        log.write("The following files are generated:\n")
        for prefix in new_prefixs:
            # stat_term writes csv tables next to the gff files; the
            # shutil.move calls below relocate them to the csv folders.
            stat_term(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])),
                      os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                      "_".join([prefix, "term"])))
            shutil.move(os.path.join(self.terms["best"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["best"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["express"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["express"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["non"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["non"],
                        "_".join([prefix, self.suffixs["csv"]])))
            # The raw "_term_all" gff is no longer needed.
            os.remove(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["allgff"]])))
            log.write("\t" + os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["best"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["express"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["non"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])) + "\n")
            log.write("\t" + os.path.join(self.csvs["best"],
                        "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["express"],
                        "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["non"],
                        "_".join([prefix, self.suffixs["csv"]])) + "\n")

    def _check_gff_file(self, folder):
        """Validate attribute uniqueness of every ``.gff`` file in *folder*."""
        gff_names = (name for name in os.listdir(folder)
                     if name.endswith(".gff"))
        for gff_name in gff_names:
            self.helper.check_uni_attributes(os.path.join(folder, gff_name))

    def _compare_term_tran(self, args_term, prefixs, log):
        '''searching the associated terminator to transcript

        Compares every candidate set (best/expressed/all) against the
        transcripts and renames the generated statistics files per
        candidate type.

        NOTE(review): the incoming ``prefixs`` argument is immediately
        overwritten by the empty list below and recomputed from the
        transcript folder — confirm with callers whether the parameter
        can be dropped or should actually be used.
        '''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        prefixs = []
        print("Comparing terminators with transcripts now")
        # Re-derive the genome prefixes from the combined transcript gffs.
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        log.write("Running compare_tran_term.py for comparing transcripts "
                  "and terminators.\n")
        log.write("The following files are generated:\n")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                # compare_term_tran always writes the same stat filename;
                # rename it so each candidate type keeps its own copy.
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix + ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript", prefix,
                                  type_ + ".csv"])))
                log.write("\t" + os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript", prefix,
                                  type_ + ".csv"])) + "\n")

    def _re_table(self, args_term, prefixs, log):
        """Add coverage-detail columns to every terminator table in place."""
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        candidate_types = ("all_candidates", "best_candidates",
                           "expressed_candidates", "non_expressed_candidates")
        for candidate_type in candidate_types:
            type_dir = os.path.join(args_term.out_folder, "tables",
                                    candidate_type)
            for table_name in os.listdir(type_dir):
                table_file = os.path.join(type_dir, table_name)
                reorganize_table(args_term.libs, args_term.merge_wigs,
                                 "Coverage_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_terminator(self, args_term, log):
        """Entry point of the terminator detection pipeline.

        Steps, in order: validate inputs, convert gff to rnt/ptt, run
        TransTermHP, merge sRNA data, intersect forward/reverse
        predictions, compute statistics, compare against transcripts,
        rebuild the tables and finally remove all temporaries.
        """
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        # Both gff and fasta inputs are mandatory.
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
                self.gff_path, self.fasta_path, args_term.srnas, log)
        check = self._combine_ptt_rnt(self.gff_path, file_types,
                                      self.srna_path)
        self._run_TransTermHP(args_term, log)
        self._convert_to_gff(prefixs, args_term, log)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        # "NO_CDS" means TransTermHP had nothing to work with, so there
        # is no hairpin gff output to parse.
        if check != "NO_CDS":
            self.multiparser.parser_gff(self.tmps["transterm"],
                                        self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
                prefixs, merge_path, args_term.wig_path,
                args_term.merge_wigs, args_term, log)
        self._compute_stat(args_term, log)
        self._compare_term_tran(args_term, prefixs, log)
        self._re_table(args_term, prefixs, log)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
Beispiel #40
0
class SubLocal(object):
    '''detection of subcellular localization

    Runs PSORTb on the protein sequences derived from the CDS features
    of the input genomes, once for all CDS and (when transcripts are
    supplied) once for the expressed CDS only.
    '''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        # "tmp" subfolders are created by the multiparser when splitting
        # multi-record inputs per genome.
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        # Parallel output trees: one for all CDS, one for expressed CDS.
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        # Filename suffixes of the PSORTb outputs.
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        """Create (or re-create) the output folder skeleton."""
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file):
        '''compare CDS and transcript to find the expressed CDS

        Writes every CDS that overlaps a transcript on the same strand
        and sequence into ``<out_all>/tmp_cds.gff``.
        '''
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    # The four clauses cover: CDS overlapping the
                    # transcript start, overlapping its end, fully
                    # containing it, and fully contained in it.
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        """Extract CDS DNA sequences and translate them to protein.

        Returns the genome prefix (gff filename without extension).
        When *tran_path* is given, only expressed CDS are used.
        """
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])))
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("Transfering DNA seq to protein seq of {0}".format(prefix))
        # NOTE(review): "tmp" is created in the current working
        # directory, not in tmp_path — verify this is intentional.
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
                tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        """Invoke the psortb executable, capturing stdout/stderr."""
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result):
        """Run psortb for one genome, choosing -p/-n by gram type."""
        print("Running psortb of {0}".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err)
        else:
            print("Error: It is not a proper bacteria type - {0}!!".format(
                  args_sub.gram))
            sys.exit()
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        '''extract the result of psortb

        With ``args_sub.merge`` the localization is merged back into the
        input gff (written via a temporary gff in the working
        directory); otherwise only the csv table is produced.
        '''
        if args_sub.merge:
            print("Merging gff")
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                gff_file, os.path.join(prefix + ".gff"),
                args_sub.fuzzy)
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result):
        """Collect per-record psortb outputs per genome and run stats."""
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                     os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                        psortb_result, prefix,
                        "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    # Copy the raw psortb output and append each record's
                    # csv table into the genome-level merge_table.
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_raw,
                            gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_table,
                            gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_sublocal(merge_table,
                              os.path.join(
                                  stat_path, prefix, prefix),
                              os.path.join(
                                  stat_path, prefix, "_".join([
                                      "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        """Delete all temporary folders/files produced by the run."""
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub):
        """Entry point: validate inputs, run psortb for all genomes and
        aggregate results (all CDS and, optionally, expressed CDS)."""
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            # Expressed-CDS pass (only when transcripts were supplied).
            if args_sub.trans is not None:
                print("Running expressed gene now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff))
            # All-CDS pass.
            print("Running all gene now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff))
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result)
        self._remove_tmps(args_sub)
Beispiel #41
0
class SNPCalling(object):
    '''detection of SNP

    Wraps samtools/bcftools to call SNPs per sample (with/without/extend
    BAQ) and post-processes the VCFs per input genome.
    '''
    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Output tree differs depending on whether a related genome or
        # the reference genome itself is analyzed.
        if args_snp.types == "related_genome":
            file_type = "compare_related_and_reference_genomes"
        else:
            file_type = "mutations_of_reference_genomes"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        self.outputs = {
            "table": os.path.join(args_snp.out_folder, file_type,
                                  "SNP_tables"),
            "raw": os.path.join(args_snp.out_folder, file_type,
                                "SNP_raw_outputs"),
            # "tmp" holds the mpileup stream; "depth" is a filename
            # prefix that gets the sample name appended.
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
            "depth": os.path.join(args_snp.out_folder, "tmp_depth")
        }
        self.bams = {
            "whole": os.path.join(args_snp.out_folder, "whole_reads.bam"),
            "sort": os.path.join(args_snp.out_folder,
                                 "whole_reads_sorted.bam"),
            "bams": []
        }
        self.header = os.path.join(args_snp.out_folder, "header")
        # BAQ mode labels used in folder and file names.
        self.baqs = {
            "with": "with_BAQ",
            "without": "without_BAQ",
            "extend": "extend_BAQ"
        }

    def _transcript_snp(self, fasta, out_table_prefix, type_, prefix,
                        bam_datas, table_path, args_snp):
        """Run snp_detect for every sample of one genome and collect the
        generated figures into the figs folder."""
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(
                self.stat_path, "_".join([
                    "stat", "_".join([prefix, self.baqs[type_],
                                      bam["sample"]]), "SNP"
                ]))
            snp_file = os.path.join(
                self.outputs["raw"], prefix,
                "_".join([prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            # outputs["depth"] is a path prefix; the sample name is
            # appended via string concatenation (see _merge_bams).
            snp_detect(
                fasta, snp_file, self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
            self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        """Return the bcftools output option string for the chosen caller
        ("c" = consensus caller, otherwise multiallelic caller)."""
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bcf_para

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas):
        """Run samtools mpileup + bcftools call for every sample.

        ``type_`` selects the BAQ mode (with/without/extend).
        """
        bcf_para = self._get_para(args_snp)
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if type_ == "with":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
            elif type_ == "without":
                # -B disables BAQ computation.
                command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-B"]
            elif type_ == "extend":
                # -E recalculates BAQ on the fly (extended BAQ).
                command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-E"]
            if args_snp.rg:
                command = command + ["-ugf", fasta_file, bam_file]
            else:
                command = command + [
                    "--ignore-RG", "-ugf", fasta_file, bam_file
                ]
            # NOTE(review): shell redirection via os.system; paths with
            # spaces or shell metacharacters would break this.
            os.system(" ".join(command) + ">" + self.outputs["tmp"])
            bam["vcf"] = os.path.join(
                self.outputs["raw"],
                "_".join([self.baqs[type_], bam["sample"] + ".vcf"]))
            # chrom "1" = haploid calling, "2" = default ploidy.
            if args_snp.chrom == "1":
                call([
                    args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                    self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]
                ])
            elif args_snp.chrom == "2":
                call([
                    args_snp.bcftools_path, "call", self.outputs["tmp"],
                    bcf_para, "v", "-o", bam["vcf"]
                ])

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog):
        """Split the combined VCFs per input fasta file.

        Returns the list of fasta prefixes. ``num_prog`` == 0 means the
        first BAQ program run, which also creates the output folders.
        """
        seq_names = []
        fa_prefixs = []
        for fa in os.listdir(args_snp.fastas):
            if (fa != "all.fa") and (not fa.endswith(".fai")):
                with open(os.path.join(args_snp.fastas, fa)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            seq_names.append(line[1:])
                fa_prefix = ".".join(fa.split(".")[:-1])
                fa_prefixs.append(fa_prefix)
                vcf_folder = os.path.join(self.outputs["raw"], fa_prefix)
                if num_prog == 0:
                    self.helper.check_make_folder(vcf_folder)
                    self.helper.check_make_folder(
                        os.path.join(self.outputs["table"], fa_prefix))
                self.helper.check_make_folder(
                    os.path.join(self.seq_path, self.baqs[type_], fa_prefix))
                for vcf in os.listdir(self.outputs["raw"]):
                    if vcf.endswith(".vcf"):
                        out = open(
                            os.path.join(vcf_folder, "_".join([fa_prefix,
                                                               vcf])), "w")
                        with open(os.path.join(self.outputs["raw"],
                                               vcf)) as vh:
                            for line in vh:
                                line = line.strip()
                                if line.startswith("#"):
                                    out.write(line + "\n")
                                else:
                                    # Keep only records whose CHROM column
                                    # belongs to this fasta file.
                                    if line.split("\t")[0] in seq_names:
                                        out.write(line + "\n")
                        out.close()
        # The un-split VCFs are no longer needed.
        for vcf in os.listdir(self.outputs["raw"]):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(self.outputs["raw"], vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog):
        """Run one BAQ mode end-to-end: call SNPs, split per fasta, and
        generate tables/sequences per genome."""
        self._run_tools(all_fasta, type_, args_snp, bam_datas)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog)
        for fa_prefix in fa_prefixs:
            for fasta in os.listdir(args_snp.fastas):
                if fa_prefix in fasta:
                    fasta_file = os.path.join(args_snp.fastas, fasta)
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(
                table_path, "_".join([fa_prefix, self.baqs[type_]]))
            self._transcript_snp(fasta_file, table_prefix, type_, fa_prefix,
                                 bam_datas, table_path, args_snp)

    def _run_program(self, all_fasta, bam_datas, args_snp):
        """Dispatch the requested BAQ programs (with/without/extend)."""
        num_prog = 0
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            self._run_sub(args_snp, all_fasta, type_, bam_datas, num_prog)
            num_prog += 1

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file):
        """Build and run a samtools merge/sort shell command."""
        if sub_command == "merge":
            command = (" ".join(
                [samtools_path, sub_command, self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            if type_file == "all":
                # Sort the previously merged whole_reads.bam.
                command = (" ".join([
                    samtools_path, sub_command, "-o", bam_file,
                    self.bams["whole"]
                ]))
            else:
                command = (" ".join(
                    [samtools_path, sub_command, "-o", bam_file, type_file]))
        os.system(command)

    def _merge_bams(self, args_snp, bam_datas):
        """Merge/sort the BAM files of every sample, index them and dump
        per-sample read depth files."""
        # NOTE(review): bams, num_normal and num_frag are never used —
        # candidates for removal.
        bams = []
        num_normal = 0
        num_frag = 0
        for bam in bam_datas:
            bam["bam_number"] = 0
            out_bam = os.path.join(args_snp.out_folder, bam["sample"] + ".bam")
            if len(bam["bams"]) == 1:
                # Single BAM: sort it directly into the output file.
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam,
                              bam["bams"][0])
                bam["bam_number"] = 1
            else:
                # Multiple BAMs: merge into whole_reads.bam, then sort.
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all")
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam, "all")
                bam["bam_number"] += 1
            if os.path.exists(self.bams["whole"]):
                os.remove(self.bams["whole"])
            out_depth = open(self.outputs["depth"] + bam["sample"], "w")
            call([args_snp.samtools_path, "index", out_bam])
            call([args_snp.samtools_path, "depth", out_bam], stdout=out_depth)
            out_depth.close()

    def _modify_header(self, fastas):
        """Normalize the headers of every fasta file in *fastas*."""
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        """Append the sequence names found in *bam*'s @SQ header lines
        to *seq_names* (mutated in place)."""
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        fh = open(self.header, "r")
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                # @SQ lines look like: @SQ  SN:<name>  LN:<len>
                if row[1].split(":")[1] not in seq_names:
                    seq_names.append(row[1].split(":")[1])
        fh.close()

    def _get_genome_name(self, args_snp, bam_datas):
        """Collect all genome (sequence) names referenced by the BAMs."""
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path, bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        """Remove the merged BAMs, their indexes and the depth files."""
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if os.path.exists(bam_file):
                os.remove(bam_file)
            if os.path.exists(bam_file + ".bai"):
                os.remove(bam_file + ".bai")
            if os.path.exists(self.header):
                os.remove(self.header)
            os.remove(self.outputs["depth"] + bam["sample"])

    def _extract_bams(self, bams):
        """Parse --bam_files entries of the form "sample:bam1,bam2,..."
        into dicts with sample name, replicate count and BAM paths."""
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    sys.exit()
            bam_datas.append({
                "sample": datas[0],
                "rep": len(datas[-1].split(",")),
                "bams": datas[-1].split(",")
            })
        return bam_datas

    def _merge_fasta(self, fastas):
        """Concatenate all fasta files into all.fa, skipping records
        whose header line was already seen. Returns the merged path."""
        all_fasta = os.path.join(fastas, "all.fa")
        names = []
        out = open(all_fasta, "w")
        print_ = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fa")) or (fasta.endswith(".fasta")) or (
                    fasta.endswith(".fna")):
                with open(os.path.join(fastas, fasta)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            # print_ stays set/cleared until the next
                            # header, so duplicate records are skipped
                            # entirely.
                            if line not in names:
                                print_ = True
                                names.append(line)
                            else:
                                print_ = False
                        if print_:
                            out.write(line + "\n")
        out.close()
        return all_fasta

    def run_snp_calling(self, args_snp):
        """Entry point: prepare fastas/BAMs, run the requested BAQ
        programs and clean up temporaries."""
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas)
        bam_datas = self._extract_bams(args_snp.bams)
        self._merge_bams(args_snp, bam_datas)
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ"
                not in args_snp.program) and ("extend_BAQ"
                                              not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        else:
            print("Detecting mutations now")
            self._run_program(all_fasta, bam_datas, args_snp)
            os.remove(self.outputs["tmp"])
            os.remove(all_fasta)
            os.remove(all_fasta + ".fai")
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
Beispiel #42
0
class Ribos(object):
    '''Detection of riboswitches and RNA thermometers.

    Candidate sequences are extracted from the annotation, scanned twice
    against Rfam covariance models with Infernal (cmscan), and merged into
    per-genome tables/gff files; see _core_prediction for the workflow.
    '''

    def __init__(self, args_ribo):
        # Store parsers/helpers and derive the per-genome "tmp" paths that
        # Multiparser produces for the input folders.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        if args_ribo.tsss is not None:
            self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # NOTE(review): run_ribos compares args_ribo.program with .lower(),
        # but these checks are case-sensitive — confirm the input is
        # normalized to lowercase before reaching here.
        if (args_ribo.program == "both") or (
                args_ribo.program == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                args_ribo.ribos_out_folder, "riboswitch",
                args_ribo.database)
        if (args_ribo.program == "both") or (
                args_ribo.program == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                args_ribo.thermo_out_folder, "RNA_thermometer",
                args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        '''Return the output folder paths, temporary file paths, Rfam model
        path and result-file suffixes for one feature type.'''
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {"fasta": os.path.join(
                              out_folder, "tmp_fasta"),
                     "scan": os.path.join(
                              out_folder, "tmp_scan"),
                     "table": os.path.join(
                              out_folder, "tmp_table")}
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {"csv": feature + ".csv",
                   "txt": feature + "_prescan.txt",
                   "re_txt": feature + "_scan.txt",
                   "re_csv": feature + "_scan.csv"}
        return (stat_folder, gff_outfolder, table_folder, scan_folder,
                tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files,
                    suffixs, rfam, log):
        '''Run Infernal cmscan on *seq* against the *rfam* models.

        The cutoff ("e_<value>" or "s_<value>") selects either an inclusion
        e-value (--incE) or a score threshold (--incT). The scan output is
        written to a "<prefix>_<suffix>" file whose path is returned.
        Exits if the cutoff has neither prefix.
        '''
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        scan = open(scan_file, "w")
        if args_ribo.cutoff.split("_")[0] == "e":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incE",
                      value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incE",
                  value, "--acc", rfam, seq], stdout=scan)
        elif args_ribo.cutoff.split("_")[0] == "s":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incT",
                      value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incT",
                  value, "--acc", rfam, seq], stdout=scan)
        else:
            print("Error: the --cutoff needs to start from 'e' "
                  "(e value) or 's' (score)!")
            log.write("the --cutoff needs to start from 'e' "
                      "(e value) or 's' (score).\n")
            sys.exit()
        scan.close()
        log.write("Done!\n")
        log.write("\t" + scan_file + " is temporary generated.\n")
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam, log):
        '''Extract candidate sequences per genome and scan them twice.

        First pass: extract potential RBS sequences and pre-scan them with
        cmscan. Second pass: regenerate the candidate sequences based on the
        pre-scan hits and scan again, then merge/clean the result tables.
        Appends each genome prefix to *prefixs* and returns it.
        '''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for {0}".format(
                      prefix))
                if self.tss_path is not None:
                    tss_file = os.path.join(self.tss_path, prefix + "_TSS.gff")
                else:
                    tss_file = None
                log.write("Running extract_RBS.py to extract potential "
                          "sequences of riboswitches/RNA thermometers for "
                          "{0}.\n".format(prefix))
                extract_potential_rbs(
                      os.path.join(self.fasta_path, prefix + ".fa"),
                      os.path.join(self.gff_path, gff), tss_file,
                      os.path.join(self.tran_path, prefix + "_transcript.gff"),
                      first_seq, args_ribo, feature)
                log.write("\t" + first_seq + " is temporary generated.\n")
                print("Pre-scanning of {0}".format(prefix))
                log.write("Using Infernal to pre-scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is at least 1.1.1.\n")
                first_scan_file = self._run_cmscan(
                        args_ribo, first_seq, "txt", prefix, tmp_files,
                        suffixs, rfam, log)
                sec_seq = os.path.join(tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                        tmp_files["table"],
                        "_".join([prefix, suffixs["csv"]]))
                log.write("Running recompute_RBS.py to update the potential "
                          "sequences of riboswitches/RNA thermometers for {0} "
                          "based on the pre-scanning results.\n".format(prefix))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                log.write("\t" + sec_seq + " is temporary generated.\n")
                print("Scanning of {0}".format(prefix))
                log.write("Using Infernal to scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is at "
                          "least 1.1.1.\n")
                sec_scan_file = self._run_cmscan(
                        args_ribo, sec_seq, "re_txt", prefix, tmp_files,
                        suffixs, rfam, log)
                sec_table = os.path.join(
                        tmp_files["table"],
                        "_".join([prefix, suffixs["re_csv"]]))
                log.write("Running recompute_RBS.py and modify_rbs_table.py "
                          "to generate tables for {0} "
                          "based on the scanning results.\n".format(prefix))
                reextract_rbs(sec_scan_file, first_table, sec_table,
                              args_ribo.cutoff)
                # the re-extracted table replaces the first-pass table
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id, gff_outfolder,
                       feature, log):
        '''Merge the per-genome scan results into the final tables, copy the
        raw scan files, and produce statistics plus gff output.'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                                              scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                log.write("Merging the results from Infernal to generate "
                          "tables for {0}.\n".format(prefix))
                # NOTE(review): this assumes the gff entries are grouped by
                # genome (seq_id); each new seq_id triggers one merge/copy.
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            self.helper.merge_file(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                log.write("The following files are generated.\n")
                for folder in (table_folder, scan_folder):
                    for file_ in os.listdir(folder):
                        log.write("\t" + os.path.join(folder, file_) + "\n")
                out_stat = os.path.join(
                        stat_folder,
                        "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                log.write("Running ribo_gff.py to do statistics and generate "
                          "gff files for {0}.\n".format(prefix))
                log.write("The following files are generated:\n")
                out_gff = os.path.join(gff_outfolder, "_".join([
                   prefix, feature + ".gff"]))
                stat_and_covert2gff(os.path.join(
                    table_folder, "_".join([prefix, suffixs["csv"]])),
                    feature_id, out_gff,
                    args_ribo.fuzzy, out_stat, feature)
                log.write("\t" + out_gff + "\n")
                log.write("\t" + out_stat + "\n")
                fh.close()

    def _remove_tmp(self, args_ribo):
        '''Delete the temporary per-genome folders of all input types.'''
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        # args_ribo.tsss may be None — presumably remove_tmp_dir tolerates
        # that; confirm against its implementation.
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs, type_, fuzzy, log):
        '''Drop overlapping riboswitch/thermometer hits from the temporary
        result tables (updates the tables in place).'''
        log.write("Running rbs_overlap.py to remove the overlapping "
                  "riboswitches/RNA thermometers.\n")
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                tmp_table = os.path.join(os.path.join(
                        tmp_files["table"], "_".join([
                        gff.replace(".gff", ""), suffixs["csv"]])))
                rbs_overlap(tmp_table,
                    os.path.join(gff_path, gff), type_, fuzzy)
                log.write("\t" + tmp_table + " is updated.\n")

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder, type_, log):
        '''Main detection workflow for one feature type:
        fetch/compress the Rfam models, extract and scan candidates,
        remove overlaps, merge results and map Rfam details back in.'''
        log.write("Running get_Rfam_ribo.py to get the information of "
                  "riboswitches/RNA thermometers from Rfam.\n")
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        log.write("Using Infernal to compress the Rfam data of "
                  "riboswitches/RNA thermometers.\n")
        log.write("Please make sure the version of Infernal is at least 1.1.1.\n")
        print("Compressing Rfam of " + feature)
        log.write(" ".join([args_ribo.cmpress_path, "-F", rfam]) + "\n")
        call([args_ribo.cmpress_path, "-F", rfam])
        log.write("Done!\n")
        prefixs = []
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(
                prefixs, args_ribo, tmp_files, suffixs, feature, rfam, log)
        self._remove_overlap(self.gff_path, tmp_files, suffixs, type_,
                             args_ribo.fuzzy, log)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature, log)
        log.write("Running map_ribos.py to extract all the details from Rfam.\n")
        mapping_ribos(table_folder, feature_id, feature)
        log.write("The following files are updated:\n")
        for file_ in os.listdir(table_folder):
            log.write("\t" + os.path.join(table_folder, file_) + "\n")
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo, log_t, log_r):
        '''Entry point: validate the inputs, split them per genome, and run
        the riboswitch and/or thermometer prediction.'''
        if args_ribo.fuzzy_rbs > 6:
            if log_t is not None:
                log_t.write("--fuzzy_rbs should be equal or less than 6!\n")
            if log_r is not None:
                log_r.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        if args_ribo.tsss is not None:
            self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_ribo.gffs, gff))
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            print("Detecting riboswtiches now")
            self._core_prediction(
                    args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                    self.ribos_tmp_files, self.ribos_table_folder,
                    "riboswitch", self.ribos_scan_folder, self.ribos_suffixs,
                    self.ribos_stat_folder, self.ribos_gff_outfolder,
                    args_ribo.ribos_out_folder, "riboswitch", log_r)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(
                    args_ribo, args_ribo.thermo_id, self.thermo_rfam,
                    self.thermo_tmp_files, self.thermo_table_folder,
                    "RNA_thermometer", self.thermo_scan_folder,
                    self.thermo_suffixs, self.thermo_stat_folder,
                    self.thermo_gff_outfolder, args_ribo.thermo_out_folder,
                    "thermometer", log_t)
        self._remove_tmp(args_ribo)
Beispiel #43
0
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''
    def __init__(self, args_tar):
        '''Set up the parsers/helpers and lay out every result folder.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        out_folder = args_tar.out_folder
        # result folders of the individual prediction tools
        self.target_seq_path = os.path.join(out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(out_folder, "RNAplex_results")
        self.rnaup_path = os.path.join(out_folder, "RNAup_results")
        self.intarna_path = os.path.join(out_folder, "IntaRNA_results")
        self.merge_path = os.path.join(out_folder, "merged_results")
        # per-genome "tmp" folders produced by the multiparser
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # names/globs of the temporary files used along the pipeline
        self.tmps = {"tmp": "tmp_srna_target",
                     "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa",
                     "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_long_id(self, seq_file, long_ids, type_):
        out_file = seq_file + "_tmp.fa"
        out = open(out_file, "w")
        with open(seq_file) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    if len(line) > 40:
                        long_ids[type_].append(line[1:])
                        out.write(">TMP" + type_ + "_" +
                                  str(len(long_ids[type_])) + "\n")
                    else:
                        out.write(line + "\n")
                else:
                    out.write(line + "\n")
        out.close()
        return out_file

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, long_ids, seq_path, prefix, out_path,
                       log):
        '''Run RNAplfold on the sRNA or the target sequences of *prefix*.

        RNAplfold writes its output files into the current working
        directory, hence the chdir to *out_path* around the calls; the
        original working directory is restored afterwards. Long fasta
        headers are shortened first via _check_long_id.

        Fix: removed a dead assignment to tar_seq_file that was immediately
        shadowed by the following for-loop.
        '''
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([
            rnaplfold_path, "-W",
            str(win_size), "-L",
            str(span), "-u",
            str(unstr_region), "-O"
        ])
        if file_type == "sRNA":
            srna_seq_file = os.path.join(
                current, seq_path,
                "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))
            out_file = self._check_long_id(srna_seq_file, long_ids, "srna")
            # RNAplfold reads the sequences from stdin ("command < file")
            log.write("<".join([command, out_file]) + "\n")
            os.system("<".join([command, out_file]))
        else:
            # target sequences are split over several chunk files
            # ("<prefix>_target_<n>.fa"); run RNAplfold on each of them
            for tar_seq_file in os.listdir(os.path.join(current, seq_path)):
                if (prefix + "_" + file_type + "_") in tar_seq_file:
                    out_file = self._check_long_id(
                        os.path.join(current, seq_path, tar_seq_file),
                        long_ids, "tar")
                    log.write("<".join([command, out_file]) + "\n")
                    os.system("<".join([command, out_file]))
        os.chdir(current)

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        out = open(
            os.path.join(path, "_".join([self.tmps["tmp"], prefix,
                                         "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        '''Extract the sequences of the query sRNAs into *srna_out*.

        Each query has the form "seq_id:start:end:strand". For every query
        the matching entry of the sRNA gff file (*srna_file*) is located and
        its sequence, cut out of the genome fasta *seq_file*, is appended to
        *srna_out* in fasta format. Exits with an error message if any query
        matches no sRNA entry.

        Improvement: the genome sequence is identical for every query, so it
        is now read once before the loop instead of once per query.
        '''
        seq = self._read_fasta(seq_file)
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            num = 0
            detect = False
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                            entry.start == srna["start"]) and (
                                entry.end == srna["end"]):
                    detect = True
                    # prefer the gff ID attribute; otherwise derive an id
                    # from the feature type and a running counter
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                        id_, entry.seq_id, entry.start, entry.end,
                        entry.strand, gene))
                    num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, target_prefixs, args_tar):
        '''Generate the fasta files of the potential targets and the sRNAs.

        The target sequences of every genome are extracted with
        potential_target() and split into chunk files of at most ~100
        records ("<prefix>_target_<n>.fa") so the prediction tools can run
        in parallel. Afterwards the sRNA sequences are extracted — all of
        them, or only the ones listed in args_tar.query. The genome prefixes
        are appended to *target_prefixs* and the sRNA prefixes to *prefixs*
        (both modified in place). Exits if no target feature was found.

        Fix: the first progress message wrongly said "sRNA" while the
        target fasta files are being generated.
        '''
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                target_prefixs.append(prefix)
        detect = False
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar,
                                 target_prefixs)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                if os.path.exists(sub_prefix + ".fa"):
                    sub_out = open(
                        "_".join([sub_prefix,
                                  str(file_num) + ".fa"]), "w")
                    with open((sub_prefix + ".fa"), "r") as t_f:
                        for line in t_f:
                            line = line.strip()
                            if line.startswith(">"):
                                num += 1
                            # rotate to a new chunk file every 100 records
                            if (num == 100):
                                num = 0
                                file_num += 1
                                sub_out.close()
                                sub_out = open(
                                    "_".join(
                                        [sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                            detect = True
                            sub_out.write(line + "\n")
                    sub_out.close()
                else:
                    # no target found for this genome; keep an empty file
                    open(sub_prefix + ".fa", "w").close()
        if not detect:
            print("No assigned features can be found. "
                  "Please check your genome annotation. "
                  "And assign correct features to --target_feature.")
            sys.exit()
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    # drop a stale output before appending the query hits
                    if "_".join([prefix,
                                 "sRNA.fa"]) in os.listdir(self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        '''Run RNAplex for every target chunk file of *prefix*.

        Starts up to args_tar.core_plex RNAplex processes in parallel; each
        writes "<prefix>_RNAplex_<n>.txt" into the RNAplex result folder.
        Returns the number of processes that were started.
        '''
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            # only the header-shortened chunk files produced by
            # _check_long_id (suffix ".fa_tmp.fa") are used as input
            if ("_target_" in seq) and (".fa_tmp.fa" in seq):
                print("Running RNAplex with {0}".format(
                    seq.replace(".fa_tmp.fa", "")))
                out_rnaplex = open(
                    os.path.join(
                        self.rnaplex_path, prefix, "_".join(
                            [prefix, "RNAplex",
                             str(num_process) + ".txt"])), "w")
                num_process += 1
                log.write(" ".join([
                    args_tar.rnaplex_path, "-q",
                    os.path.join(
                        self.srna_seq_path, "_".join([
                            self.tmps["tmp"], prefix, "sRNA.fa_tmp.fa"
                        ])), "-t",
                    os.path.join(self.target_seq_path, seq), "-l",
                    str(args_tar.inter_length), "-e",
                    str(args_tar.energy), "-z",
                    str(args_tar.duplex_dist), "-a", rnaplfold_folder
                ]) + "\n")
                p = Popen([
                    args_tar.rnaplex_path, "-q",
                    os.path.join(
                        self.srna_seq_path, "_".join([
                            self.tmps["tmp"], prefix, "sRNA.fa_tmp.fa"
                        ])), "-t",
                    os.path.join(self.target_seq_path, seq), "-l",
                    str(args_tar.inter_length), "-e",
                    str(args_tar.energy), "-z",
                    str(args_tar.duplex_dist), "-a", rnaplfold_folder
                ],
                          stdout=out_rnaplex)
                processes.append(p)
                # throttle: wait for the current batch before starting more
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write(
            "The following temporary files for storing results of {0} are "
            "generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(self.rnaplex_path, prefix, file_) +
                      "\n")
        return num_process

    def _restore_long_ids(self, rnaplex_file, long_ids):
        out = open(rnaplex_file + "tmp", "w")
        with open(rnaplex_file, "r") as t_f:
            for line in t_f:
                line = line.strip()
                if (line.startswith(">")):
                    if (line.startswith(">TMPtar_")):
                        header = long_ids["tar"][int(line.split("_")[1]) - 1]
                    elif (line.startswith(">TMPsrna_")):
                        header = long_ids["srna"][int(line.split("_")[1]) - 1]
                    else:
                        header = line[1:]
                    out.write(">" + header + "\n")
                else:
                    out.write(line + "\n")
        out.close()
        shutil.move(rnaplex_file + "tmp", rnaplex_file)

    def _rna_plex(self, prefixs, target_prefixs, args_tar, log):
        '''Full RNAplex pipeline: run RNAplfold on targets and sRNAs, run
        RNAplex per genome, merge/clean the result files.'''
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        # RNAplfold profiles of the targets are shared by all genomes; they
        # are computed once here and copied per genome below
        tmp_rnaplfold_folder = os.path.join(self.rnaplex_path, "tmp_RNAplfold")
        if os.path.exists(tmp_rnaplfold_folder):
            shutil.rmtree(tmp_rnaplfold_folder)
        os.mkdir(tmp_rnaplfold_folder)
        long_ids = {"tar": [], "srna": []}
        for prefix in target_prefixs:
            self._run_rnaplfold(args_tar.rnaplfold_path, "target",
                                args_tar.win_size_t, args_tar.span_t,
                                args_tar.unstr_region_rnaplex_t, long_ids,
                                self.target_seq_path, prefix,
                                tmp_rnaplfold_folder, log)
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            shutil.copytree(tmp_rnaplfold_folder, rnaplfold_folder)
            self._run_rnaplfold(args_tar.rnaplfold_path, "sRNA",
                                args_tar.win_size_s, args_tar.span_s,
                                args_tar.unstr_region_rnaplex_s, long_ids,
                                self.srna_seq_path, prefix, rnaplfold_folder,
                                log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar,
                                            log)
            # merge the per-process result chunks into one file per genome
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"])
                    in os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary files.\n")
                self.helper.merge_file(
                    os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex",
                                  str(index) + ".txt"])), rnaplex_file)
            # restore headers that were shortened for the ViennaRNA tools
            if (len(long_ids["tar"]) != 0) or (len(long_ids["srna"]) != 0):
                self._restore_long_ids(rnaplex_file, long_ids)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(
                os.path.join(self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        '''Start *num_up* RNAup subprocesses (one per prepared input chunk),
        wait for them, and merge their outputs/logs into *out_rnaup* and
        *out_log*. The temporary chunk files are removed afterwards.'''
        for index in range(1, num_up + 1):
            out_tmp_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])), "w")
            out_err = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])), "w")
            # RNAup reads the sequence pairs from stdin
            in_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["tmp"],
                                      str(index), ".fa"])), "r")
            log.write(" ".join([
                args_tar.rnaup_path, "-u",
                str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"
            ]) + "\n")
            p = Popen([
                args_tar.rnaup_path, "-u",
                str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"
            ],
                      stdin=in_up,
                      stdout=out_tmp_up,
                      stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write(
                "The following temporary files for storing results of {0} are "
                "generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(args_tar.out_folder, file_) +
                          "\n")
            # the patterns ("tmp*.fa"/"tmp*.txt") rely on shell globbing,
            # hence os.system instead of os.remove
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        """Append every per-process RNAup result/log chunk to the final files."""
        for idx in range(1, num_up + 1):
            suffix = str(idx) + ".txt"
            result_chunk = os.path.join(
                out_folder, "".join([self.tmps["rnaup"], suffix]))
            log_chunk = os.path.join(
                out_folder, "".join([self.tmps["log"], suffix]))
            self.helper.merge_file(result_chunk, out_rnaup)
            self.helper.merge_file(log_chunk, out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, target_prefixs, args_tar, log):
        """Run RNAup for every genome prefix.

        sRNA sequences are written into per-core batch files (tmp<i>.fa),
        each batch is merged with the target sequences and handed to one
        RNAup process; results accumulate in <prefix>_RNAup.txt. With
        args_tar.continue_rnaup, sRNAs finished in a previous run are
        skipped.
        """
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            start = False  # bug fix: was unbound if the file began oddly
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     prefix + "_RNAup.txt")
            out_log = os.path.join(self.rnaup_path, prefix,
                                   prefix + "_RNAup.log")
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            with open(
                    os.path.join(
                        self.srna_seq_path,
                        "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])),
                    "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            # Already finished in the previous run.
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(
                            os.path.join(
                                args_tar.out_folder,
                                "".join([self.tmps["tmp"],
                                         str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            # Bug fix: the inner loop used to rebind "prefix",
                            # clobbering the outer genome prefix for the rest
                            # of the iteration (wrong prefix in logs and in
                            # the final _run_rnaup call).
                            for tar_prefix in target_prefixs:
                                self.helper.merge_file(
                                    os.path.join(
                                        self.target_seq_path,
                                        "_".join([tar_prefix, "target.fa"])),
                                    os.path.join(
                                        args_tar.out_folder, "".join([
                                            self.tmps["tmp"],
                                            str(num_up), ".fa"
                                        ])))
                                # NOTE(review): dispatching inside the target
                                # loop can launch RNAup before every target
                                # file is merged; kept as in the original to
                                # preserve behavior — confirm intent.
                                if num_up == args_tar.core_up:
                                    self._run_rnaup(num_up, processes, prefix,
                                                    out_rnaup, out_log,
                                                    args_tar, log)
                                    processes = []
                                    num_up = 0
                self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                                args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup +
                      " is complete generated and updated.\n")

    def _intarna(self, prefixs, target_prefixs, args_tar, log):
        """Run IntaRNA of each genome prefix against the pooled targets."""
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write(
            "Please make sure the version of IntaRNA is at least 2.0.4.\n")
        # Pool every per-genome target FASTA into one file.
        all_target = os.path.join(self.target_seq_path, "all_target.fa")
        if os.path.exists(all_target):
            os.remove(all_target)
        for tar_prefix in target_prefixs:
            self.helper.merge_file(
                os.path.join(self.target_seq_path, tar_prefix + "_target.fa"),
                all_target)
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            out_dir = os.path.join(self.intarna_path, prefix)
            intarna_file = os.path.join(out_dir, prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(out_dir)
            srna_fasta = os.path.join(
                self.srna_seq_path,
                "_".join([self.tmps["tmp"], prefix, "sRNA.fa"]))
            command = [
                args_tar.intarna_path, "-q", srna_fasta,
                "-t", all_target,
                "--qAccW", str(args_tar.slide_win_srna),
                "--qAccL", str(args_tar.max_loop_srna),
                "--tAccW", str(args_tar.slide_win_target),
                "--tAccL", str(args_tar.max_loop_target),
                "--outMode", "C",
                "-m", args_tar.mode_intarna,
                "--threads", str(args_tar.core_inta),
                "--out", intarna_file]
            call(command)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, target_prefixs, args_tar, log):
        '''Merge the results of IntaRNA, RNAup and RNAplex.

        For every genome prefix the per-tool result files are de-duplicated
        (_remove_repeat), then merge_srna_target writes the per-tool ranked
        CSVs plus the merge/overlap tables under self.merge_path.
        '''
        log.write(
            "Running merge_rnaplex_rnaup.py to merge the results from "
            "RNAplex, RNAup, and IntaRNA for generating finanl output.\n")
        log.write("The following files are generated:\n")
        # Pool all target annotations into one GFF for the merge step.
        all_gff = os.path.join(self.gff_path, "all.gff")
        if os.path.exists(all_gff):
            os.remove(all_gff)
        for prefix in target_prefixs:
            self.helper.merge_file(
                os.path.join(self.gff_path, prefix + ".gff"), all_gff)
        for prefix in prefixs:
            # Per-tool paths stay None for tools that were not selected;
            # presumably merge_srna_target treats None as "tool disabled"
            # — confirm against its implementation.
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(self.merge_path,
                                                       prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(self.intarna_path, prefix,
                                            "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, intarna_file, args_tar, out_rnaplex,
                out_rnaup, out_intarna,
                os.path.join(self.fasta_path,
                             prefix + ".fa"), merge_file, overlap_file,
                os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])),
                all_gff, target_prefixs)
            # Log only the outputs that were actually produced.
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks, out_tmp,
                        print_):
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info, checks,
                      out_tmp, print_, tar):
        """Process one RNAup result line, dropping repeated interactions.

        State threaded through the caller's loop: ``pre`` previous line,
        ``num`` running line counter, ``srna_info`` header of the current
        sRNA, ``checks`` sRNA header -> target headers already seen,
        ``pre_num`` len(checks) when the last block was emitted, ``tar``
        True once the current block's target header has been written.
        Returns the updated state tuple.
        """
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                # Two consecutive headers: "pre" is the sRNA, "line" its target.
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                # Target header after result lines: same sRNA as before.
                # NOTE(review): num == 1 (very first line) is skipped here,
                # presumably to guard against a missing sRNA header — confirm.
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            # Result line: emit headers only when a new block started
            # (len(checks) grew since the last emitted block).
            if (print_):
                if (pre_num != len(checks)):
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        if (line.startswith(".")) or (line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        """Rewrite *interact_file* with duplicated interaction entries removed.

        type_ selects the per-line dedup parser ("RNAplex", "RNAup" or
        "IntaRNA"); the filtered content is written to a temporary file that
        replaces the original at the end.
        """
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        # "with" guarantees both handles are closed even when a parser raises
        # (the original leaked the output handle on errors).
        with open(interact_file) as fh, \
                open(interact_file + "tmp", "w") as out_tmp:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                        line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                        self._remove_rnaup(line, pre, num, pre_num, srna_info,
                                           checks, out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                        line, checks, tar, srna_info, seq, out_tmp)
        shutil.move(interact_file + "tmp", interact_file)

    def run_srna_target_prediction(self, args_tar, log):
        """Entry point: run the selected tools (RNAplex/RNAup/IntaRNA),
        merge and rank their results, then clean up all temporary files."""
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        target_prefixs = []
        self._gen_seq(prefixs, target_prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, target_prefixs, args_tar, log)
        self.helper.remove_all_content(self.target_seq_path, "_target_",
                                       "file")
        # Bug fix: tmp_RNAplfold only exists when RNAplex actually ran; the
        # unconditional rmtree crashed when RNAplex was not selected.
        rnaplfold_tmp = os.path.join(self.rnaplex_path, "tmp_RNAplfold")
        if os.path.exists(rnaplfold_tmp):
            shutil.rmtree(rnaplfold_tmp)
            log.write("The temporary files for running RNAplex "
                      "are deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, target_prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, target_prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, target_prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "dir")
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
        # Bug fix: all_target.fa is created only by the IntaRNA branch, so
        # guard the removal instead of crashing when IntaRNA was skipped.
        all_target = os.path.join(self.target_seq_path, "all_target.fa")
        if os.path.exists(all_target):
            os.remove(all_target)
Example #44
0
class PPINetwork(object):

    def __init__(self, out_folder):
        """Prepare the parser helpers and the output/temporary path layout."""
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        # Result directories under the main output folder.
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        # Sub-directory names separating strain-specific searches.
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        # Temporary file names/paths used while querying STRING and PIE.
        self.tmp_files = {
            "log": "tmp_log",
            "action": "tmp_action.log",
            "pubmed": "tmp_pubmed.log",
            "specific": os.path.join(out_folder, "tmp_specific"),
            "nospecific": os.path.join(out_folder, "tmp_nospecific"),
            "wget_action": os.path.join(out_folder, "tmp_action"),
        }

    def _make_folder_no_exist(self, path, folder):
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        """Download *source* to the path *folder* via wget, logging stderr."""
        cmd = ["wget", source, "-O", folder]
        call(cmd, stderr=log)
        # Pause between consecutive downloads to avoid hammering the server.
        time.sleep(1)

    def _wget_id(self, strain, locus, strain_id, files):
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                   locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                         locus, strain_id["string"])
            self._run_wget(id_source, os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        """Fetch the STRING ID for every candidate gene of this strain."""
        for gene in genes:
            found = self._wget_id(gene["strain"], gene["locus_tag"],
                                  strain_id, files)
            if not found:
                # Prints the whole gene dict for diagnostics.
                print("Error:there is no {0} in {1}".format(
                       gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        prefername = ""
        filename = row_a.split(".")
        if (filename[1] not in os.listdir(files["id_list"])) and (
                "all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t"
                  "STRING_action_score\tpubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file, first_output,
                    ptt, files, paths, args_ppi):
        """Fetch PIE/PubMed evidence for one STRING interaction and merge it.

        row: one action row from STRING (item_id_a, item_id_b, mode, ...).
        Both interactors are mapped to preferred names first; nothing is
        written when either mapping fails. Results always go to the
        strain-specific outputs and, with args_ppi.no_specific, also to the
        strain-agnostic ones.
        """
        prefer1 = self._get_prefer_name(row[0], strain_id,
                                        files, args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id,
                                        files, args_ppi.querys)
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                # Query PIE without the species restriction.
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source, self.tmp_files["nospecific"],
                               files["pubmed_log"])
            # URL-encode the species name by joining its words with "+".
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = (
                "http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                    prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            # Rewrite the row with the resolved names before merging.
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                     first_output, self.tmp_files["nospecific"],
                     files["all_nospecific"], files["best_nospecific"], row,
                     args_ppi.score, id_file, files["id_list"], "nospecific",
                     os.path.join(paths["all"], self.without_strain),
                     os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        if row == "NA":
            out_single.write("\t".join(
                             [ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
        else:
            out_single.write("\t".join(
                             [ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        """Append one interaction's PIE results to the per-pair and pooled CSVs.

        filename: the downloaded PIE result; empty when no PubMed evidence
        exists. Every row goes to the "all" outputs; rows whose score
        (presumably row[1] of the PIE output — confirm its format) reaches
        ``score`` also go to the "best" outputs. first_output tracks whether
        the pooled files still need their title header.
        """
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(os.path.join(
                all_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(os.path.join(
                best_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write("\t".join([ptt, "\t".join(row_a),
                                         "\t".join(row)]) + "\n")
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a, ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type, "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write("\t".join([ptt, "\t".join(row_a),
                                              "\t".join(row)]) + "\n")
            f_h.close()
            # Discard the per-pair "best" file when no row passed the cutoff.
            if not detect:
                os.remove(os.path.join(best_folder, ptt,
                          "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            # No PubMed evidence at all: record the pair with NA placeholders
            # in the "all" outputs only.
            out_all_single = open(os.path.join(
                all_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join([ptt, "\t".join(row_a),
                                     "NA", "NA"]) + "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1) and (row[0] != "Location") and (
                            name == strain) and (
                            start == row[0].split("..")[0]) and (
                            end == row[0].split("..")[1]) and (
                            strand == row[1]):
                        genes.append({"strain": name, "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        """Create the without-strain result folders and open their CSVs."""
        ptt = strain_id["ptt"]
        # Same all -> best -> fig creation order as the specific setup.
        for key in ("all", "best", "fig"):
            self._make_subfolder(paths[key], self.without_strain, ptt)
        filename_nostrain = "_".join([strain_id["file"].replace(".ptt", ""),
                                      self.without_strain + ".csv"])
        files["all_nospecific"] = open(
            os.path.join(paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(
            os.path.join(paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file,
                                    files, paths, args_ppi):
        """Create the per-strain result tree, open its files and read genes.

        On the first .ptt file of a strain the all/best/figure folders and
        pooled CSV/log handles are created; later files of the same strain
        only get their missing sub-folders. Returns the genes parsed from
        the .ptt file.

        NOTE(review): "genes" is only assigned when strain_id["file"] is a
        new file AND present in args_ppi.ptts — otherwise the final return
        raises UnboundLocalError. Also "pre_file" is rebound locally only,
        so the caller never sees the update — confirm both are intended.
        """
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder(
                     "_".join([self.tmp_id, strain_id["file"]]))
                paths["all"] = os.path.join(
                     self.all_result, strain_id["file"][:-4])
                paths["best"] = os.path.join(
                     self.best_result, strain_id["file"][:-4])
                paths["fig"] = os.path.join(
                     self.fig, strain_id["file"][:-4])
                self.helper.check_make_folder(
                     os.path.join(self.all_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                     os.path.join(self.best_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                     os.path.join(self.fig, strain_id["file"][:-4]))
                # with_strain sub-tree always exists; without_strain only
                # when the unrestricted search is requested.
                self._make_subfolder(
                     paths["all"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                     paths["best"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                     paths["fig"], self.with_strain, strain_id["ptt"])
                filename_strain = "_".join(
                     [strain_id["file"].replace(".ptt", ""),
                      self.with_strain + ".csv"])
                files["all_specific"] = open(os.path.join(
                                        paths["all"], filename_strain), "w")
                files["best_specific"] = open(os.path.join(
                                         paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id, strain_id["file"]])
                files["id_log"] = open(os.path.join(files["id_list"],
                                       self.tmp_files["log"]), "w")
                files["action_log"] = open(os.path.join(args_ppi.out_folder,
                                           self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(os.path.join(args_ppi.out_folder,
                                           self.tmp_files["pubmed"]), "w")
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                # Same strain as before: just ensure the ptt sub-folders.
                self._make_folder_no_exist(os.path.join(paths["all"],
                                           self.with_strain), strain_id["ptt"])
                self._make_folder_no_exist(os.path.join(paths["best"],
                                           self.with_strain), strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        '''Retrieve the STRING "actions" table for one protein ID file.

        Returns True if at least one non-header row was found in the ID
        list (i.e. the protein is known to STRING), False otherwise.
        '''
        detect = False
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
              id_file, strain_id["string"], strain_id["file"]))
        # "with" guarantees the handle is closed even if _run_wget raises
        # (the original called close() only on the success path)
        with open(os.path.join(files["id_list"], id_file), "r") as t_h:
            for row in csv.reader(t_h, delimiter="\t"):
                # skip the header line of the ID table
                if row[0].startswith("stringId"):
                    continue
                detect = True
                # only query STRING for IDs belonging to this species
                if row[1] == strain_id["string"]:
                    action_source = ("http://string-db.org/api/tsv/actions?"
                                     "identifier={0}&species={1}").format(
                                     row[0], row[1])
                    self._run_wget(
                        action_source, self.tmp_files["wget_action"],
                        files["action_log"])
                    break
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        '''Retrieve interaction actions and Pubmed data for every ID file.

        For each protein ID file, downloads the STRING action table and
        groups consecutive rows that share the same (item_a, item_b) pair,
        concatenating their modes/actors before querying Pubmed once per
        pair via self._get_pubmed.
        '''
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if not detect_id:
                    continue
                pre_row = []
                first = True
                detect = False
                # track which of the four output files still need a header
                first_output = {"specific_all": True,
                                "specific_best": True,
                                "nospecific_all": True,
                                "nospecific_best": True}
                print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                       id_file, strain_id["string"], strain_id["file"]))
                # The original opened the action file once per iteration but
                # closed only the final handle (and crashed with NameError
                # when the folder held no ID files); "with" closes each one.
                with open(self.tmp_files["wget_action"], "r") as a_h:
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            continue
                        detect = True
                        if first:
                            first = False
                            mode = row_a[2]
                            actor = row_a[4]
                        else:
                            if (row_a[0] != pre_row[0]) or (
                                    row_a[1] != pre_row[1]):
                                # new interaction pair: flush the modes and
                                # actors accumulated for the previous pair
                                self._get_pubmed(
                                    pre_row, strain_id, mode, actor,
                                    id_file, first_output,
                                    strain_id["ptt"], files, paths,
                                    args_ppi)
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                # same pair: accumulate mode/actor values
                                mode = mode + ";" + row_a[2]
                                actor = actor + ";" + row_a[4]
                        pre_row = row_a
                if detect:
                    detect = False
                    # flush the last interaction pair of the file
                    self._get_pubmed(
                        row_a, strain_id, mode, actor, id_file,
                        first_output, strain_id["ptt"], files,
                        paths, args_ppi)

    def _plot(self, args_ppi, files):
        '''Close the open result handles and draw the PPI figures.'''
        to_close = []
        if args_ppi.no_specific:
            to_close.extend(["all_nospecific", "best_nospecific"])
        to_close.extend(["all_specific", "best_specific"])
        for key in to_close:
            files[key].close()
        fig_folders = os.listdir(self.fig)
        for folder in os.listdir(self.all_result):
            if folder not in fig_folders:
                continue
            print("plotting {0}".format(folder))
            with_csv = os.path.join(
                self.all_result, folder,
                "_".join([folder, self.with_strain + ".csv"]))
            plot_ppi(with_csv, args_ppi.score,
                     os.path.join(self.fig, folder, self.with_strain),
                     args_ppi.size)
            if args_ppi.no_specific:
                without_csv = os.path.join(
                    self.all_result, folder,
                    "_".join([folder, self.without_strain + ".csv"]))
                plot_ppi(without_csv, args_ppi.score,
                         os.path.join(self.fig, folder,
                                      self.without_strain),
                         args_ppi.size)

    def _remove_tmps(self, args_ppi):
        '''Remove temporary files/folders and the generated PPI_ ptt/rnt.'''
        # os.path.join with a single argument was a no-op; pass the
        # folder directly
        self.helper.remove_all_content(args_ppi.out_folder, "tmp", "file")
        self.helper.remove_all_content(args_ppi.out_folder, "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            # PPI_* ptt/rnt files are generated by retrieve_ppi_network,
            # so they are safe to delete once the run finishes
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        '''Retrieve the protein-protein interaction network from STRING.

        Converts each gff to ptt/rnt, normalizes the STRING species ID,
        downloads IDs/actions, plots the results and cleans up.
        '''
        strain_ids = []
        paths = {}
        files = {}
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                           os.path.join(args_ppi.ptts, datas[0]),
                           "0", os.path.join(args_ppi.ptts, ptt_file),
                           os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({"file": ptt_file,
                               "ptt": datas[1],
                               "string": datas[2],
                               "pie": datas[3]})
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            # The species table maps several identifier columns to the
            # official STRING species ID; normalize strain_id["string"].
            # "with" closes the handle (the original leaked one per strain).
            with open(args_ppi.species, "r") as s_h:
                for row in csv.reader(s_h, delimiter="\t"):
                    if row[0] != "##":
                        if row[0] == strain_id["string"]:
                            break
                        elif row[2] == strain_id["string"]:
                            strain_id["string"] = row[0]
                            break
                        elif row[3] == strain_id["string"]:
                            strain_id["string"] = row[0]
                            break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
Beispiel #45
0
class Screen(object):
    '''generation of screenshot'''

    def __init__(self, args_sc, out_folder):
        self.helper = Helper()
        args_sc.output_folder = out_folder
        # strain name = fasta filename without its extension
        filename = os.path.basename(args_sc.fasta)
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(
            os.path.join(args_sc.output_folder, self.strain))
        # base paths of the per-strand IGV batch scripts (".txt" appended
        # when they are written by screenshot())
        self.forward_file = os.path.join(args_sc.output_folder, self.strain,
                                         "forward")
        self.reverse_file = os.path.join(args_sc.output_folder, self.strain,
                                         "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, lib_dict):
        '''Sort TEX+ libraries and their matching TEX- partners into
        lib_dict, keyed by strand ("f"/"r") and treatment ("t"/"n").'''
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(flib[0])
                # pair every tex library with the notex libraries sharing
                # the same condition and replicate fields
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(nlib[0])

    def _check_wig_name(self, lib_datas, log):
        # abort when a library entry does not reference a wiggle file
        if not lib_datas[0].endswith(".wig"):
            log.write("Wiggle files should end with .wig.\n")
            print("Error: Wiggle files should end with .wig!")
            sys.exit()

    def screenshot(self, args_sc, log):
        '''Generate the IGV batch scripts for both strands.'''
        # fail fast when no library was assigned at all; the original
        # performed this check only AFTER generating the batch scripts
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            log.write("No wig files can be found.\n")
            print("Error: There is no wig file assigned!")
            sys.exit()
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [], "ff": [], "rf": []}
        f_texs = []
        r_texs = []
        if args_sc.tlibs is not None:
            for lib in args_sc.tlibs:
                lib_datas = lib.split(":")
                self._check_wig_name(lib_datas, log)
                if lib_datas[-1] == "+":
                    f_texs.append(lib_datas)
                else:
                    r_texs.append(lib_datas)
            # sort by (treatment, condition, replicate) so tex/notex pairs
            # line up for _import_libs
            f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
            r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
            self._import_libs(f_texs, "+", lib_dict)
            self._import_libs(r_texs, "-", lib_dict)
        if args_sc.flibs is not None:
            for lib in args_sc.flibs:
                lib_datas = lib.split(":")
                self._check_wig_name(lib_datas, log)
                if lib_datas[-1] == "+":
                    lib_dict["ff"].append(lib_datas[0])
                else:
                    lib_dict["rf"].append(lib_datas[0])
        log.write("Running gen_screenshots.py to generate IGV batch script.\n")
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
        log.write("\t" + self.forward_file + ".txt is generated.\n")
        log.write("\t" + self.reverse_file + ".txt is generated.\n")
Beispiel #46
0
class TranscriptDetection(object):
    '''doing for transcript detection'''

    def __init__(self, args_tran):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # temporary file/folder names shared by the pipeline steps
        self.tmps = {
            "gff": "tmp.gff",
            "merge": "tmp_merge",
            "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
            "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
            "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
            "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
            "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
            "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
            "overlap": os.path.join(self.gff_outfolder, "tmp_overlap")
        }
        # suffixes of the per-strain output gff files
        self.frag = "transcript_fragment.gff"
        self.tex = "transcript_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type, strain,
                            libs, args_tran):
        '''Run transcript detection for one strain on one wig type.'''
        print("Computing transcript for {0}".format(strain))
        out = os.path.join(args_tran.out_folder, "_".join([strain, wig_type]))
        detect_transcript(wig_f, wig_r, wig_folder, libs, out, wig_type,
                          args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        '''Detect transcripts for every strain found in the wig folder.'''
        strains = []
        wig_folder = os.path.join(wigs, "tmp")
        # every strain is expected to provide a forward and a reverse wig
        for wig in os.listdir(wig_folder):
            if wig.endswith("_forward.wig"):
                strains.append(wig.replace("_forward.wig", ""))
        for strain in strains:
            f_file = os.path.join(wig_folder, "_".join([strain,
                                                        "forward.wig"]))
            r_file = os.path.join(wig_folder, "_".join([strain,
                                                        "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type, strain,
                                     libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran):
        '''Compare transcripts with TSSs and update both gff files.'''
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
            self.gff_outfolder, os.path.join(args_tran.compare_tss, "tmp"),
            "transcript", "TSS")
        print("Comparing of Transcript and TSS file")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_TSS_", ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    # replace the originals with the annotated tmp versions
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["tss_ta"],
                        os.path.join(args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])

    def _compare_cds(self, tas, args_tran):
        '''Compare transcripts with the genome annotation (CDS etc.).'''
        self.multiparser.parser_gff(args_tran.gffs, None)
        self.multiparser.combine_gff(self.gff_outfolder,
                                     os.path.join(args_tran.gffs, "tmp"),
                                     "transcript", None)
        print("Comparing of Transcript and genome annotation")
        cds_folder = os.path.join(args_tran.gffs, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_genome_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    # NOTE(review): the annotation is read from the tmp
                    # subfolder but removed/rewritten in args_tran.gffs --
                    # confirm this asymmetry is intended
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.gffs, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(self.tmps["gff_ta"],
                                         os.path.join(args_tran.gffs, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])

    def _compare_tss_cds(self, tas, args_tran):
        '''compare transcript with CDS and TSS'''
        if (args_tran.compare_tss is not None) and (args_tran.c_feature
                                                    is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
            self._compare_tss(tas, args_tran)
        elif (args_tran.c_feature
              is not None) and (args_tran.compare_tss is None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
        elif (args_tran.c_feature is None) and (args_tran.compare_tss
                                                is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_tss(tas, args_tran)

    def _for_one_wig(self, type_, args_tran):
        '''running transcript detection to one type of wig files'''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Computing {0} wig files".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            out = os.path.join(
                self.gff_outfolder,
                "_".join([strain, "transcript", type_ + ".gff"]))
            self.helper.sort_gff(
                os.path.join(args_tran.out_folder, "_".join([strain, type_])),
                out)
            os.remove(
                os.path.join(args_tran.out_folder, "_".join([strain, type_])))
        return strains

    def _for_two_wigs(self, strains, args_tran):
        '''merge the results of fragemented and tex treated libs'''
        if (args_tran.frag_wigs is not None) and (args_tran.tex_wigs
                                                  is not None):
            print("Merging fragment and tex treat one")
            for strain in strains:
                frag_gff = os.path.join(self.gff_outfolder,
                                        "_".join([strain, self.frag]))
                tex_gff = os.path.join(self.gff_outfolder,
                                       "_".join([strain, self.tex]))
                final_gff = os.path.join(self.gff_outfolder,
                                         "_".join([strain, self.endfix_tran]))
                for gff in os.listdir(self.gff_outfolder):
                    if "_transcript_" in gff:
                        filename = gff.split("_transcript_")
                        if (strain == filename[0]) and ("tex_notex.gff"
                                                        == filename[1]):
                            tex_file = gff
                        elif (strain == filename[0]) and ("fragment.gff"
                                                          == filename[1]):
                            frag_file = gff
                # final_gff is the same path the original recomputed inline
                combine(
                    os.path.join(self.gff_outfolder, frag_file),
                    os.path.join(self.gff_outfolder, tex_file),
                    args_tran.tolerance, final_gff)
                os.remove(frag_gff)
                os.remove(tex_gff)
        else:
            # only one library type exists; its result is the final one
            if args_tran.frag_wigs is not None:
                for strain in strains:
                    frag_gff = os.path.join(self.gff_outfolder,
                                            "_".join([strain, self.frag]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(frag_gff, final_gff)
            elif args_tran.tex_wigs is not None:
                for strain in strains:
                    tex_gff = os.path.join(self.gff_outfolder,
                                           "_".join([strain, self.tex]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(tex_gff, final_gff)

    def _post_modify(self, tas, args_tran):
        '''modify the transcript by comparing with genome annotation'''
        for ta in tas:
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} referring to {1}".format(ta, gff))
            fill_gap(
                os.path.join(args_tran.gffs, gff),
                os.path.join(self.tran_path, "_".join([ta, self.endfix_tran])),
                "overlap", self.tmps["overlap"])
            fill_gap(
                os.path.join(args_tran.gffs, gff),
                os.path.join(self.tran_path, "_".join([ta, self.endfix_tran])),
                "uni", self.tmps["uni"])
            tmp_merge = os.path.join(self.gff_outfolder, self.tmps["merge"])
            # remove a stale merge file before merge_file appends to it.
            # The original tested `self.tmps["merge"] in self.gff_outfolder`,
            # a substring check on the path string that was always False.
            if os.path.exists(tmp_merge):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder, "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            # merge transcripts that are closer than args_tran.length
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(
                final_out,
                os.path.join(self.tmps["tran"],
                             "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        # replace the working folder by the folder of modified transcripts
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        '''Remove the temporary folders/files of this run.'''
        if "tmp_wig" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "tmp_wig"))
        self.helper.remove_tmp_dir(args_tran.gffs)
        self.helper.remove_tmp_dir(args_tran.compare_tss)
        self.helper.remove_tmp_dir(args_tran.terms)
        # self.gff_outfolder IS os.path.join(args_tran.out_folder, "gffs");
        # the original called remove_tmp twice on the same folder
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran):
        '''searching the associated terminator to transcript'''
        if args_tran.terms is not None:
            print("Comparing between terminators and transcripts")
            self.multiparser.parser_gff(args_tran.terms, "term")
            if args_tran.gffs is not None:
                self.multiparser.combine_gff(
                    args_tran.gffs, os.path.join(args_tran.terms, "tmp"), None,
                    "term")
            compare_term_tran(self.gff_outfolder,
                              os.path.join(args_tran.terms, "tmp"),
                              args_tran.fuzzy_term, args_tran.fuzzy_term,
                              args_tran.out_folder, "transcript",
                              args_tran.terms, self.gff_outfolder)

    def run_transcript(self, args_tran):
        '''Entry point: detect, merge, modify and compare transcripts.'''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            print("Error: There is no wigs files!!!!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran)
        tas = []
        if args_tran.gffs is not None:
            # sort the annotation files in place before comparing
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(os.path.join(args_tran.gffs, gff),
                                         self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(args_tran.gffs,
                                         os.path.join(args_tran.gffs, "tmp"),
                                         None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path, None,
                                         "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff"):
                    # skip empty transcript files
                    if os.path.getsize(os.path.join(self.tran_path, ta)) != 0:
                        tas.append(ta.replace("_" + self.endfix_tran, ""))
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran)
        self._compare_term_tran(args_tran)
        print("Generating table for the details")
        gen_table_transcript(self.gff_outfolder, args_tran)
        plot_tran(self.gff_outfolder, self.stat_path, args_tran.max_dist)
        self._remove_file(args_tran)
Beispiel #47
0
class sRNADetection(object):
    '''detection of sRNA'''
    def __init__(self, args_srna):
        '''Set up helpers and all paths used by the sRNA detection.'''
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.gff_output = os.path.join(args_srna.out_folder, "gffs")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        # the *_path attributes stay None when the corresponding input
        # folder was not assigned
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs")
        # prefixes of the intermediate gff/table files
        self.prefixs = {
            "merge": os.path.join(args_srna.out_folder, "tmp_merge"),
            "utr": os.path.join(args_srna.out_folder, "tmp_utrsrna"),
            "normal": os.path.join(args_srna.out_folder, "tmp_normal"),
            "in_cds": os.path.join(args_srna.out_folder, "tmp_incds"),
            "merge_table": os.path.join(args_srna.out_folder,
                                        "tmp_merge_table"),
            "utr_table": os.path.join(args_srna.out_folder,
                                      "tmp_utrsrna_table"),
            "normal_table": os.path.join(args_srna.out_folder,
                                         "tmp_normal_table"),
            "in_cds_table": os.path.join(args_srna.out_folder,
                                         "tmp_incds_table"),
            "basic": os.path.join(args_srna.out_folder, "tmp_basic"),
            "energy": os.path.join(args_srna.out_folder, "tmp_energy")
        }
        self.tmps = {
            "nr": os.path.join(args_srna.out_folder, "tmp_nr"),
            "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")
        }
        self.best_table = os.path.join(self.table_output, "best")
        # (the original re-assigned table_output and stat_path here with
        # identical values; the duplicates were removed)
        self.all_best = {
            "all_gff": os.path.join(self.gff_output, "all_candidates"),
            "best_gff": os.path.join(self.gff_output, "best"),
            "all_table": os.path.join(self.table_output, "all_candidates"),
            "best_table": os.path.join(self.table_output, "best")
        }

    def _check_folder_exist(self, folder):
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        '''Validate the attributes of every gff file in the folder.'''
        gff_names = (name for name in os.listdir(gffs)
                     if name.endswith(".gff"))
        for name in gff_names:
            self.helper.check_uni_attributes(os.path.join(gffs, name))

    def _run_format(self, blast_path, database, type_, db_file, err):
        '''Build a BLAST database from the given fasta file.'''
        cmd = [os.path.join(blast_path, "makeblastdb"),
               "-in", database,
               "-dbtype", type_,
               "-out", db_file]
        call(cmd, stderr=err)

    def _formatdb(self, database, type_, out_folder, blast_path,
                  database_type):
        '''Resolve the database fasta file and format it for BLAST.'''
        fasta_exts = (".fa", ".fna", ".fasta")
        if not database.endswith(fasta_exts):
            # the database was given without a fasta extension; look for
            # the matching fasta file in its folder
            folders = database.split("/")
            filename = folders[-1]
            folder = "/".join(folders[:-1])
            for fasta in os.listdir(folder):
                if fasta.endswith(fasta_exts):
                    if ".".join(fasta.split(".")[:-1]) == filename:
                        database = os.path.join(folder, fasta)
        if database_type == "sRNA":
            # sRNA databases need their fasta headers converted first
            change_format(database, "tmp_srna_database")
            os.remove(database)
            shutil.move("tmp_srna_database", database)
        db_file = ".".join(database.split(".")[:-1])
        # "with" closes the log handle even if makeblastdb fails (the
        # original leaked it on exception)
        with open(os.path.join(out_folder, "log.txt"), "w") as err:
            self._run_format(blast_path, database, type_, db_file, err)

    def _merge_frag_tex_file(self, files, args_srna):
        '''merge the results of fragmented and tex treated libs'''
        has_frag = args_srna.frag_wigs is not None
        has_tex = args_srna.tex_wigs is not None
        if has_frag and has_tex:
            # append the fragmented results to the tex-treated ones, then
            # promote the tex files to the merged outputs
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            for key in ("frag_csv", "frag_gff", "tex_gff"):
                os.remove(files[key])
        elif has_frag:
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif has_tex:
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _read_lib_wig(self, args_srna):
        '''Load the library settings and the wiggle data of both strands.

        Returns [libs, texs, forward_wigs, reverse_wigs].
        '''
        libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
        forward = read_wig(args_srna.wig_f_file, "+", libs)
        reverse = read_wig(args_srna.wig_r_file, "-", libs)
        return [libs, texs, forward, reverse]

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        '''detection of intergenic and antisense sRNA'''
        # wiggle data for each library type; stay None when the matching
        # wig folder was not assigned (returned to the caller)
        tex_datas = None
        frag_datas = None
        # remove the leftover cutoff file of a previous run
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {
            "frag_gff": None,
            "frag_csv": None,
            "tex_gff": None,
            "tex_csv": None,
            "merge_gff": None,
            "merge_csv": None
        }
        # pick the TSS / processing-site files of this strain, if assigned
        if self.tss_path is not None:
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(self.pro_path,
                                               "_processing.gff", prefix, None,
                                               None)
        else:
            pro = None
        # run intergenic sRNA detection once per library type; note that
        # container_intersrna rebinds args_srna with per-type settings
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(args_srna.out_folder,
                                             "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "frag", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            frag_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, frag_datas[0], frag_datas[1],
                            frag_datas[2], frag_datas[3])
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(args_srna.out_folder,
                                            "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "tex", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            tex_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, tex_datas[0], tex_datas[1],
                            tex_datas[2], tex_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        # when TSS re-classification was performed, return the re-classified
        # TSS file instead of the original one
        if ("TSS_class" in os.listdir(
                args_srna.out_folder)) and (not args_srna.tss_source):
            tss = os.path.join(args_srna.out_folder, "TSS_class",
                               prefix + "_TSS.gff")
        return tss, frag_datas, tex_datas

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna, frag_datas,
                     tex_datas):
        '''detection of UTR-derived sRNA'''
        # Drop the stale median statistics left over from a previous genome.
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        # Intermediate per-library outputs; entries stay None when the
        # corresponding wiggle set was not supplied.
        files = {
            "frag_gff": None,
            "frag_csv": None,
            "tex_gff": None,
            "tex_csv": None,
            "merge_gff": None,
            "merge_csv": None
        }
        if args_srna.tex_wigs is not None:
            # TEX +/- treated libraries.
            files["tex_gff"] = os.path.join(args_srna.out_folder,
                                            "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"), "tex", prefix,
                args_srna)
            utr_derived_srna(args_srna, tex_datas[0], tex_datas[1],
                             tex_datas[2], tex_datas[3])
        if args_srna.frag_wigs is not None:
            # Fragmented (conventional RNA-seq) libraries.
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"), "frag", prefix,
                args_srna)
            utr_derived_srna(args_srna, frag_datas[0], frag_datas[1],
                             frag_datas[2], frag_datas[3])
        # Merge the per-library results and drop UTR-derived candidates
        # expressed below the min_utr coverage cutoff.
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        """Validate required inputs and pre-parse the optional annotations.

        Exits when annotation gffs, transcripts, or both wiggle sets are
        missing, or when UTR-derived detection is requested without TSS
        files.  Optional inputs (TSS, processing sites, sORFs, fastas,
        terminators) are parsed and combined per genome when provided.
        """
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
            (args_srna.tex_wigs is None) and (args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            # UTR-derived detection requires TSS; processing sites only
            # improve the result, so their absence is just a warning.
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path, None,
                                         "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path, None,
                                         "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path, None,
                                         "sORF")
        if args_srna.import_info is not None:
            # Fasta sequences are only required when a filter that needs
            # sequence data (2D structure, nr/sRNA blast, UTR) is active.
            if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
                    args_srna.nr_database
                    is not None) or (args_srna.srna_database is not None):
                if args_srna.fastas is None:
                    print("Error: lack required fasta files for UTR "
                          "derived sRNA detection!!!!")
                    sys.exit()
                self.multiparser.parser_fasta(args_srna.fastas)
                self.multiparser.combine_fasta(args_srna.gffs, self.fasta_path,
                                               None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path, None,
                                         "term")
        else:
            self.term_path = None

    def _merge_tex_frag_datas(self, tex_datas, frag_datas):
        if (tex_datas is not None) and (frag_datas is not None):
            for index in [2, 3]:
                for strain, conds in frag_datas[index].items():
                    if strain not in tex_datas[index].keys():
                        tex_datas[index][strain] = conds
                    else:
                        for cond, tracks in conds.items():
                            tex_datas[index][strain][cond] = tracks
        elif (tex_datas is None) and (frag_datas is not None):
            tex_datas = frag_datas
        return tex_datas

    def _run_program(self, args_srna):
        """Detect sRNAs for every genome found in the annotation folder.

        For each *.gff genome: run inter-genic detection, optionally
        UTR-derived detection, merge the fragmented/TEX library results
        and sort the merged gff.  Returns the list of genome prefixes.
        """
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(self.tran_path,
                                                    "_transcript.gff", prefix,
                                                    None, None)
                # Output file names for the merged / UTR / inter-genic sets.
                gffs = {
                    "merge": "_".join([self.prefixs["merge"], prefix]),
                    "utr": "_".join([self.prefixs["utr"], prefix]),
                    "normal": "_".join([self.prefixs["normal"], prefix])
                }
                csvs = {
                    "merge": "_".join([self.prefixs["merge_table"], prefix]),
                    "utr": "_".join([self.prefixs["utr_table"], prefix]),
                    "normal": "_".join([self.prefixs["normal_table"], prefix])
                }
                # Inter-genic detection; also yields the parsed coverage
                # data of the fragmented and TEX libraries for reuse below.
                tss, frag_datas, tex_datas = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of {0}".format(
                        prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff", prefix, None,
                            None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix, tss, pro,
                                          args_srna, frag_datas, tex_datas)
                # The coverage data can be large; fold the fragmented copy
                # into the TEX data and free it immediately afterwards.
                tex_datas = self._merge_tex_frag_datas(tex_datas, frag_datas)
                del frag_datas
                gc.collect()
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss,
                                 tex_datas)
                del tex_datas
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"], prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss,
                    tex_datas):
        """Combine normal and UTR-derived candidates into one gff + table."""
        print("merging data of sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs,
                         tex_datas[2], tex_datas[3],
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        os.system(" ".join([
            "cat", seq_file, "|",
            os.path.join(vienna_path, "RNAfold"), "-p", ">", sec_file
        ]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path, dot_path,
                     vienna_path):
        '''extract the sec str energy

        Folds the candidate sequences with RNAfold and extracts the
        folding energies.  RNAfold writes its plots into the current
        working directory, so this runs inside a scratch folder; the
        process is left chdir'ed there and the caller restores the cwd.
        Returns absolute "sec"/"dot"/"main"/"tmp" paths for the plotting
        steps.
        '''
        # Locate the genome fasta matching this prefix; abort if absent.
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        # Work from a scratch folder; make all paths absolute first since
        # the cwd changes below.
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(
            os.path.join(main_path, "_".join([self.prefixs["basic"], prefix])),
            sec_file,
            os.path.join(main_path, "_".join([self.prefixs["energy"],
                                              prefix])))
        # "|" in sequence ids is unsafe in file names; rename the plots.
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {
            "sec": tmp_sec_path,
            "dot": tmp_dot_path,
            "main": main_path,
            "tmp": os.path.join(main_path, tmp_path)
        }

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file):
        """Run Vienna's relplot.pl on a structure/dot-plot pair."""
        tmp = tmp_paths["tmp"]
        command = " ".join([os.path.join(vienna_util, "relplot.pl"),
                            os.path.join(tmp, file_),
                            os.path.join(tmp, dot_file),
                            ">", os.path.join(tmp, rel_file)])
        os.system(command)

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_), pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths, ps2pdf14_path,
                           prefix):
        """Re-plot the secondary structures and convert the plots to PDF."""
        # First pass: relplot every *ss.ps in the current working directory.
        for name in os.listdir(os.getcwd()):
            if not name.endswith("ss.ps"):
                continue
            print("replot {0}".format(name))
            self._run_replot(vienna_util, tmp_paths, name,
                             name.replace("ss.ps", "dp.ps"),
                             name.replace("ss.ps", "rss.ps"))
        # Second pass: convert replotted and dot-plot postscripts to PDF.
        for name in os.listdir(tmp_paths["tmp"]):
            if name.endswith("rss.ps") or name.endswith("dp.ps"):
                print("convert {0} to pdf".format(name))
                self._convert_pdf(ps2pdf14_path, tmp_paths, name,
                                  name.replace(".ps", ".pdf"))
        # Sort the PDFs into per-genome sec/dot output folders.
        sec_dir = os.path.join(tmp_paths["sec"], prefix)
        dot_dir = os.path.join(tmp_paths["dot"], prefix)
        os.mkdir(sec_dir)
        os.mkdir(dot_dir)
        self.helper.move_all_content(tmp_paths["tmp"], sec_dir, ["rss.pdf"])
        self.helper.move_all_content(tmp_paths["tmp"], dot_dir, ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        """Run Vienna's mountain.pl on a dot plot, streaming stdout to out."""
        script = os.path.join(vienna_util, "mountain.pl")
        call([script, os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix,
                       vienna_util):
        """Generate a mountain plot for every dot plot, when requested."""
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    # mountain.pl output is captured to a temporary text
                    # file, plotted to PDF, then cleaned up.
                    moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths, dot_file, out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix, moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        """Compute secondary structure and folding energy per genome.

        Clears previous structure/mountain outputs, then for every genome:
        fold the candidate sequences, replot/convert the figures,
        optionally draw mountain plots, and fold the computed energies
        back into the basic sRNA file.
        """
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            # _get_seq_sec leaves the cwd inside the scratch folder.
            tmp_paths = self._get_seq_sec(self.fasta_path,
                                          args_srna.out_folder, prefix,
                                          sec_path, dot_path,
                                          args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path, tmp_paths,
                                prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            # Restore the original working directory before touching the
            # relative output paths below.
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        call([
            os.path.join(blast_path, program), "-db", database, "-evalue",
            str(e), "-strand", strand, "-query", seq_file, "-out", blast_file
        ])

    def _get_strand_fasta(self, seq_file, out_folder):
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        out_p = open(tmp_plus, "w")
        out_m = open(tmp_minus, "w")
        strand = ""
        with open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        out_p.close()
        out_m.close()
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna, prefixs,
               program, database_type, e):
        """Blast the candidate sRNAs of every genome against database.

        Optionally formats the database first, runs the requested BLAST
        program per genome, and filters the candidate file by the hits.
        For the protein "nr" database the sequences are blasted per strand
        (blastx) and the two result files are merged.
        """
        if database is None:
            print("Error: No database assigned!")
            return
        if database_format:
            self._formatdb(database, data_type, args_srna.out_folder,
                           args_srna.blast_path, database_type)
        for prefix in prefixs:
            blast_file = os.path.join(
                args_srna.out_folder, "blast_result_and_misc",
                "_".join([database_type, "blast", prefix + ".txt"]))
            srna_file = "_".join([self.prefixs["basic"], prefix])
            out_file = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp", database_type, prefix]))
            print("Running Blast of {0} in {1}".format(prefix, database))
            seq_name = "_".join(["sRNA_seq", prefix])
            seq_file = os.path.join(args_srna.out_folder, seq_name)
            # Fix: compare the basename (not the full path) against the
            # directory listing.  The original test could never match, so
            # an existing sequence file was never reused.
            if seq_name not in os.listdir(args_srna.out_folder):
                self.helper.get_seq(
                    srna_file,
                    os.path.join(self.fasta_path, prefix + ".fa"), seq_file)
            if database_type == "nr":
                # nr is a protein database: blast each strand separately
                # and merge the two result files.
                tmp_plus, tmp_minus = self._get_strand_fasta(
                    seq_file, args_srna.out_folder)
                tmp_blast = "tmp_blast.txt"
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_plus, tmp_blast, "plus")
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_minus, blast_file, "minus")
                self.helper.merge_file(tmp_blast, blast_file)
                os.remove(tmp_blast)
                os.remove(tmp_plus)
                os.remove(tmp_minus)
            else:
                self._run_blast(args_srna.blast_path, program, database, e,
                                seq_file, blast_file, "both")
            # Keep only candidates supported/annotated by the blast hits.
            extract_blast(blast_file, srna_file, out_file,
                          out_file + ".csv", database_type)
            shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''classify the sRNA based on the filters'''
        # Classification only makes sense when at least one filter or
        # comparison source was supplied.
        if (args_srna.import_info
                is not None) or (args_srna.srna_database is not None) or (
                    args_srna.nr_database
                    is not None) or (self.sorf_path is not None) or (
                        self.tss_path
                        is not None) or (self.term_path is not None) or (
                            args_srna.promoter_table is not None):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(
                    class_table, prefix))
                self.helper.check_make_folder(os.path.join(class_gff, prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                # NOTE(review): the two calls below recreate the folders
                # just created above -- looks redundant; confirm
                # check_make_folder is idempotent before removing them.
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path,
                    "_".join(["stat_sRNA_class", prefix + ".csv"]))
                # Split the candidates into per-class gff files and write
                # the classification statistics.
                classify_srna(
                    os.path.join(self.all_best["all_gff"],
                                 "_".join([prefix, "sRNA.gff"])), class_gff,
                    out_stat, args_srna)
                # Rebuild a csv table for every generated class gff.
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(class_table,
                                             srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table, self.term_path)

    def _get_best_result(self, prefixs, args_srna):
        '''get the best results based on the filters'''
        for prefix in prefixs:
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            # Select the candidates passing all filters, then rebuild the
            # matching table for the selected set.
            gen_best_srna(all_gff, best_gff, args_srna)
            gen_srna_table(best_gff,
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, best_table, self.term_path)

    def _remove_file(self, args_srna):
        """Clean up every temporary file and folder created during the run."""
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        self.helper.remove_tmp(self.gff_output)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        # merge_wigs only exists when both library types were provided.
        if (args_srna.frag_wigs is not None) and (args_srna.tex_wigs
                                                  is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        # term_path is set to None by _check_necessary_file when no
        # terminator files were given.
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        '''set the filter of sRNA'''
        # Secondary-structure (folding energy) filter.
        if args_srna.import_info is not None:
            if "sec_str" in args_srna.import_info:
                self._compute_2d_and_energy(args_srna, prefixs)
        # Homology filter against the nr protein database (blastx).
        if args_srna.nr_database is not None:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        # Overlap comparison with predicted sORFs; the comparison result
        # replaces the basic sRNA file of the genome.
        if self.sorf_path is not None:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"])
                        in os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])), tmp_srna,
                        tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))
        # Homology filter against known sRNA databases (blastn).
        if args_srna.srna_database is not None:
            self._blast(args_srna.srna_database, args_srna.srna_format, "nucl",
                        args_srna, prefixs, "blastn", "sRNA", args_srna.e_srna)

    def _import_info_format(self, import_info):
        new_info = []
        for info in import_info:
            info = info.lower()
            new_info.append(info)
        return new_info

    def _gen_table(self, prefixs, args_srna):
        """Generate the table of all sRNA candidates for every genome."""
        for prefix in prefixs:
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(all_gff,
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table, self.term_path)

    def _print_rank_all(self, prefixs):
        """Rank the candidates of every genome in the all/best tables."""
        for prefix in prefixs:
            table_name = "_".join([prefix, "sRNA.csv"])
            print_rank_all(
                os.path.join(self.all_best["all_table"], table_name),
                os.path.join(self.all_best["best_table"], table_name))

    def _filter_min_utr(self, prefixs, min_utr):
        '''filter out the low expressed UTR-derived sRNA'''
        for prefix in prefixs:
            gff_path = os.path.join(self.all_best["all_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            table_path = os.path.join(self.all_best["all_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            filter_utr(gff_path, table_path, min_utr)

    def _antisense(self, gffs, prefixs):
        '''detection of antisense'''
        # Run the detection on both the full and the best candidate sets.
        for prefix in prefixs:
            genome_gff = os.path.join(gffs, prefix + ".gff")
            gff_name = "_".join([prefix, "sRNA.gff"])
            table_name = "_".join([prefix, "sRNA.csv"])
            srna_antisense(
                os.path.join(self.all_best["all_gff"], gff_name),
                os.path.join(self.all_best["all_table"], table_name),
                genome_gff)
            srna_antisense(
                os.path.join(self.all_best["best_gff"], gff_name),
                os.path.join(self.all_best["best_table"], table_name),
                genome_gff)

    def _blast_stat(self, stat_path, srna_tables):
        '''do statistics for blast result'''
        # Fix: blast_class was called AFTER the loop, so only the last
        # listed table was processed and an empty "best" folder raised
        # NameError; it now runs once per table.
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path, "stat_" + srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        '''compare sRNA with terminator and promoter'''
        # Associate candidates with terminators inside the fuzzy ranges.
        if self.term_path is not None:
            compare_srna_term(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])), out_table,
                os.path.join(self.term_path, "_".join([prefix, "term.gff"])),
                args_srna.fuzzy_b, args_srna.fuzzy_a)
        # Associate candidates with promoters from the promoter table.
        if (args_srna.promoter_table is not None):
            compare_srna_promoter(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])), out_table,
                args_srna)

    def run_srna_detection(self, args_srna):
        """Entry point: run the complete sRNA detection pipeline.

        Validates inputs, detects candidates per genome, applies the
        requested filters, generates all/class/best outputs and their
        rankings, and finally removes the temporary files.
        """
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path, None,
                                     "transcript")
        if args_srna.import_info is not None:
            args_srna.import_info = self._import_info_format(
                args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            # Publish the filtered candidates, then annotate them with
            # terminator/promoter associations.
            shutil.copyfile(
                "_".join([self.prefixs["basic"], prefix]),
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]), prefix,
                args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        # Fix: guard import_info against None before the membership test;
        # previously a sRNA database without --import_info raised TypeError.
        if (args_srna.srna_database is not None) and (
                args_srna.import_info is not None):
            if "blast_srna" in args_srna.import_info:
                self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
Beispiel #48
0
class PPINetwork(object):
    '''Detection of protein-protein interaction (PPI) networks.

    Queries the STRING database for protein IDs and interactions, then
    queries NCBI PIE for supporting Pubmed literature, writes per-strain
    result tables (all vs. best-scoring) and draws network figures.
    '''
    def __init__(self, out_folder):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        # Prefix for the per-strain folders holding downloaded STRING IDs.
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        # Sub-folder names for strain-specific vs. strain-agnostic results.
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        # Scratch files; "log"/"action"/"pubmed" are bare names (joined with
        # other folders later), the rest are full paths under out_folder.
        self.tmp_files = {
            "log": "tmp_log",
            "action": "tmp_action.log",
            "pubmed": "tmp_pubmed.log",
            "specific": os.path.join(out_folder, "tmp_specific"),
            "nospecific": os.path.join(out_folder, "tmp_nospecific"),
            "wget_action": os.path.join(out_folder, "tmp_action")
        }

    def _make_folder_no_exist(self, path, folder):
        # Create path/folder only if it does not already exist.
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        # Create the two-level path/strain/ptt folder hierarchy.
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        '''Download source to the file "folder" via wget; stderr goes to log.'''
        call(["wget", source, "-O", folder], stderr=log)
        # Throttle remote requests to be polite to the web services.
        time.sleep(2)

    def _wget_id(self, strain, locus, strain_id, files):
        '''Resolve one locus tag to a STRING ID; returns True if queried.'''
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                             locus, strain_id["string"])
            self._run_wget(id_source, os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        '''Download STRING IDs for all query genes of this strain.'''
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                    gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        '''Map a STRING item ID back to its preferred protein name.

        row_a looks like "<taxid>.<locus>"; the part after the first dot is
        the locus used as the downloaded ID-list filename. Downloads the ID
        file on demand unless the query was "all" (then all IDs are
        expected to be present already).
        '''
        prefername = ""
        filename = row_a.split(".")
        if (filename[1] not in os.listdir(
                files["id_list"])) and ("all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            # Column 3 of the STRING resolve table holds the preferred name.
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        '''Write the per-protein table header ("Interaction of ...").'''
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t"
                  "STRING_action_score\tpubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file, first_output,
                    ptt, files, paths, args_ppi):
        '''Query PIE for Pubmed support of one interaction and merge results.

        Queries once with the species name appended ("specific") and, if
        requested, once without it ("nospecific"); then merges both result
        sets into the all/best tables.
        '''
        prefer1 = self._get_prefer_name(row[0], strain_id, files,
                                        args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id, files,
                                        args_ppi.querys)
        # Only proceed when both interaction partners could be resolved.
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source, self.tmp_files["nospecific"],
                               files["pubmed_log"])
            # URL-encode spaces in the species term with "+".
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = ("http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                             "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                                 prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            # Rewrite the STRING action row in place with the merged
            # mode/actor strings and the preferred names.
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                    first_output, self.tmp_files["nospecific"],
                    files["all_nospecific"], files["best_nospecific"], row,
                    args_ppi.score, id_file, files["id_list"], "nospecific",
                    os.path.join(paths["all"], self.without_strain),
                    os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        # row == "NA" marks an interaction with no Pubmed evidence.
        if row == "NA":
            out_single.write("\t".join([ptt, "\t".join(row_a), "NA", "NA"]) +
                             "\n")
        else:
            out_single.write(
                "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        '''Merge one PIE result file into the aggregate and per-pair tables.

        Writes every Pubmed hit to the "all" outputs and only hits whose
        score passes the cutoff to the "best" outputs; an empty result file
        produces a single NA row in the "all" outputs.
        '''
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(
                os.path.join(best_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                # first_output flags ensure the aggregate files get their
                # header exactly once per protein.
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write(
                    "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n")
                # NOTE(review): row[1] is presumably the PIE pubmed score --
                # confirm against the PIE output format.
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a, ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type, "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write(
                        "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) +
                        "\n")
            f_h.close()
            # No hit passed the cutoff: drop the (header-only) best file.
            if not detect:
                os.remove(
                    os.path.join(best_folder, ptt,
                                 "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join([ptt, "\t".join(row_a), "NA", "NA"]) +
                          "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        '''Collect the query genes (strain + locus_tag) from the .ptt file.

        NOTE(review): "name" is taken from the most recent .ptt section
        header line; if the file starts without one, "name" is unbound.
        '''
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            # A single-column row containing "-" and ".." is a section
            # header such as "Name - 1..4641652"; remember the strain name.
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                # Queries are "strain:start:end:strand" strings; keep only
                # the .ptt entries matching one of them exactly.
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1
                        ) and (row[0] != "Location") and (name == strain) and (
                            start == row[0].split("..")[0]) and (
                                end == row[0].split("..")[1]) and (strand
                                                                   == row[1]):
                        genes.append({"strain": name, "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        '''Create folders/files for the strain-agnostic ("nospecific") run.'''
        self._make_subfolder(paths["all"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["best"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["fig"], self.without_strain,
                             strain_id["ptt"])
        filename_nostrain = "_".join([
            strain_id["file"].replace(".ptt", ""), self.without_strain + ".csv"
        ])
        files["all_nospecific"] = open(
            os.path.join(paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(
            os.path.join(paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file, files, paths,
                                    args_ppi):
        '''Create the output tree for one strain and read its query genes.

        NOTE(review): "genes" is only assigned in the first branch when the
        .ptt file exists in args_ppi.ptts; in the "strain_id['file'] ==
        pre_file" branch it is never assigned, so the final "return genes"
        would raise UnboundLocalError there -- verify against callers.
        '''
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder("_".join(
                    [self.tmp_id, strain_id["file"]]))
                paths["all"] = os.path.join(self.all_result,
                                            strain_id["file"][:-4])
                paths["best"] = os.path.join(self.best_result,
                                             strain_id["file"][:-4])
                paths["fig"] = os.path.join(self.fig, strain_id["file"][:-4])
                self.helper.check_make_folder(
                    os.path.join(self.all_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.best_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.fig, strain_id["file"][:-4]))
                self._make_subfolder(paths["all"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["best"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["fig"], self.with_strain,
                                     strain_id["ptt"])
                filename_strain = "_".join([
                    strain_id["file"].replace(".ptt", ""),
                    self.with_strain + ".csv"
                ])
                files["all_specific"] = open(
                    os.path.join(paths["all"], filename_strain), "w")
                files["best_specific"] = open(
                    os.path.join(paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id, strain_id["file"]])
                files["id_log"] = open(
                    os.path.join(files["id_list"], self.tmp_files["log"]), "w")
                files["action_log"] = open(
                    os.path.join(args_ppi.out_folder,
                                 self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(
                    os.path.join(args_ppi.out_folder,
                                 self.tmp_files["pubmed"]), "w")
                # NOTE(review): this rebinds only the local name; the
                # caller's pre_file is not updated by this assignment.
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                self._make_folder_no_exist(
                    os.path.join(paths["all"], self.with_strain),
                    strain_id["ptt"])
                self._make_folder_no_exist(
                    os.path.join(paths["best"], self.with_strain),
                    strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        '''Download the STRING "actions" table for one resolved protein ID.'''
        detect = False
        t_h = open(os.path.join(files["id_list"], id_file), "r")
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
            id_file, strain_id["string"], strain_id["file"]))
        for row in csv.reader(t_h, delimiter="\t"):
            # Skip the header row of the resolve table.
            if row[0].startswith("stringId"):
                continue
            else:
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = ("http://string-db.org/api/tsv/actions?"
                                     "identifier={0}&species={1}").format(
                                         row[0], row[1])
                    self._run_wget(action_source,
                                   self.tmp_files["wget_action"],
                                   files["action_log"])
                    break
        t_h.close()
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        '''Get the interactions of proteins and their Pubmed support.

        Consecutive action rows for the same protein pair are collapsed by
        joining their mode/actor columns with ";" before the Pubmed query.
        NOTE(review): a_h is (re)opened once per id_file but closed only
        once, after the loop -- earlier handles are leaked.
        '''
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if detect_id:
                    a_h = open(self.tmp_files["wget_action"], "r")
                    pre_row = []
                    first = True
                    detect = False
                    first_output = {
                        "specific_all": True,
                        "specific_best": True,
                        "nospecific_all": True,
                        "nospecific_best": True
                    }
                    print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                        id_file, strain_id["string"], strain_id["file"]))
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            continue
                        else:
                            detect = True
                            if first:
                                first = False
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                # Pair changed: flush the previous pair with
                                # its accumulated mode/actor strings.
                                if (row_a[0] != pre_row[0]) or (row_a[1] !=
                                                                pre_row[1]):
                                    self._get_pubmed(pre_row, strain_id, mode,
                                                     actor, id_file,
                                                     first_output,
                                                     strain_id["ptt"], files,
                                                     paths, args_ppi)
                                    mode = row_a[2]
                                    actor = row_a[4]
                                else:
                                    mode = mode + ";" + row_a[2]
                                    actor = actor + ";" + row_a[4]
                            pre_row = row_a
                    # Flush the final pair of the file.
                    if detect:
                        detect = False
                        self._get_pubmed(row_a, strain_id, mode, actor,
                                         id_file, first_output,
                                         strain_id["ptt"], files, paths,
                                         args_ppi)
        if detect_id:
            a_h.close()

    def _plot(self, args_ppi, files):
        '''Close the aggregate tables and draw a network figure per strain.'''
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(
                    os.path.join(self.all_result, folder,
                                 "_".join([folder,
                                           self.with_strain + ".csv"])),
                    args_ppi.score,
                    os.path.join(self.fig, folder,
                                 self.with_strain), args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(
                        os.path.join(
                            self.all_result, folder,
                            "_".join([folder, self.without_strain + ".csv"])),
                        args_ppi.score,
                        os.path.join(self.fig, folder,
                                     self.without_strain), args_ppi.size)

    def _remove_tmps(self, args_ppi):
        '''Delete all temporary files/folders and the generated PPI_* files.'''
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        '''Retrieve PPI from STRING with PIE and draw the network.

        Each strain spec is "gff:ptt_name:string_species:pie_species"; the
        gff is first converted to .ptt/.rnt, then IDs, actions and Pubmed
        evidence are retrieved per strain and finally plotted.
        '''
        strain_ids = []
        paths = {}
        files = {}
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                os.path.join(args_ppi.ptts, datas[0]), "0",
                os.path.join(args_ppi.ptts, ptt_file),
                os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({
                "file": ptt_file,
                "ptt": datas[1],
                "string": datas[2],
                "pie": datas[3]
            })
        strain_ids.sort(key=lambda x: x["file"])
        # NOTE(review): pre_file is never updated after the loop below, so
        # _setup_folder_and_read_file always takes its first-time branch.
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            # Map the user-supplied species identifier (name or alias in
            # columns 2/3) to the canonical STRING taxon ID in column 0.
            s_h = open(args_ppi.species, "r")
            for row in csv.reader(s_h, delimiter="\t"):
                if row[0] != "##":
                    if row[0] == strain_id["string"]:
                        break
                    elif row[2] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
                    elif row[3] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
Beispiel #49
0
class MEME(object):
    '''detection of promoter'''

    def __init__(self, args_pro):
        '''Set up working paths and the per-TSS-class fasta filenames.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        # "tmp" subfolders are where the multiparser places per-genome files.
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # One fasta per TSS class. Note: "all_no_orph" and "all" are bare
        # filenames (joined with tmp_folder later); the others are full paths.
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")}
        # Merged genome-wide files used when args_pro.source is True.
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME and/or GLAM2 with one specific motif width'''
        folder = "_".join(["promoter_motifs", filename,
                           str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                            out_path, folder, "MEME")
            # Fixed-width MEME search; -evt caps reported motif E-values.
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                # Parallel run via MPI (-p <np>).
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                      meme_folder, folder),
                      os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                            out_path, folder, "GLAM2")
            # GLAM2: -a/-b bound the aligned columns (both set to width
            # here); the trailing "n" is the nucleotide-alphabet argument.
            log.write(" ".join([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-w",
                  str(width), "-b", str(width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-w",
                  str(width), "-b", str(width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME and/or GLAM2 with a "min-max" range of motif widths'''
        # width is given as "min-max", e.g. "2-10".
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                            out_path, folder, "MEME")
            # Range-width MEME search via -minw/-maxw.
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                # Parallel run via MPI (-p <np>).
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                      meme_folder, folder),
                      os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                            out_path, folder, "GLAM2")
            # GLAM2: -a/-b are the min/max aligned columns; the trailing
            # "n" is the nucleotide-alphabet argument.
            log.write(" ".join([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-a",
                  str(min_width), "-b", str(max_width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-a",
                  str(min_width), "-b", str(max_width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Combine the per-class fastas into "all"/"without orphan" files
        and move every class fasta into input_path under its final name.'''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"])
        # Remove leftovers from a previous run before rebuilding them.
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa = primary + secondary + internal + antisense;
        # tmp_all = tmp_fa + orphan.
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        # Deduplicate the merged sequences into the final combined files.
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"], "_".join([
            out_prefix, "allgenome_primary.fa"]))
        shutil.move(self.fastas["sec"], "_".join([
            out_prefix, "allgenome_secondary.fa"]))
        shutil.move(self.fastas["inter"], "_".join([
            out_prefix, "allgenome_internal.fa"]))
        shutil.move(self.fastas["anti"], "_".join([
            out_prefix, "allgenome_antisense.fa"]))
        shutil.move(self.fastas["orph"], "_".join([
            out_prefix, "allgenome_orphan.fa"]))
        shutil.move(all_type, "_".join([
            out_prefix, "allgenome_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allgenome_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        '''Split each "allgenome" fasta into one file per strain.

        Headers are assumed to look like ">x_y_strainname" with the strain
        in the fields from index 2 on (underscore-joined) -- TODO confirm
        against the fasta writer. Per-strain files whose source held only a
        single strain are removed again (the allgenome file suffices).

        NOTE(review): the final out.close() raises AttributeError when
        input_path contains no .fa file at all (out stays None).
        '''
        # Keep only the combined "allgenome" fastas as input.
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            # New strain encountered: switch output file.
                            if (pre_strain != strain):
                                num_strain += 1
                                filename = fasta.split("allgenome")
                                if out is not None:
                                    out.close()
                                # Open in append mode since headers of the
                                # same strain may not be consecutive.
                                out = open(os.path.join(
                                           input_path, "".join([
                                               filename[0], strain,
                                               filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain == 1:
                    # Only one strain: the split file duplicates allgenome.
                    os.remove(os.path.join(input_path,
                              "".join([filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        '''Run MEME/GLAM2 for every genome prefix, fasta class and width.'''
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            # One result subfolder per selected tool.
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                # Run only the fasta classes the user asked for; the
                # two-word classes need the joined last two name fields.
                if (names[-1] in input_fastas) or (
                        ("_".join(names[-2:]) == "all_types") and (
                         "all_types" in input_fastas)) or (
                        ("_".join(names[-2:]) == "without_orphan") and (
                         "without_orphan" in input_fastas)):
                    for width in args_pro.widths:
                        print("Computing promoters of {0} - {1}".format(
                              fasta, width))
                        log.write("Computing promoters of {0} - length {1}.\n".format(
                                  fasta, width))
                        # "min-max" widths go to the range runner; plain
                        # numbers to the fixed-width runner.
                        if "-" in width:
                            self._run_small_motif(input_path, out_path, filename,
                                                  fasta, width, args_pro, log)
                        else:
                            self._run_normal_motif(input_path, out_path, filename,
                                                   fasta, width, args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are generated "
                      "and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''Merge all TSS gff files and all fasta files into single
        "allfasta" inputs so that a global promoter search can be run in
        addition to the per-strain ones.  Appends "allfasta" to *prefixs*
        (mutates the caller's list) and extracts upstream sequences for
        the merged input.'''
        if args_pro.source:
            # TSS files taken directly from the parsed TSS folder.
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                         self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                         args_pro.fastas, fasta), self.all_fasta)
        else:
            # NOTE(review): this branch lists the re-classified TSS files in
            # "TSS_classes" but still merges the file of the same name from
            # self.tss_path — confirm whether the classified files were
            # intended to be merged instead.
            for tss in os.listdir(os.path.join(
                                  args_pro.output_folder, "TSS_classes")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                         self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                         args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
                                      args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
                                      self.out_fasta, "allfasta"))
        # The merged input has no per-strain classification, so treat it
        # as "source" TSS data when extracting upstream sequences.
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None,
                 None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        """Delete the temporary folders produced during the promoter run."""
        for folder in (args_pro.fastas, args_pro.tsss, args_pro.gffs):
            self.helper.remove_tmp_dir(folder)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        # Leftover working folders created in the current directory.
        for leftover in ("allfasta", "tmp"):
            if leftover in os.listdir(os.getcwd()):
                shutil.rmtree(leftover)

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''Generate the promoter table (csv) from every MEME/GLAM2 result
        folder of every strain; useful downstream for sRNA prediction.'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        # When the combined run was requested, the merged "allfasta"
        # results are converted as well.
        strains = prefixs + ["allfasta"] if combine else list(prefixs)
        wanted = program.lower()
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            for tool, tag in (("MEME", "meme"), ("GLAM2", "glam2")):
                if (wanted != "both") and (wanted != tag):
                    continue
                tool_dir = os.path.join(output_folder, strain, tool)
                for folder in os.listdir(tool_dir):
                    csv_file = os.path.join(tool_dir, folder, tag + ".csv")
                    gen_promoter_table(
                        os.path.join(tool_dir, folder, tag + ".txt"),
                        csv_file, tss_file, tag)
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''Extract the upstream sequences of the TSSs of one strain.

        With library/source information the TSS file is used as-is;
        otherwise the TSSs are first re-classified against the genome
        annotation and the classified gff is written to "TSS_classes".
        '''
        tss_file = os.path.join(self.tss_path, tss)
        fasta_file = os.path.join(args_pro.fastas, fasta)
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(tss_file, fasta_file, None, None, args_pro, prefix)
        else:
            # Re-classification needs the genome annotation.
            if args_pro.gffs is None:
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_classes"))
            print("Classifying TSSs and extracting sequence of {0}".format(prefix))
            upstream(tss_file, fasta_file,
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) == 1:
                input_fastas.append("all_types")
            elif int(tss) == 2:
                input_fastas.append("primary")
            elif int(tss) == 3:
                input_fastas.append("secondary")
            elif int(tss) == 4:
                input_fastas.append("internal")
            elif int(tss) == 5:
                input_fastas.append("antisense")
            elif int(tss) == 6:
                input_fastas.append("orphan")
            elif int(tss) == 7:
                input_fastas.append("without_orphan")
            else:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
        return input_fastas

    def run_meme(self, args_pro, log):
        """Entry point of the promoter-motif pipeline.

        Prepares the split fasta/TSS/gff inputs, extracts the upstream
        sequences of every strain's TSSs, optionally builds a combined
        "allfasta" input, runs MEME/GLAM2, generates the promoter tables
        and finally removes the temporary files.
        """
        # Remove leftovers of a previous combined run before re-parsing.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                              "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        # Per-strain upstream-sequence extraction.
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(args_pro.output_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta,
                                                       prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            # Also build the merged "allfasta" input (appends to prefixs).
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(self.out_fasta, "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
Beispiel #50
0
class sRNATargetPrediction(object):
    '''Detection of sRNA-target interaction via RNAplex and/or RNAup.'''

    def __init__(self, args_tar):
        # Helper objects shared by all pipeline steps.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        # Output folders for the individual stages.
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder,
                                         "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        # "tmp" sub-folders created by Multiparser for the split inputs.
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # Names/globs of temporary working files; removed at the end.
        self.tmps = {
            "tmp": "tmp_srna_target",
            "rnaup": "tmp_rnaup",
            "log": "tmp_log",
            "all_fa": "tmp*.fa",
            "all_txt": "tmp*.txt"
        }

    def _check_gff(self, gffs):
        # Validate that every .gff file in the folder has unique attributes.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path):
        """Run RNAplfold on the sRNA or target fasta of one strain.

        RNAplfold writes its profile files into the current working
        directory, hence the chdir into *out_path* and back.
        """
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([
            rnaplfold_path, "-W",
            str(win_size), "-L",
            str(span), "-u",
            str(unstr_region), "-O"
        ])
        # The input fasta is fed via shell stdin redirection ("command < file").
        if file_type == "sRNA":
            os.system("<".join([
                command,
                os.path.join(
                    current, seq_path,
                    "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))
            ]))
        else:
            os.system("<".join([
                command,
                os.path.join(current, seq_path,
                             "_".join([prefix, file_type + ".fa"]))
            ]))
        os.chdir(current)

    def _wait_process(self, processes):
        # Wait for every child process, close its pipes and make sure it
        # is gone (kill raises OSError if it already exited — ignored).
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        """Rewrite the sRNA fasta sorted by ascending sequence length.

        The output header keeps only the part of the original header
        before the first "|".  Assumes one sequence per line (no
        multi-line fasta records) — the sequence builder does not
        concatenate continuation lines.
        """
        out = open(
            os.path.join(path, "_".join([self.tmps["tmp"], prefix,
                                         "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        # Concatenate all non-header lines into one sequence string.
        # Note: if the file contains several records, their sequences are
        # merged into a single string.
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        """Extract the sequences of user-selected sRNAs only.

        Each query has the form "seq_id:start:end:strand".  The whole
        gff file is scanned per query; the program exits if a query
        matches no entry.
        """
        for query in querys:
            srna_datas = query.split(":")
            srna = {
                "seq_id": srna_datas[0],
                "strand": srna_datas[3],
                "start": int(srna_datas[1]),
                "end": int(srna_datas[2])
            }
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                            entry.start == srna["start"]) and (entry.end
                                                               == srna["end"]):
                    detect = True
                    # Prefer the gff ID attribute; fall back to feature+index.
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                        id_, entry.seq_id, entry.start, entry.end,
                        entry.strand, gene))
                    num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        """Generate the sRNA and target fasta files for every strain.

        Fills *prefixs* (mutates the caller's list) with the strain
        prefixes found in the sRNA gff folder.  The target fasta is split
        into chunks of 100 sequences ("<prefix>_target_<n>.fa") so the
        RNAplex runs can be parallelised.
        """
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    # All sRNAs requested: extract every sequence.
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    # Specific queries: start from a fresh output file
                    # because _get_specific_seq appends.
                    if "_".join([prefix,
                                 "sRNA.fa"]) in os.listdir(self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                # Split "<prefix>_target.fa" into 100-sequence chunks.
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
                            num += 1
                        if (num == 100):
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open(
                                "_".join([sub_prefix,
                                          str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar):
        """Run RNAplex for one strain, one process per target chunk.

        Processes are started in batches of args_tar.core_plex.  Returns
        the number of processes started (= number of partial result files).
        """
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(
                    os.path.join(
                        self.rnaplex_path, prefix, "_".join(
                            [prefix, "RNAplex",
                             str(num_process) + ".txt"])), "w")
                num_process += 1
                p = Popen([
                    args_tar.rnaplex_path, "-q",
                    os.path.join(
                        self.srna_seq_path, "_".join(
                            [self.tmps["tmp"], prefix, "sRNA.fa"])), "-t",
                    os.path.join(self.target_seq_path, seq), "-l",
                    str(args_tar.inter_length), "-e",
                    str(args_tar.energy), "-z",
                    str(args_tar.duplex_dist), "-a", rnaplfold_folder
                ],
                          stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        return num_process

    def _rna_plex(self, prefixs, args_tar):
        """RNAplex stage: RNAplfold profiles, RNAplex runs, result merge."""
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            os.mkdir(rnaplfold_folder)
            # Accessibility profiles for both interaction partners.
            self._run_rnaplfold(args_tar.rnaplfold_path, "sRNA",
                                args_tar.win_size_s, args_tar.span_s,
                                args_tar.unstr_region_rnaplex_s,
                                self.srna_seq_path, prefix, rnaplfold_folder)
            self._run_rnaplfold(args_tar.rnaplfold_path, "target",
                                args_tar.win_size_t, args_tar.span_t,
                                args_tar.unstr_region_rnaplex_t,
                                self.target_seq_path, prefix, rnaplfold_folder)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            # Start from a clean combined file before merging the parts.
            if ("_".join([prefix, "RNAplex.txt"])
                    in os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                self.helper.merge_file(
                    os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex",
                                  str(index) + ".txt"])), rnaplex_file)
            self.helper.remove_all_content(
                os.path.join(self.rnaplex_path, prefix), "_RNAplex_", "file")
            # fix_rnaplex writes the cleaned result to the temp file, which
            # then replaces the combined file.
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar):
        """Start one RNAup process per prepared tmp fasta and collect results.

        The tmp fasta files tmp_srna_target<i>.fa (sRNA followed by all
        targets) are fed to RNAup via stdin; results/logs are merged and
        the tmp files removed afterwards.
        """
        for index in range(1, num_up + 1):
            out_tmp_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])), "w")
            out_err = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])), "w")
            in_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["tmp"],
                                      str(index), ".fa"])), "r")
            p = Popen([
                args_tar.rnaup_path, "-u",
                str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"
            ],
                      stdin=in_up,
                      stdout=out_tmp_up,
                      stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        # Append every per-process RNAup result/log file to the combined ones.
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])), out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])), out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run.
        Reads the previous result file, drops the (possibly incomplete)
        last sRNA block, rewrites the file without it and returns the
        names of the sRNAs that are already done.'''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        # The last sRNA may have been interrupted mid-run; redo it.
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar):
        """RNAup stage: build per-sRNA input files and run RNAup in batches.

        For every sRNA (skipping those already finished when
        continue_rnaup is set) a tmp fasta of sRNA + all targets is
        written; RNAup processes are launched in batches of
        args_tar.core_up.
        """
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            # Either restart from scratch or resume a previous run.
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    srnas = self._get_continue(out_rnaup)
            with open(
                    os.path.join(
                        self.srna_seq_path,
                        "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])),
                    "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        # Skip sRNAs finished in a previous run.
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(
                            os.path.join(
                                args_tar.out_folder,
                                "".join([self.tmps["tmp"],
                                         str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            # Sequence line: complete the tmp fasta with
                            # all target sequences of this strain.
                            out_up.write(line + "\n")
                            out_up.close()
                            self.helper.merge_file(
                                os.path.join(self.target_seq_path,
                                             "_".join([prefix, "target.fa"])),
                                os.path.join(
                                    args_tar.out_folder, "".join(
                                        [self.tmps["tmp"],
                                         str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, out_rnaup,
                                                out_log, args_tar)
                                processes = []
                                num_up = 0
            # Run the remaining (incomplete) batch.
            self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar)

    def _merge_rnaplex_rnaup(self, prefixs, args_tar):
        '''Merge and rank the results of RNAup and RNAplex per strain.'''
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            self.helper.check_make_folder(os.path.join(self.merge_path,
                                                       prefix))
            print("Ranking {0} now".format(prefix))
            if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
            if (args_tar.program == "both") or (args_tar.program == "RNAup"):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, args_tar, out_rnaplex, out_rnaup,
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "merge.csv"])),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "overlap.csv"])),
                os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.gff_path, prefix + ".gff"))

    def run_srna_target_prediction(self, args_tar):
        """Entry point: run the whole sRNA-target prediction pipeline."""
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
            self._rna_plex(prefixs, args_tar)
        self.helper.remove_all_content(self.target_seq_path, "_target_",
                                       "file")
        #        if (args_tar.program == "RNAplex") or (
        #                args_tar.program == "both"):
        #            for strain in os.listdir(os.path.join(
        #                          args_tar.out_folder, "RNAplex_results")):
        #                shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex_results",
        #                                           strain, "RNAplfold"))
        if (args_tar.program == "both") or (args_tar.program == "RNAup"):
            self._rnaup(prefixs, args_tar)
        self._merge_rnaplex_rnaup(prefixs, args_tar)
        # Final cleanup of all temporary folders/files.
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "dir")
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
Beispiel #51
0
class RATT(object):
    '''Annotation transfer from reference genomes to target genomes via RATT.

    Converts reference gbk files to embl format if needed, runs RATT for
    every reference:target pair, converts the resulting embl files to
    gff/ptt/rnt and merges them per target genome.
    '''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            # Working folders for splitting the reference gbk records and
            # for the embl files converted from them.
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            # User-supplied embl files take precedence over converted gbk.
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # Temporary files/folders used while merging the RATT output.
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
        }

    def _convert_to_pttrnt(self, gffs, files, log):
        '''Generate .ptt/.rnt files for every gff in "files" that has a
        matching fasta in the target tmp folder.'''
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        '''Move the merged results into place and delete temporary data.'''
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temporary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        '''Convert one RATT embl result to gff, fix its format and register
        the generated filename in "files".'''
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split multi-record gbk files into one file per record, stored in
        self.gbk, and return that folder.'''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            # Robustness fix: track the output handle per file so lines
            # appearing before the first LOCUS cannot reference an
            # undefined (NameError) or already-closed handle.
            out = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                # Record name from the LOCUS line.
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        # NOTE(review): str.find returns 0 (falsy) only when
                        # new_filename starts with filename, so the VERSION
                        # name wins in every other case (even when filename
                        # is not found at all, -1). Looks intentional but
                        # worth confirming.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if (not close) and (out is not None):
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''Convert gbk files found in ref_embls to embl format; return the
        folder of split gbk records (or exit if no gbk file is found).'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write(
                "--related_gbk_files is assigned, but no gbk files are detected.\n"
                "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n"
            )
            print("Error: Please assign proper GenBank files!")
            sys.exit()
        else:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write(
                "Running converter.py to convert gbk file to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl +
                      " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        '''Invoke the RATT executable for one reference/target pair; the
        stdout goes to "out" (the ratt log file).'''
        if (not os.path.exists(self.embl)) or (not os.path.exists(
                os.path.join(self.tmp_files["tar"], tar + ".fa"))) or (
                    not os.path.exists(
                        os.path.join(self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write(
                "The strain names in --compare_pair should be the same "
                "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar +
                         ".fa"), args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ]) + "\n")
        call([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ],
             stdout=out,
             stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        '''Run RATT for every reference:target pair and sort its outputs
        (keep "final" files, delete intermediate RATT artifacts).'''
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # Leak fix: the log handle is now closed per pair; previously
            # every handle but the last leaked and an empty pair list
            # raised NameError on the final close.
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generated:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or ("Reference" in filename) or (
                            "Query" in filename) or ("Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        '''Main entry point: run RATT and merge the transferred annotations
        into one gff/ptt/rnt set per target genome.'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # Bug fix: the attribute is "ref_gbk"; "ref_gbki" does not exist
            # and raised AttributeError whenever no embl files were given.
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write(
                    "Running converter.py to convert embl "
                    "files in {0} to gff, ptt, and rnt format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(
                        os.path.join(args_ratt.tar_fastas, folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(
                        self.tmp_files["gff"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".gff"))
                    shutil.move(
                        self.tmp_files["ptt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".ptt"))
                    shutil.move(
                        self.tmp_files["rnt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
Beispiel #52
0
class SNPCalling(object):
    '''Detection of SNPs with samtools/bcftools.

    Merges the input BAM files, runs mpileup/call with the requested BAQ
    modes and converts the raw VCFs into tables, statistics and mutated
    sequences per genome.
    '''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Output layout depends on whether the reads are compared against
        # the reference or used to validate the target genome.
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {"table": os.path.join(
                        args_snp.out_folder, file_type, "SNP_table"),
                        "raw": os.path.join(
                        args_snp.out_folder, file_type, "SNP_raw_outputs"),
                        "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
                        "depth": os.path.join(args_snp.out_folder, "tmp_depth")}
        # Remove stale merged BAMs left over from a previous run.
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam"),
                     "bams": []}
        self.header = os.path.join(args_snp.out_folder, "header")
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_,
                        prefix, bam_number, table_path, args_snp):
        '''Turn one raw VCF into tables, statistics and mutated sequences.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_prefix = os.path.join(self.stat_path, "_".join([
            "stat", "_".join([prefix, self.baqs[type_]]), "SNP"]))
        snp_detect(fasta, snp, self.outputs["depth"], out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_prefix, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _get_para(self, args_snp):
        '''Return the merged BAM path and the bcftools call parameter for
        the chosen caller ("c" -> consensus, otherwise multiallelic).'''
        bams = self.bams["sort"]
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bams, bcf_para

    def _run_tools(self, fasta_file, out_raw_prefix, type_, args_snp):
        '''Run samtools mpileup (with the requested BAQ mode) and bcftools
        call; return the path of the produced VCF.'''
        bams, bcf_para = self._get_para(args_snp)
        if type_ == "with":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
        elif type_ == "without":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-B"]
        elif type_ == "extend":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-E"]
        if args_snp.rg:
            command = command + ["-ugf", fasta_file, bams]
        else:
            command = command + ["--ignore-RG", "-ugf", fasta_file, bams]
        # NOTE(review): os.system builds a shell string; paths containing
        # spaces or shell metacharacters would break here.
        os.system(" ".join(command) + ">" + self.outputs["tmp"])
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                  self.outputs["tmp"], bcf_para, "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], bcf_para, "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Run one BAQ mode end-to-end for one genome.'''
        out_vcf = self._run_tools(fasta_file, file_prefixs["raw_prefix"],
                                  type_, args_snp)
        self.helper.check_make_folder(
             os.path.join(self.seq_path, self.baqs[type_], prefix))
        self._transcript_snp(
            fasta_file, out_vcf,
            "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
            type_, prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        '''Run every requested BAQ mode for one genome.'''
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        '''Return (True, prefix) for a recognized fasta filename, otherwise
        (False, "").'''
        detect = False
        # Robustness fix: initialize prefix so an unexpected extension
        # returns (False, "") instead of raising UnboundLocalError (the
        # caller unpacks the tuple unconditionally).
        prefix = ""
        if fasta.endswith(".fa"):
            prefix = fasta[:-3]
            detect = True
        elif fasta.endswith(".fna"):
            prefix = fasta[:-4]
            detect = True
        elif fasta.endswith(".fasta"):
            prefix = fasta[:-6]
            detect = True
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        '''Run "samtools merge" or "samtools sort" on the whole-read BAM.'''
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                                 "-o", bam_file, self.bams["whole"]]))
        os.system(command)
        self.bams["bams"].append(bam_file.replace(".bam", "_sort.bam"))

    def _merge_bams(self, args_snp):
        '''Merge and sort all input BAMs into one whole-read BAM, index it
        and compute its per-base depth. Return the number of input BAMs.'''
        bams = []
        if (args_snp.bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            num_bam = 0
            for files in args_snp.bams:
                for bam in glob(files):
                    bams.append(bam)
                    num_bam += 1
        if num_bam == 0:
            # Robustness fix: a pattern matching no file previously crashed
            # below with IndexError on bams[0].
            print("Error: No BAM files are detected!")
            sys.exit()
        elif num_bam == 1:
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sorting BAM file now")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        else:
            print("Merging BAM files now")
            self._run_bam(args_snp.samtools_path, "merge",
                          " ".join(bams))
            print("Sorting BAM file now")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        call([args_snp.samtools_path, "index", self.bams["sort"]])
        # Leak fix: close the depth file once samtools is done.
        with open(self.outputs["depth"], "w") as out_depth:
            call([args_snp.samtools_path, "depth", self.bams["sort"]],
                 stdout=out_depth)
        return num_bam

    def _modify_header(self, fastas):
        '''Normalize the headers of every fasta file in "fastas".'''
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        '''Append the reference sequence names found in the BAM header
        (@SQ lines) to seq_names.'''
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                if row[0] == "@SQ":
                    seq_names.append(row[1].split(":")[1])

    def _get_genome_name(self, args_snp):
        '''Return the genome names referenced by the sorted BAM.'''
        seq_names = []
        self._get_header(args_snp.samtools_path, self.bams["sort"],
                         seq_names)
        return seq_names

    def _remove_bams(self):
        '''Delete all intermediate BAM/index/header/depth files.'''
        if os.path.exists(self.bams["whole"]):
            os.remove(self.bams["whole"])
        if os.path.exists(self.bams["whole"] + ".bai"):
            os.remove(self.bams["whole"] + ".bai")
        if os.path.exists(self.bams["sort"]):
            os.remove(self.bams["sort"])
        if os.path.exists(self.bams["sort"] + ".bai"):
            os.remove(self.bams["sort"] + ".bai")
        if os.path.exists(self.header):
            os.remove(self.header)
        # Consistency: guard the removal like every other file above.
        if os.path.exists(self.outputs["depth"]):
            os.remove(self.outputs["depth"])

    def run_snp_calling(self, args_snp):
        '''Main entry point: prepare the inputs and run SNP calling for
        every genome covered by the merged BAM.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp)
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        else:
            for fasta in os.listdir(self.fasta_path):
                # Only process fasta files that the BAM header references.
                if (fasta.split(".f")[0] in seq_names):
                    fasta_datas = self._detect_fasta(fasta)
                    detect = fasta_datas[0]
                    prefix = fasta_datas[1]
                    if detect:
                        detect = False
                        print("Computing {0} now".format(fasta))
                        self.helper.check_make_folder(
                             os.path.join(self.outputs["table"], prefix))
                        self.helper.check_make_folder(
                             os.path.join(self.outputs["raw"], prefix))
                        file_prefixs = {"raw_prefix": os.path.join(
                                        self.outputs["raw"], prefix, prefix),
                                        "table_prefix": os.path.join(
                                        self.outputs["table"], prefix, prefix)}
                        fasta_file = os.path.join(self.fasta_path, fasta)
                        table_path = os.path.join(self.outputs["table"],
                                                  prefix)
                        self._run_program(fasta_file, file_prefixs, prefix,
                                          bam_number, table_path, args_snp)
                        os.remove(self.outputs["tmp"])
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams()
Beispiel #53
0
class Multiparser(object):

    def __init__(self):
        '''Set up the helper objects and the scratch filenames used while
        splitting/merging fasta, gff and wig files.'''
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Temporary filenames created inside the target folder during merges.
        self.tmp_fa = "tmp.fa"
        self.tmp_gff = "tmp.gff"
        self.tmp_wig_forward = "tmp_forward.wig"
        self.tmp_wig_reverse = "tmp_reverse.wig"

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        '''Merge the fasta files in tar_folder genome-wise, using the
        per-genome "_folder" directories in ref_folder to decide which
        files belong together.'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                # Consistency fix: reuse get_prefix instead of duplicating
                # the prefix-extraction logic inline.
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging fasta file of " + prefix)
                for file_ in os.listdir("/".join([ref_folder, folder])):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    # endswith accepts a tuple: one call instead of three.
                    if tar.endswith((".fa", ".fna", ".fasta")):
                        filename = ".".join((tar.split("."))[:-1])
                        for file_ in files:
                            if filename == file_:
                                self.helper.merge_file(
                                     os.path.join(tar_folder, tar),
                                     os.path.join(tar_folder, self.tmp_fa))
                                change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_fa),
                                os.path.join(tar_merge, prefix + ".fa"))
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def get_prefix(self, folder, ref_feature):
        datas = folder.split("_folder")
        if ref_feature == "":
            prefix = datas[0][:-4]
        elif ref_feature == "_fasta":
            if datas[0].endswith(".fa"):
                prefix = datas[0][:-3]
            elif datas[0].endswith(".fna"):
                prefix = datas[0][:-4]
            elif datas[0].endswith(".fasta"):
                prefix = datas[0][:-6]
        else:
            datas = datas[0][:-4]
            datas = datas.split(ref_feature)
            prefix = datas[0]
        return prefix

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        '''Merge the wig files in tar_folder genome-wise (forward and
        reverse strand separately), using the per-genome "_folder"
        directories in ref_folder and the strand sign in "libs".'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change_f = False
        change_r = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging wig file of " + prefix)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    filename = tar.split("_STRAIN_")
                    for file_ in files:
                        if (tar.endswith(".wig")) and (
                                file_ == filename[-1][:-4]):
                            for lib in libs:
                                if (filename[0] in lib) and (lib[-1] == "+"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_forward))
                                    change_f = True
                                elif (filename[0] in lib) and (lib[-1] == "-"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_reverse))
                                    # Bug fix: set the reverse flag only when
                                    # a reverse wig was actually merged; it
                                    # was previously mis-indented and ran on
                                    # every library iteration.
                                    change_r = True
                if change_f and change_r:
                    change_f = False
                    change_r = False
                    shutil.move(os.path.join(tar_folder, self.tmp_wig_forward),
                                os.path.join(tar_merge,
                                             prefix + "_forward.wig"))
                    shutil.move(os.path.join(tar_folder, self.tmp_wig_reverse),
                                os.path.join(tar_merge,
                                             prefix + "_reverse.wig"))
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        '''Merge the gff files in tar_folder genome-wise, using the
        per-genome "_folder" directories in ref_folder; tar_feature is an
        optional suffix of the target gff filenames (e.g. a feature name).'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if tar_feature is None:
            tar_feature = ""
        else:
            tar_feature = "_" + tar_feature
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                # Consistency fix: reuse get_prefix instead of duplicating
                # the prefix-extraction logic inline.
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging gff file of " + prefix + tar_feature)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    for file_ in files:
                        if (".gff" in tar) and (
                                file_ + tar_feature == tar[:-4]):
                            self.helper.merge_file(
                                 os.path.join(tar_folder, tar),
                                 os.path.join(tar_folder, self.tmp_gff))
                            change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_gff),
                                os.path.join(tar_merge,
                                             prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        '''Split every multi-record fasta file in "fastas" into one file
        per sequence, stored both in "<fasta>_folder" and in "fastas/tmp".'''
        par_tmp = os.path.join(fastas, "tmp")
        first = True
        out = None
        out_t = None
        for fasta in os.listdir(fastas):
            if (fasta.endswith("fasta") or
                    fasta.endswith("fa") or
                    fasta.endswith("fna")):
                # Normalize the fasta headers before splitting.
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        for fasta in os.listdir(fastas):
            if ("_folder" not in fasta) and ("tmp" != fasta):
                if (fasta.endswith(".fa")) or \
                   (fasta.endswith(".fna")) or \
                   (fasta.endswith(".fasta")):
                    out_path = os.path.join(fastas, fasta + "_folder")
                    print("Parser " + fasta + "...")
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(fastas, fasta), "r") as f_f:
                        for line in f_f:
                            if line[0] == ">":
                                line = line.strip()
                                # For NCBI-style headers (">gi|..|ref|ACC|..")
                                # keep only the accession field.
                                if ("|" in line) and (
                                        len(line.split("|")) > 4):
                                    strain = line.split("|")
                                    name = strain[3]
                                else:
                                    name = line[1:]
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(
                                           out_path, name + ".fa"), "w")
                                out_t = open(os.path.join(
                                             par_tmp, name + ".fa"), "w")
                                out.write(">" + name + "\n")
                                out_t.write(">" + name + "\n")
                            else:
                                out.write(line)
                                out_t.write(line)
        # Robustness fix: only close the handles when at least one sequence
        # was written; previously a folder without fasta files crashed with
        # AttributeError on None.
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_gff(self, gff_folder, feature):
        """Split sorted gff files into one file per sequence ID.

        Every entry goes to ``<file>_folder/<seq_id><feature>.gff`` and to
        the shared ``tmp`` folder. ``feature`` (may be None) becomes a
        ``_<feature>`` suffix of the produced file names.
        """
        par_tmp = os.path.join(gff_folder, "tmp")
        out = None
        out_t = None
        first = True
        if feature is None:
            feature = ""
        else:
            feature = "_" + feature
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            pre_seq_id = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(gff_folder, filename + "_folder")
                if ".gff" in filename:
                    print("Parser " + filename + "...")
                    self.helper.check_make_folder(out_path)
                    self.helper.sort_gff(os.path.join(gff_folder, filename),
                                         os.path.join(gff_folder, "tmp.gff"))
                    # Entries are sorted by sequence, so a change of the
                    # first column marks the start of a new sequence.
                    with open(os.path.join(gff_folder, "tmp.gff"), "r") as f_h:
                        for row in csv.reader(f_h, delimiter="\t"):
                            if row[0].startswith("#"):
                                continue
                            if pre_seq_id == row[0]:
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                            else:
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(out_path,
                                           row[0] + feature + ".gff"), "w")
                                out_t = open(os.path.join(par_tmp,
                                             row[0] + feature + ".gff"), "w")
                                pre_seq_id = row[0]
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
        if os.path.exists(os.path.join(gff_folder, "tmp.gff")):
            os.remove(os.path.join(gff_folder, "tmp.gff"))
        # If no gff file was found, nothing was ever opened; the former
        # unconditional close raised AttributeError on None here.
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_wig(self, wig_folder):
        """Split wiggle files into one file per strain.

        Each ``variableStep`` declaration starts a new output pair named
        ``<file>_STRAIN_<strain>.wig`` in ``<file>_folder`` and in ``tmp``;
        a preceding ``track`` line is repeated in every split file.
        """
        par_tmp = os.path.join(wig_folder, "tmp")
        first = True
        out = None
        out_t = None
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            track_info = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(wig_folder, filename + "_folder")
                if ".wig" in filename:
                    print("Parser {0}...".format(filename))
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(wig_folder, filename), "r") as w_f:
                        for line in w_f:
                            line = line.split(" ")
                            if (line[0] == "track"):
                                track_info = " ".join(line)
                            if (line[0] == "variableStep"):
                                # "variableStep chrom=<strain> ..." -> the
                                # strain name selects the output files.
                                strain = line[1].split("=")
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open("".join([
                                    os.path.join(out_path, filename[:-4]),
                                    "_STRAIN_", strain[1], ".wig"]), "w")
                                out_t = open("".join([
                                    os.path.join(wig_folder, "tmp",
                                                 filename[:-4]),
                                    "_STRAIN_", strain[1], ".wig"]), "w")
                                if track_info != "":
                                    out.write(track_info)
                                    out_t.write(track_info)
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
                            if (line[0] != "track") and (
                                    line[0] != "variableStep"):
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
        # If no wig file was found, nothing was ever opened; the former
        # unconditional close raised AttributeError on None here.
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()
Beispiel #54
0
class ArgsContainer(object):

    def __init__(self):
        # Helper objects shared by all the container-builder methods below.
        self.multiparser = Multiparser()
        self.helper = Helper()

    def _check_replicates(self, replicates_tex, replicates_frag):
        if (replicates_tex is not None) and (replicates_frag is not None):
            replicates = {"tex": int(replicates_tex),
                          "frag": int(replicates_frag)}
        elif replicates_tex is not None:
            replicates = {"tex": int(replicates_tex), "frag": -1}
        elif replicates_frag is not None:
            replicates = {"tex": -1, "frag": int(replicates_frag)}
        else:
            print("Error:No replicates number assign!!!")
            sys.exit()
        return replicates

    def _check_libs(self, tex_notex_libs, frag_libs):
        if (tex_notex_libs is None) and (frag_libs is None):
            print("Error: please input proper libraries!!")
        if (tex_notex_libs is not None) and (frag_libs is not None):
            libs = tex_notex_libs + frag_libs
        elif (tex_notex_libs is not None):
            libs = tex_notex_libs
        elif (frag_libs is not None):
            libs = frag_libs
        return libs

    def _parser_combine_wigs(self, subcommand):
        """Parse and combine the wiggle inputs for *subcommand*.

        Splits the annotation and wig folders into per-sequence files,
        combines each wig set against the annotation, and — when both
        TEX+/- and fragmented wigs exist — merges them via ``_merge_wig``.
        Exits when no wig folder was supplied at all.
        """
        self.tex_path = None
        self.frag_path = None
        self.multiparser.parser_gff(self.gffs, None)
        if subcommand == "terminator":
            # Terminator works on the already-split gff files under "tmp";
            # split those a second time so downstream lookups succeed.
            gff_path = os.path.join(self.gffs, "tmp")
            self.multiparser.parser_gff(gff_path, None)
        else:
            gff_path = self.gffs
        if self.tex_wigs is not None:
            self.tex_path = os.path.join(self.tex_wigs, "tmp")
            self.multiparser.parser_wig(self.tex_wigs)
            self.multiparser.combine_wig(gff_path, self.tex_path,
                                         None, self.libs)
            self.merge_wigs = self.tex_wigs
            self.wig_path = self.tex_path
        if self.frag_wigs is not None:
            self.frag_path = os.path.join(self.frag_wigs, "tmp")
            self.multiparser.parser_wig(self.frag_wigs)
            self.multiparser.combine_wig(gff_path, self.frag_path,
                                         None, self.libs)
            # If both types exist these defaults are overridden below.
            self.merge_wigs = self.frag_wigs
            self.wig_path = self.frag_path
        if (self.tex_path is not None) and (
                self.frag_path is not None):
            self = self._merge_wig()
        if (self.tex_path is None) and (
                self.frag_path is None):
            print("Error: There is no proper wig files assigned!!")
            sys.exit()
        return self

    def _merge_wig(self):
        """Merge TEX+/- and fragmented wig folders into one working folder.

        Raw wig files from both inputs are copied into ``merge_wigs``;
        the per-strain split files are copied (tex) and then appended
        (frag) into ``merge_wigs/tmp`` so each strain file carries both
        library types. The copy/merge order is significant.
        """
        self.merge_wigs = os.path.join(self.out_folder, "merge_wigs")
        if (self.tex_wigs is not None) and (
                self.frag_wigs is not None):
            self.helper.check_make_folder(self.merge_wigs)
            self.wig_path = os.path.join(self.merge_wigs, "tmp")
            self.helper.check_make_folder(self.wig_path)
            for wig in os.listdir(self.tex_wigs):
                if os.path.isfile(os.path.join(self.tex_wigs, wig)):
                    shutil.copy(os.path.join(self.tex_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.frag_wigs):
                if os.path.isfile(os.path.join(self.frag_wigs, wig)):
                    shutil.copy(os.path.join(self.frag_wigs, wig),
                                self.merge_wigs)
            # Tex split files are copied first, then frag split files are
            # appended onto any same-named file via merge_file.
            for wig in os.listdir(self.tex_path):
                if os.path.isfile(os.path.join(self.tex_path, wig)):
                    shutil.copy(os.path.join(self.tex_path, wig),
                                self.wig_path)
            for wig in os.listdir(self.frag_path):
                if os.path.isfile(os.path.join(self.frag_path, wig)):
                    self.helper.merge_file(os.path.join(self.frag_path, wig),
                                           os.path.join(self.wig_path, wig))
        elif (self.tex_wigs is not None):
            self.merge_wigs = self.tex_wigs
        elif (self.frag_wigs is not None):
            self.merge_wigs = self.frag_wigs
        return self

    def _deal_multi_inputs(self, inputs, file_type, num, command):
        if inputs is not None:
            datas = inputs.split(",")
            if num is not None:
                if (len(datas) != num):
                    print("Error: the amount of {0} is not correct!!".format(
                        command))
            new_inputs = []
            for data in datas:
                if file_type == "float":
                    new_inputs.append(float(data.strip()))
                elif file_type == "int":
                    new_inputs.append(int(data.strip()))
                else:
                    new_inputs.append(data)
            return new_inputs
        else:
            return inputs

    def container_ratt(self, ratt_path, element, transfer_type,
                       ref_embl_gbk, target_fasta, ref_fasta, ratt_folder,
                       convert_to_gff_rnt_ptt, tar_annotation_folder,
                       compare_pair):
        self.ratt_path = ratt_path
        self.element = element
        self.transfer_type = transfer_type
        self.ref_embls = ref_embl_gbk
        self.tar_fastas = target_fasta
        self.ref_fastas = ref_fasta
        self.output_path = ratt_folder
        self.convert = convert_to_gff_rnt_ptt
        self.gff_outfolder = tar_annotation_folder
        self.pairs = self._deal_multi_inputs(compare_pair, "str", None, None)
        return self

    def container_tsspredator(self, TSSpredator_path, compute_program,
                              fasta_folder, annotation_folder, wig_folder, lib,
                              output_prefix, height, height_reduction, factor,
                              factor_reduction, base_height, enrichment_factor,
                              processing_factor, replicate_match, out_folder,
                              statistics, validate_gene, merge_manual,
                              compare_transcript_assembly, fuzzy, utr_length,
                              cluster, length, re_check_orphan,
                              overlap_feature, reference_gff_folder,
                              remove_low_expression):
        """Store all TSSpredator (TSS / processing-site) settings.

        Comma-separated string options (libraries, output prefixes) are
        split into lists via ``_deal_multi_inputs``; all other arguments
        are stored as passed.
        """
        self.tsspredator_path = TSSpredator_path
        self.program = compute_program
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.wig_folder = wig_folder
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.output_prefixs = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.height = height
        self.height_reduction = height_reduction
        self.factor = factor
        self.factor_reduction = factor_reduction
        self.base_height = base_height
        self.enrichment_factor = enrichment_factor
        self.processing_factor = processing_factor
        self.repmatch = replicate_match
        self.out_folder = out_folder
        self.stat = statistics
        self.validate = validate_gene
        self.manual = merge_manual
        self.ta_files = compare_transcript_assembly
        self.fuzzy = fuzzy
        self.utr_length = utr_length
        self.cluster = cluster
        self.nt_length = length
        self.check_orphan = re_check_orphan
        self.overlap_feature = overlap_feature
        self.references = reference_gff_folder
        self.remove_low_expression = remove_low_expression
        return self

    def container_optimize(self, TSSpredator_path, fasta_file, annotation_file,
                           wig_folder, manual, out_folder, strain_name,
                           max_height, max_height_reduction, max_factor,
                           max_factor_reduction, max_base_height,
                           max_enrichment_factor, max_processing_factor,
                           utr_length, lib, output_prefix, cluster, length,
                           core, program, replicate_match, steps):
        """Store the settings for TSSpredator parameter optimization.

        The ``max_*`` arguments are upper bounds for the parameter search;
        library and prefix strings are split into lists.
        """
        self.tsspredator_path = TSSpredator_path
        self.fastas = fasta_file
        self.gffs = annotation_file
        self.wigs = wig_folder
        self.manual = manual
        self.output_folder = out_folder
        self.project_strain = strain_name
        self.height = max_height
        self.height_reduction = max_height_reduction
        self.factor = max_factor
        self.factor_reduction = max_factor_reduction
        self.base_height = max_base_height
        self.enrichment = max_enrichment_factor
        self.processing = max_processing_factor
        self.utr = utr_length
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.replicate_name = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.cluster = cluster
        self.length = length
        self.cores = core
        self.program = program
        self.replicate = replicate_match
        self.steps = steps
        return self

    def container_terminator(
            self, TransTermHP_path, expterm_path, RNAfold_path, out_folder,
            fasta_folder, annotation_folder, transcript_folder, srna,
            statistics, tex_wig_folder, frag_wig_folder, decrease,
            highest_coverage, fuzzy_detect_coverage, fuzzy_within_transcript,
            fuzzy_downstream_transcript, fuzzy_within_gene,
            fuzzy_downstream_gene, transtermhp_folder, tex_notex_libs,
            frag_libs, tex_notex, replicates_tex, replicates_frag, table_best,
            min_loop_length, max_loop_length, min_stem_length, max_stem_length,
            min_AT_tail_length, miss_rate, range_u):
        """Store the terminator-detection settings and parse the wig inputs.

        Library strings are split into lists, replicate counts validated,
        and the wiggle files are parsed/combined at the end (which exits
        if no wig folder was supplied).
        """
        self.TransTermHP_path = TransTermHP_path
        self.expterm_path = expterm_path
        self.RNAfold_path = RNAfold_path
        self.out_folder = out_folder
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.srnas = srna
        self.stat = statistics
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.decrease = decrease
        self.cutoff_coverage = highest_coverage
        self.fuzzy = fuzzy_detect_coverage
        self.fuzzy_up_ta = fuzzy_within_transcript
        self.fuzzy_down_ta = fuzzy_downstream_transcript
        self.fuzzy_up_gene = fuzzy_within_gene
        self.fuzzy_down_gene = fuzzy_downstream_gene
        self.hp_folder = transtermhp_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.table_best = table_best
        self.min_loop = min_loop_length
        self.max_loop = max_loop_length
        self.min_stem = min_stem_length
        self.max_stem = max_stem_length
        self.at_tail = min_AT_tail_length
        self.miss_rate = miss_rate
        self.range_u = range_u
        # Parse/combine the wig files now that libs and paths are stored.
        self = self._parser_combine_wigs("terminator")
        return self

    def container_transcript(
            self, frag_wig_path, tex_wig_path, tex_notex, length,
            annotation_folder, height, width, tolerance, tolerance_coverage,
            replicates_tex, replicates_frag, transcript_assembly_output_folder,
            compare_TSS, compare_genome_annotation, TSS_fuzzy,
            tex_treated_libs, fragmented_libs, compare_feature_genome,
            table_best, terminator_folder, fuzzy_term):
        """Store the transcript-assembly settings and parse the wig inputs.

        Library and feature strings are split into lists, replicate counts
        validated, and the wiggle files are parsed/combined at the end.
        """
        self.frag_wigs = frag_wig_path
        self.tex_wigs = tex_wig_path
        self.tex = tex_notex
        self.length = length
        self.gffs = annotation_folder
        self.height = height
        self.width = width
        self.tolerance = tolerance
        self.low_cutoff = tolerance_coverage
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.out_folder = transcript_assembly_output_folder
        self.compare_tss = compare_TSS
        self.compare_cds = compare_genome_annotation
        self.fuzzy = TSS_fuzzy
        self.tlibs = self._deal_multi_inputs(tex_treated_libs, "str", None,
                                             None)
        self.flibs = self._deal_multi_inputs(fragmented_libs, "str", None,
                                             None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.c_feature = self._deal_multi_inputs(compare_feature_genome, "str",
                                                 None, None)
        self.table_best = table_best
        self.terms = terminator_folder
        self.fuzzy_term = fuzzy_term
        # Parse/combine the wig files now that libs and paths are stored.
        self = self._parser_combine_wigs("transcript")
        return self

    def container_utr(self, tss_folder, annotation_folder,
                      transcript_assembly_folder, terminator_folder,
                      terminator_fuzzy, utr_folder, tss_source, base_5utr,
                      length, base_3utr):
        self.tsss = tss_folder
        self.gffs = annotation_folder
        self.trans = transcript_assembly_folder
        self.terms = terminator_folder
        self.fuzzy = terminator_fuzzy
        self.out_folder = utr_folder
        self.source = tss_source
        self.base_5utr = base_5utr
        self.base_3utr = base_3utr
        self.length = length
        return self

    def container_srna(
            self, Vienna_folder, Vienna_utils, blast_plus_folder,
            ps2pdf14_path, srna_folder, UTR_derived_sRNA, annotation_folder,
            TSS_folder, transcript_assembly_folder, TSS_intergenic_fuzzy,
            TSS_5UTR_fuzzy, TSS_3UTR_fuzzy, TSS_interCDS_fuzzy, import_info,
            tex_wig_folder, frag_wig_folder, processing_site_folder,
            fasta_folder, mountain_plot, nr_format, srna_format,
            sRNA_database_path, nr_database_path, cutoff_energy,
            run_intergenic_TEX_coverage, run_intergenic_noTEX_coverage,
            run_intergenic_fragmented_coverage, run_antisense_TEX_coverage,
            run_antisense_noTEX_coverage, run_antisense_fragmented_coverage,
            intergenic_tolerance, run_utr_TEX_coverage, run_utr_noTEX_coverage,
            run_utr_fragmented_coverage, max_length, min_length,
            tex_notex_libs, frag_libs, replicates_tex, replicates_frag,
            tex_notex, blast_e_nr, blast_e_srna, detect_sRNA_in_CDS,
            table_best, decrease_intergenic, decrease_utr, fuzzy_intergenic,
            fuzzy_utr, cutoff_nr_hit, sORF, best_with_all_sRNAhit,
            best_without_sORF_candidate, overlap_percent_CDS,
            terminator_folder, terminator_fuzzy_in_CDS,
            terminator_fuzzy_out_CDS, best_with_terminator,
            ignore_hypothetical_protein, TSS_source, min_utr_coverage,
            promoter_table, best_with_promoter, ranking_promoter,
            promoter_name):
        """Store all sRNA-detection settings and parse the wig inputs.

        Comma-separated coverage/library strings are split into typed
        lists; ``ranking_promoter`` must be >= 1; the wiggle files are
        parsed/combined at the end.
        """
        self.vienna_path = Vienna_folder
        self.vienna_util = Vienna_utils
        self.blast_path = blast_plus_folder
        self.ps2pdf14_path = ps2pdf14_path
        self.out_folder = srna_folder
        self.utr_srna = UTR_derived_sRNA
        self.gffs = annotation_folder
        self.tss_folder = TSS_folder
        self.trans = transcript_assembly_folder
        self.fuzzy_inter_tss = TSS_intergenic_fuzzy
        self.fuzzy_5utr_tss = TSS_5UTR_fuzzy
        self.fuzzy_3utr_tss = TSS_3UTR_fuzzy
        self.fuzzy_intercds_tss = TSS_interCDS_fuzzy
        self.fuzzy_tsss = {"5utr": self.fuzzy_5utr_tss,
                           "3utr": self.fuzzy_3utr_tss,
                           "interCDS": self.fuzzy_intercds_tss,
                           "inter": self.fuzzy_inter_tss}
        self.import_info = self._deal_multi_inputs(import_info, "str",
                                                   None, None)
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.pro_folder = processing_site_folder
        self.fastas = fasta_folder
        self.mountain = mountain_plot
        self.nr_format = nr_format
        self.srna_format = srna_format
        self.srna_database = sRNA_database_path
        self.nr_database = nr_database_path
        self.energy = cutoff_energy
        self.coverage_tex = self._deal_multi_inputs(
                run_intergenic_TEX_coverage, "float", 5,
                "--run_intergenic_TEX_coverage")
        self.coverage_notex = self._deal_multi_inputs(
                run_intergenic_noTEX_coverage, "float", 5,
                "--run_intergenic_noTEX_coverage")
        self.coverage_frag = self._deal_multi_inputs(
                run_intergenic_fragmented_coverage, "float", 5,
                "--run_intergenic_fragmented_coverage")
        self.anti_cover_tex = self._deal_multi_inputs(
                run_antisense_TEX_coverage, "float", 5,
                "--run_antisense_TEX_coverage")
        self.anti_cover_notex = self._deal_multi_inputs(
                run_antisense_noTEX_coverage, "float", 5,
                "--run_antisense_noTEX_coverage")
        self.anti_cover_frag = self._deal_multi_inputs(
                run_antisense_fragmented_coverage, "float", 5,
                "--run_antisense_fragmented_coverage")
        self.tolerance = intergenic_tolerance
        self.utr_tex_cover = self._deal_multi_inputs(
                run_utr_TEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        # Fixed: the error message previously named --run_utr_TEX_coverage
        # for the noTEX option (copy-paste defect).
        self.utr_notex_cover = self._deal_multi_inputs(
                run_utr_noTEX_coverage, "str", 3, "--run_utr_noTEX_coverage")
        self.utr_frag_cover = self._deal_multi_inputs(
                run_utr_fragmented_coverage, "str", 3,
                "--run_utr_fragmented_coverage")
        self.max_len = max_length
        self.min_len = min_length
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.tex_notex = tex_notex
        self.e_nr = blast_e_nr
        self.e_srna = blast_e_srna
        self.in_cds = detect_sRNA_in_CDS
        self.table_best = table_best
        self.decrease_inter = decrease_intergenic
        self.decrease_utr = decrease_utr
        self.fuzzy_inter = fuzzy_intergenic
        self.fuzzy_utr = fuzzy_utr
        self.nr_hits_num = cutoff_nr_hit
        self.sorf_file = sORF
        self.all_hit = best_with_all_sRNAhit
        self.best_sorf = best_without_sORF_candidate
        self.cutoff_overlap = overlap_percent_CDS
        self.terms = terminator_folder
        self.fuzzy_b = terminator_fuzzy_in_CDS
        self.fuzzy_a = terminator_fuzzy_out_CDS
        self.best_term = best_with_terminator
        self.hypo = ignore_hypothetical_protein
        self.tss_source = TSS_source
        self.min_utr = min_utr_coverage
        self.promoter_table = promoter_table
        self.best_promoter = best_with_promoter
        if ranking_promoter < 1:
            print("Error: --ranking_time_promoter must larger than 1...")
            sys.exit()
        self.rank_promoter = ranking_promoter
        self.promoter_name = self._deal_multi_inputs(promoter_name, "str",
                                                     None, None)
        # Parse/combine the wig files now that libs and paths are stored.
        self = self._parser_combine_wigs("srna")
        return self

    def container_intersrna(self, file_type, files, args_srna, prefix,
                            gff_file, tran_file, tss_file, pro_file, fuzzy):
        """Attach per-strain settings for intergenic sRNA detection.

        ``file_type`` ("frag" or tex) selects which wig folder, libraries,
        output files and coverage cutoffs are used.
        """
        args_srna.file_type = file_type
        args_srna.gff_file = gff_file
        args_srna.tran_file = tran_file
        args_srna.tss_file = tss_file
        args_srna.pro_file = pro_file
        args_srna.fuzzy = fuzzy
        args_srna.prefix = prefix
        use_frag = (file_type == "frag")
        wig_dir = args_srna.frag_path if use_frag else args_srna.tex_path
        args_srna.wig_f_file = os.path.join(
                wig_dir, "_".join([prefix, "forward.wig"]))
        args_srna.wig_r_file = os.path.join(
                wig_dir, "_".join([prefix, "reverse.wig"]))
        if use_frag:
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.cutoffs = args_srna.coverage_frag
            args_srna.tss_source = True
            args_srna.cut_notex = None
            args_srna.anti_notex_cutoff = None
        else:
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.cutoffs = args_srna.coverage_tex
            args_srna.cut_notex = args_srna.coverage_notex
            args_srna.anti_notex_cutoff = args_srna.anti_cover_notex
        return args_srna

    def container_utrsrna(self, gff, tran, tss, files, pro, fasta, file_type,
                          prefix, args_srna):
        """Attach per-strain settings for UTR-derived sRNA detection.

        ``file_type`` ("frag" or tex) selects which wig folder, libraries,
        output files and UTR coverage cutoffs are used; positional cutoffs
        are then mapped onto the three UTR-derived sRNA types.
        """
        args_srna.file_type = file_type
        args_srna.gff_file = gff
        args_srna.ta_file = tran
        args_srna.tss_file = tss
        args_srna.pro_file = pro
        args_srna.prefix = prefix
        args_srna.seq_file = fasta
        use_frag = (file_type == "frag")
        wig_dir = args_srna.frag_path if use_frag else args_srna.tex_path
        args_srna.wig_f_file = os.path.join(
                wig_dir, "_".join([prefix, "forward.wig"]))
        args_srna.wig_r_file = os.path.join(
                wig_dir, "_".join([prefix, "reverse.wig"]))
        if use_frag:
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.utr_coverages = args_srna.utr_frag_cover
            args_srna.notex = None
        else:
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.utr_coverages = args_srna.utr_tex_cover
            args_srna.notex = args_srna.utr_notex_cover
        utr_keys = ("5utr", "3utr", "interCDS")
        args_srna.coverages = {key: args_srna.utr_coverages[idx]
                               for idx, key in enumerate(utr_keys)}
        if args_srna.notex is None:
            args_srna.cover_notex = None
        else:
            args_srna.cover_notex = {key: args_srna.notex[idx]
                                     for idx, key in enumerate(utr_keys)}
        return args_srna

    def extend_inter_container(self, args_srna, tsss, pros, wigs_f, wigs_r,
                               nums, output, out_table, texs, detects,
                               cutoff_coverage, notex):
        """Attach per-strain working data for intergenic sRNA detection."""
        for name, value in (("tsss", tsss), ("pros", pros),
                            ("wigs_f", wigs_f), ("wigs_r", wigs_r),
                            ("nums", nums), ("output", output),
                            ("out_table", out_table), ("texs", texs),
                            ("detects", detects),
                            ("cutoff_coverage", cutoff_coverage),
                            ("notex", notex)):
            setattr(args_srna, name, value)
        return args_srna

    def extend_utr_container(self, args_srna, cdss, tsss, pros, wig_fs, wig_rs,
                             out, out_t, texs):
        """Attach per-strain working data for UTR-derived sRNA detection.

        Also resets the ``utrs``/``srnas`` accumulator lists.
        """
        for name, value in (("cdss", cdss), ("tsss", tsss), ("pros", pros),
                            ("wig_fs", wig_fs), ("wig_rs", wig_rs),
                            ("out", out), ("out_t", out_t), ("texs", texs)):
            setattr(args_srna, name, value)
        args_srna.utrs = []
        args_srna.srnas = []
        return args_srna

    def container_sorf(self, sorf_folder, UTR_derived_sORF, transcript_folder,
                       annotation_folder, TSS_folder, utr_length, min_length,
                       max_length, tex_wig_folder, frag_wig_folder,
                       cutoff_intergenic_coverage, cutoff_antisense_coverage,
                       cutoff_5utr_coverage, cutoff_3utr_coverage,
                       cutoff_interCDS_coverage, fasta_folder, tex_notex_libs,
                       frag_libs, tex_notex, replicates_tex, replicates_frag,
                       table_best, sRNA_folder, start_codon, stop_codon,
                       cutoff_background, fuzzy_rbs, rbs_not_after_TSS,
                       print_all_combination, best_no_sRNA, best_no_TSS,
                       ignore_hypothetical_protein, min_rbs_distance,
                       max_rbs_distance):
        """Store all sORF-detection settings and parse the wig inputs.

        Library and codon strings are split into lists, replicate counts
        validated, and the wiggle files are parsed/combined at the end.
        """
        self.out_folder = sorf_folder
        self.utr_detect = UTR_derived_sORF
        self.trans = transcript_folder
        self.gffs = annotation_folder
        self.tsss = TSS_folder
        self.utr_length = utr_length
        self.min_len = min_length
        self.max_len = max_length
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.cutoff_inter = cutoff_intergenic_coverage
        self.cutoff_anti = cutoff_antisense_coverage
        self.cutoff_5utr = cutoff_5utr_coverage
        self.cutoff_3utr = cutoff_3utr_coverage
        self.cutoff_intercds = cutoff_interCDS_coverage
        self.fastas = fasta_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.table_best = table_best
        self.srnas = sRNA_folder
        self.start_codon = self._deal_multi_inputs(start_codon, "str",
                                                   None, None)
        self.stop_codon = self._deal_multi_inputs(stop_codon, "str",
                                                  None, None)
        self.background = cutoff_background
        self.fuzzy_rbs = fuzzy_rbs
        self.noafter_tss = rbs_not_after_TSS
        self.print_all = print_all_combination
        self.no_srna = best_no_sRNA
        self.no_tss = best_no_TSS
        self.hypo = ignore_hypothetical_protein
        self.min_rbs = min_rbs_distance
        self.max_rbs = max_rbs_distance
        # Parse/combine the wig files now that libs and paths are stored.
        self = self._parser_combine_wigs("sorf")
        return self

    def container_srna_target(self, Vienna_folder, annotation_path, fasta_path,
                              sRNA_path, query_sRNA, program,
                              interaction_length, window_size_target,
                              span_target, window_size_srna, span_srna,
                              unstructured_region_RNAplex_target,
                              unstructured_region_RNAplex_srna,
                              unstructured_region_RNAup, energy_threshold,
                              duplex_distance, top, starget_output_folder,
                              process_rnaplex, process_rnaup, continue_rnaup,
                              potential_target_start, potential_target_end,
                              target_feature):
        """Populate this container for the sRNA-target interaction
        subcommand and return it."""
        settings = {
            "vienna_path": Vienna_folder, "gffs": annotation_path,
            "fastas": fasta_path, "srnas": sRNA_path, "program": program,
            "inter_length": interaction_length,
            "win_size_t": window_size_target, "span_t": span_target,
            "win_size_s": window_size_srna, "span_s": span_srna,
            "unstr_region_rnaplex_t": unstructured_region_RNAplex_target,
            "unstr_region_rnaplex_s": unstructured_region_RNAplex_srna,
            "unstr_region_rnaup": unstructured_region_RNAup,
            "energy": energy_threshold, "duplex_dist": duplex_distance,
            "top": top, "out_folder": starget_output_folder,
            "core_plex": process_rnaplex, "core_up": process_rnaup,
            "continue_rnaup": continue_rnaup,
            "tar_start": potential_target_start,
            "tar_end": potential_target_end}
        for attr, value in settings.items():
            setattr(self, attr, value)
        # Multi-value options arrive as comma-separated strings.
        self.query = self._deal_multi_inputs(query_sRNA, "str", None, None)
        self.features = self._deal_multi_inputs(target_feature, "str",
                                                None, None)
        return self

    def container_goterm(self, annotation_path, goterm_output_folder,
                         UniProt_id, go_obo, goslim_obo, transcript_path):
        """Populate this container for the GO-term subcommand and return it."""
        for attr, value in (("gffs", annotation_path),
                            ("out_folder", goterm_output_folder),
                            ("uniprot", UniProt_id),
                            ("go", go_obo),
                            ("goslim", goslim_obo),
                            ("trans", transcript_path)):
            setattr(self, attr, value)
        return self

    def container_sublocal(self, Psortb_path, gff_path, fasta_path,
                           bacteria_type, difference_multi, merge_to_gff,
                           sublocal_output_folder, transcript_path):
        """Populate this container for the subcellular-localization
        subcommand and return it."""
        for attr, value in (("psortb_path", Psortb_path),
                            ("gffs", gff_path),
                            ("fastas", fasta_path),
                            ("gram", bacteria_type),
                            ("fuzzy", difference_multi),
                            ("merge", merge_to_gff),
                            ("out_folder", sublocal_output_folder),
                            ("trans", transcript_path)):
            setattr(self, attr, value)
        return self

    def container_ppi(self, gff_path, proteinID_strains, without_strain_pubmed,
                      species_STRING, score, ppi_output_folder, node_size,
                      query):
        """Populate this container for the protein-protein interaction
        subcommand and return it."""
        plain = {"ptts": gff_path, "no_specific": without_strain_pubmed,
                 "species": species_STRING, "score": score,
                 "out_folder": ppi_output_folder, "size": node_size}
        for attr, value in plain.items():
            setattr(self, attr, value)
        # Multi-value options arrive as comma-separated strings.
        self.strains = self._deal_multi_inputs(proteinID_strains, "str",
                                               None, None)
        self.querys = self._deal_multi_inputs(query, "str", None, None)
        return self

    def container_promoter(self, MEME_path, promoter_output_folder, tex_libs,
                           TSS_folder, fasta_folder, num_motif, nt_before_TSS,
                           motif_width, TSS_source, tex_wig_path,
                           annotation_folder, combine_all, e_value):
        """Populate this container for the promoter (MEME) subcommand
        and return it."""
        plain = {"meme_path": MEME_path,
                 "output_folder": promoter_output_folder,
                 "tsss": TSS_folder, "fastas": fasta_folder,
                 "num_motif": num_motif, "nt_before": nt_before_TSS,
                 "source": TSS_source, "wigs": tex_wig_path,
                 "gffs": annotation_folder, "combine": combine_all,
                 "e_value": e_value}
        for attr, value in plain.items():
            setattr(self, attr, value)
        # Multi-value options arrive as comma-separated strings.
        self.input_libs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.widths = self._deal_multi_inputs(motif_width, "str", None, None)
        return self

    def container_operon(self, TSS_folder, annotation_folder,
                         transcript_folder, UTR5_folder, UTR3_folder,
                         term_folder, TSS_fuzzy, term_fuzzy, min_length,
                         statistics, operon_output_folder, combine_gff,
                         operon_statistics_folder):
        """Populate this container for the operon subcommand and return it."""
        for attr, value in (("tsss", TSS_folder),
                            ("gffs", annotation_folder),
                            ("trans", transcript_folder),
                            ("utr5s", UTR5_folder),
                            ("utr3s", UTR3_folder),
                            ("terms", term_folder),
                            ("tss_fuzzy", TSS_fuzzy),
                            ("term_fuzzy", term_fuzzy),
                            ("length", min_length),
                            ("statistics", statistics),
                            ("output_folder", operon_output_folder),
                            ("combine", combine_gff),
                            ("stat_folder", operon_statistics_folder)):
            setattr(self, attr, value)
        return self

    def container_snp(self, samtools_path, bcftools_path, bam_type, program,
                      fasta_path, tex_bam_path, frag_bam_path, quality,
                      read_depth, snp_output_folder, indel_fraction, chrom):
        """Populate this container for the SNP-calling subcommand
        and return it."""
        plain = {"samtools_path": samtools_path,
                 "bcftools_path": bcftools_path, "types": bam_type,
                 "fastas": fasta_path, "normal_bams": tex_bam_path,
                 "frag_bams": frag_bam_path, "quality": quality,
                 "depth": read_depth, "out_folder": snp_output_folder,
                 "fraction": indel_fraction}
        for attr, value in plain.items():
            setattr(self, attr, value)
        self.program = self._deal_multi_inputs(program, "str", None, None)
        # Symbolic ploidy names are mapped to numbers; any other value
        # is stored unchanged.
        self.chrom = {"haploid": "1", "diploid": "2"}.get(chrom, chrom)
        return self

    def container_circrna(self, align, process, fasta_path, annotation_path,
                          tex_bam_path, fragmented_bam_path, read_folder,
                          circrna_stat_folder, support_reads,
                          segemehl_folder, samtools_path, start_ratio,
                          end_ratio, ignore_hypothetical_protein, out_folder):
        """Populate this container for the circular-RNA subcommand
        and return it."""
        for attr, value in (("align", align),
                            ("cores", process),
                            ("fastas", fasta_path),
                            ("gffs", annotation_path),
                            ("normal_bams", tex_bam_path),
                            ("frag_bams", fragmented_bam_path),
                            ("read_folder", read_folder),
                            ("stat_folder", circrna_stat_folder),
                            ("support", support_reads),
                            ("segemehl_path", segemehl_folder),
                            ("samtools_path", samtools_path),
                            ("start_ratio", start_ratio),
                            ("end_ratio", end_ratio),
                            ("hypo", ignore_hypothetical_protein),
                            ("output_folder", out_folder)):
            setattr(self, attr, value)
        return self

    def container_ribos(self, infernal_path, riboswitch_ID, gff_path,
                        fasta_path, tss_path, transcript_path, Rfam,
                        ribos_output_folder, e_value, output_all,
                        database_folder, fuzzy, start_codon, min_dist_rbs,
                        max_dist_rbs, fuzzy_rbs, UTR_length):
        """Populate this container for the riboswitch subcommand
        and return it."""
        plain = {"infernal_path": infernal_path, "ribos_id": riboswitch_ID,
                 "gffs": gff_path, "fastas": fasta_path, "tsss": tss_path,
                 "trans": transcript_path, "rfam": Rfam,
                 "out_folder": ribos_output_folder, "e_value": e_value,
                 "output_all": output_all, "database": database_folder,
                 "fuzzy": fuzzy, "start_rbs": min_dist_rbs,
                 "end_rbs": max_dist_rbs, "fuzzy_rbs": fuzzy_rbs,
                 "utr": UTR_length}
        for attr, value in plain.items():
            setattr(self, attr, value)
        # Multi-value options arrive as comma-separated strings.
        self.start_codons = self._deal_multi_inputs(start_codon, "str",
                                                    None, None)
        return self

    def container_screen(self, main_gff, side_gffs, fasta, frag_wig_folder,
                         tex_wig_folder, height, tex_libs, frag_libs, present,
                         output_folder):
        """Populate this container for the screenshot subcommand
        and return it."""
        plain = {"main_gff": main_gff, "fasta": fasta,
                 "frag_wigs": frag_wig_folder, "tex_wigs": tex_wig_folder,
                 "height": height, "present": present,
                 "output_folder": output_folder}
        for attr, value in plain.items():
            setattr(self, attr, value)
        # Multi-value options arrive as comma-separated strings.
        self.side_gffs = self._deal_multi_inputs(side_gffs, "str", None, None)
        self.tlibs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        return self
Beispiel #55
0
class Controller(object):

    """Manage the actions of the subcommands.

    The Controller takes care of providing the arguments like path
    names and the parallel processing of tasks.

    """
    def __init__(self, args):
        """Create an instance.

        Stores the parsed command-line arguments and sets up the path
        helper and argument container shared by every subcommand.
        Exits early when --project_path points at a missing folder.
        """
        self._args = args
        # Subcommands with more than three parsed arguments require an
        # existing project path; simpler ones (e.g. project creation) do not.
        if (len(args.__dict__) > 3):
            if not os.path.exists(args.project_path):
                # Fixed grammar in the user-facing message ("exists" -> "exist").
                print("Error: --project_path does not exist!")
                sys.exit()
        self._paths = Paths(args.project_path)
        self.args_container = ArgsContainer()
        self.helper = Helper()

    def check_folder(self, folders, flags):
        """Validate required folder arguments.

        Terminates the program when a folder is unset, missing on disk,
        or exists but contains no files; ``flags`` supplies the option
        name used in the error message.
        """
        for folder, flag in zip(folders, flags):
            if folder is None or not os.path.exists(folder):
                print("Error: {0} is wrong. Please check it!".format(flag))
                sys.exit()
            if not os.listdir(folder):
                print("Error: {0} is a empty folder!".format(flag))
                sys.exit()

    def check_multi_files(self, input_files, flags):
        """Exit with an error when any listed input file does not exist.

        ``input_files`` is a list of file-path lists (one per flag);
        ``None`` groups and a ``None`` top-level value are skipped.
        """
        if input_files is None:
            return
        for files, flag in zip(input_files, flags):
            if files is None:
                continue
            if any(not os.path.exists(path) for path in files):
                print("Error: Some files in {0} do "
                      "not exist!".format(flag))
                sys.exit()

    def check_parameter(self, paras, names):
        """Exit with an error naming the option when any required
        parameter is unset."""
        for para, name in zip(paras, names):
            if para is None:
                print("Error: {0} is wrong. "
                      "Please check it!".format(name))
                sys.exit()

    def check_no_require_folder(self, folders):
        """Validate optional folder arguments.

        ``None`` entries are allowed, but any folder that was assigned
        must exist and be non-empty; otherwise the program exits.
        """
        for folder in folders:
            if folder is None:
                continue
            if not os.path.exists(folder):
                print("Error: There is a wrong folder. "
                      "Please check it!")
                sys.exit()
            if len(os.listdir(folder)) == 0:
                print("Error: There is a empty folder. "
                      "Please check it!")
                sys.exit()

    def check_execute_file(self, exe):
        """Resolve the path of an external executable.

        ``exe`` may be a direct path or a command name reachable via the
        ``PATH`` environment variable. Returns the resolved path, or
        terminates the program when the executable cannot be located.
        """
        detect = False
        if os.path.exists(exe):
            detect = True
            full_exe = os.path.realpath(exe)
        # Search PATH entries; os.pathsep keeps this portable
        # (":" on POSIX, ";" on Windows) instead of a hard-coded ":".
        for folder in os.environ["PATH"].split(os.pathsep):
            if os.path.exists(os.path.join(folder, exe)):
                detect = True
                full_exe = exe
        if not detect:
            if os.path.exists(os.path.realpath(exe)):
                full_exe = os.path.realpath(exe)
            else:
                print("Error: {0} can't be found!".format(exe))
                print("Please assign the correct path!")
                sys.exit()
        return full_exe

    def check_file(self, files, names, require):
        """Validate a list of file paths.

        When ``require`` is true every entry must be set and point at a
        real file; otherwise ``None`` entries are tolerated but any
        assigned path must still exist. Exits on the first violation.
        """
        for file_, name in zip(files, names):
            if file_ is None:
                if require:
                    print("Error: {0} is wrong. "
                          "Please check it!".format(name))
                    sys.exit()
                continue
            if not os.path.isfile(file_):
                print("Error: There is a wrong path of {0}. "
                      "Please check it!".format(name))
                sys.exit()

    def create_project(self, version):
        """Create a new project folder tree and record its version."""
        project_creator.create_root_folder(self._args.project_path)
        project_creator.create_subfolders(self._paths.required_folders("root"))
        project_creator.create_version_file(
            self._paths.version_path, version)
        message = "Created folder \"{0}\" and required subfolders.\n".format(
            self._args.project_path)
        sys.stdout.write(message)

    def get_input(self):
        """Download required files from website.

        Fetches reference annotation (gff/gbk/gbff/ptt/rnt) and fasta
        files into the project's reference folders, optionally
        converting GenBank files to EMBL afterwards. Exits when no FTP
        path was assigned.
        """
        print("Running get input files")
        if self._args.ftp_path is None:
            print("Error: Please assign the path for downloading the data!")
            sys.exit()
            # NOTE: two dead assignments that followed this sys.exit()
            # (unreachable, unused) were removed.
        self.helper.check_make_folder(self._paths.ref_annotation_folder)
        self.helper.check_make_folder(self._paths.ref_fasta_folder)
        # Each file type is requested under both its plain suffix and
        # its gzipped "_genomic.*.gz" naming pattern.
        if self._args.ref_gff is True:
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "gff")
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "_genomic.gff.gz")
        if self._args.ref_fasta is True:
            get_file(self._args.ftp_path, self._paths.ref_fasta_folder,
                     "fna")
            get_file(self._args.ftp_path, self._paths.ref_fasta_folder,
                     "_genomic.fna.gz")
        if self._args.ref_gbk is True:
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "gbk")
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "gbff")
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "_genomic.gbff.gz")
        if self._args.ref_ptt is True:
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "ptt")
        if self._args.ref_rnt is True:
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "rnt")
        if self._args.convert_embl is True:
            annotation_files = os.listdir(self._paths.ref_annotation_folder)
            if len(annotation_files) == 0:
                sys.stdout.write("No gff files!!\n")
            else:
                Converter().convert_gbk2embl(self._paths.ref_annotation_folder)

    def get_target_fasta(self):
        """Get target fasta.

        Applies the mutation table to the related (reference) fasta
        files and writes the updated target genome sequences.
        """
        print("Running update genome fasta")
        self.check_multi_files([self._args.related_fasta_files],
                               ["--related_fasta_files"])
        # check_file indexes into its "names" argument, so the flag must
        # be passed as a list; a bare string would render "-" in errors.
        self.check_file([self._args.mutation_table], ["--mutation_table"],
                        True)
        project_creator.create_subfolders(
            self._paths.required_folders("get_target_fasta"))
        target = TargetFasta(self._paths.tar_fasta_folder,
                             self._args.related_fasta_files)
        target.get_target_fasta(
                self._args.mutation_table, self._paths.tar_fasta_folder,
                self._args.related_fasta_files, self._args.combine_to_one_fasta,
                self._paths.target_base_folder)

    def ratt(self):
        """Run RATT to transfer annotation file from reference to target.

        Validates the transfer type, ensures exactly one of embl/genbank
        input is given, checks the fasta inputs, then builds the
        argument container and starts the transfer.
        """
        print("Running annotation transfer")
        # RATT accepts only this fixed set of transfer types; a set
        # membership test replaces the original 8-way chained "!=".
        valid_types = {"Strain", "Assembly", "Species",
                       "Assembly.Repetitive", "Strain.Repetitive",
                       "Species.Repetitive", "Multiple", "Free"}
        if self._args.transfer_type not in valid_types:
            print("Error: please assign correct --transfer_type!")
            sys.exit()
        # Exactly one of embl or genbank input must be provided.
        if (self._args.related_embl_files is None) and (
                self._args.related_gbk_files is None):
            print("Error: please assign proper embl or genbank files")
            sys.exit()
        elif (self._args.related_embl_files is not None) and (
                self._args.related_gbk_files is not None):
            print("Error: please choose embl as input or genbank as input")
            sys.exit()
        self._args.ratt_path = self.check_execute_file(self._args.ratt_path)
        # NOTE(review): the label "--closed_fasta_files" describes
        # related_fasta_files; confirm which CLI flag name is correct.
        self.check_multi_files(
                [self._args.target_fasta_files, self._args.related_fasta_files],
                ["--target_fasta_files", "--closed_fasta_files"])
        self.check_parameter([self._args.element, self._args.compare_pair],
                             ["--element", "--compare_pair"])
        project_creator.create_subfolders(
            self._paths.required_folders("get_target_fasta"))
        project_creator.create_subfolders(
            self._paths.required_folders("annotation_transfer"))
        args_ratt = self.args_container.container_ratt(
            self._args.ratt_path, self._args.element, self._args.transfer_type,
            self._args.related_embl_files, self._args.related_gbk_files,
            self._args.target_fasta_files, self._args.related_fasta_files,
            self._paths.ratt_folder, self._args.convert_to_gff_rnt_ptt,
            self._paths.tar_annotation_folder, self._args.compare_pair)
        ratt = RATT(args_ratt)
        ratt.annotation_transfer(args_ratt)

    def tsspredator(self):
        """Run TSSpredator for predicting TSS candidates.

        Validates the input file groups and required library/condition
        parameters, resolves the TSSpredator executable, prepares the
        output folder for either TSS or processing-site mode, then
        builds the argument container and launches the prediction.
        """
        self.check_multi_files(
                [self._args.fasta_files, self._args.annotation_files,
                 self._args.compare_overlap_gff, self._args.manual_files,
                 self._args.compare_transcript_files],
                ["--fasta_files", "--annotation_files", "--compare_overlap_gff",
                 "--manual_files","--compare_transcript_files"])
        self.check_parameter([self._args.tex_notex_libs, self._args.condition_names],
                             ["--tex_notex_libs", "--condition_names"])
        self._args.tsspredator_path = self.check_execute_file(
                self._args.tsspredator_path)
        # "tss" and "ps" modes share the same engine; only the output
        # folder and the created subfolder tree differ.
        if self._args.program.lower() == "tss":
            print("Running TSS prediction")
            project_creator.create_subfolders(
                self._paths.required_folders("TSS"))
            out_folder = self._paths.tsspredator_folder
        elif self._args.program.lower() == "ps":
            print("Running processing site prediction")
            out_folder = self._paths.processing_site_folder
            project_creator.create_subfolders(
                self._paths.required_folders("processing"))
        else:
            print("Error: No such program!")
            sys.exit()
        # Argument order must match container_tsspredator exactly.
        args_tss = self.args_container.container_tsspredator(
            self._args.tsspredator_path, self._args.program,
            self._args.fasta_files, self._args.annotation_files,
            self._args.tex_notex_libs, self._args.condition_names,
            self._args.height, self._args.height_reduction,
            self._args.factor, self._args.factor_reduction,
            self._args.base_height, self._args.enrichment_factor,
            self._args.processing_factor, self._args.replicate_tex,
            out_folder, self._args.validate_gene,
            self._args.manual_files, self._args.curated_sequence_length,
            self._args.compare_transcript_files, self._args.tolerance,
            self._args.utr_length, self._args.cluster,
            self._args.re_check_orphan,
            self._args.remove_overlap_feature, self._args.compare_overlap_gff,
            self._args.remove_low_expression)
        tsspredator = TSSpredator(args_tss)
        tsspredator.run_tsspredator(args_tss)

    def optimize(self):
        """Optimize the TSSpredator parameter set.

        Validates inputs and the executable, prepares the output folder
        for TSS or processing-site mode, then runs the parameter
        optimization (optimize_tss).
        """
        self.check_multi_files(
                [self._args.fasta_files, self._args.annotation_files,
                 self._args.manual_files],
                ["--fasta_files", "--annotation_files", "--manual_files"])
        self._args.tsspredator_path = self.check_execute_file(
                self._args.tsspredator_path)
        self.check_parameter([self._args.tex_notex_libs,
                              self._args.condition_names],
                             ["--tex_notex_lib",
                              "--condition_names"])
        # Same mode switch as tsspredator(): output location depends on
        # whether TSS or processing sites are optimized.
        if self._args.program.lower() == "tss":
            print("Running optimization of TSS prediction")
            project_creator.create_subfolders(
                self._paths.required_folders("TSS"))
            out_folder = self._paths.tsspredator_folder
        elif self._args.program.lower() == "ps":
            print("Running optimization of processing site prediction")
            out_folder = self._paths.processing_site_folder
            project_creator.create_subfolders(
                self._paths.required_folders("processing"))
        else:
            print("Error: No such program!")
            sys.exit()
        # Argument order must match container_optimize exactly.
        args_ops = self.args_container.container_optimize(
            self._args.tsspredator_path, self._args.fasta_files,
            self._args.annotation_files,
            self._args.manual_files, out_folder, self._args.max_height,
            self._args.max_height_reduction, self._args.max_factor,
            self._args.max_factor_reduction, self._args.max_base_height,
            self._args.max_enrichment_factor, self._args.max_processing_factor,
            self._args.utr_length, self._args.tex_notex_libs,
            self._args.condition_names, self._args.cluster,
            self._args.curated_sequence_length, self._args.parallels,
            self._args.program, self._args.replicate_tex,
            self._args.steps)
        optimize_tss(args_ops)

    def color(self):
        """Color the screenshot png files.

        Validates the track number and screenshot folder, resolves the
        ImageMagick convert executable, then recolors the tracks.
        """
        print("Running png files coloring")
        # Fixed typo in the flag label ("--track_numer" -> "--track_number")
        # so error messages name the real option.
        self.check_parameter([self._args.track_number], ["--track_number"])
        self.check_folder([self._args.screenshot_folder], ["--screenshot_folder"])
        self._args.imagemagick_covert_path = self.check_execute_file(
                self._args.imagemagick_covert_path)
        color = ColorPNG()
        color.generate_color_png(
                self._args.track_number, self._args.screenshot_folder,
                self._args.imagemagick_covert_path)

    def terminator(self):
        """Run TransTermHP and gene-convergence detection of terminators.

        Validates inputs and the TransTermHP/RNAfold executables, then
        builds the argument container and starts terminator prediction.
        """
        print("Running terminator prediction")
        if self._args.transterm_path is None:
            print("Please assign the path of transterm in TransTermHP.")
            # Without the executable the run cannot proceed; exiting here
            # avoids a TypeError from check_execute_file(None) below.
            sys.exit()
        self.check_multi_files(
                [self._args.fasta_files, self._args.annotation_files,
                 self._args.transcript_files, self._args.srna_files],
                ["--fasta_files", "--annotation_files",
                 "--transcript_files", "--srna_files"])
        for prop in ("transterm_path", "expterm_path", "rnafold_path"):
            setattr(self._args, prop,
                    self.check_execute_file(getattr(self._args, prop)))
        project_creator.create_subfolders(
            self._paths.required_folders("terminator"))
        # Argument order must match container_terminator exactly.
        args_term = self.args_container.container_terminator(
            self._args.transterm_path, self._args.expterm_path,
            self._args.rnafold_path,
            self._paths.transterm_folder, self._args.fasta_files,
            self._args.annotation_files, self._args.transcript_files,
            self._args.srna_files, self._args.decrease,
            self._args.highest_coverage, self._args.tolerance_detect_coverage,
            self._args.tolerance_within_transcript,
            self._args.tolerance_downstream_transcript,
            self._args.tolerance_within_gene,
            self._args.tolerance_downstream_gene, self._paths.transtermhp_folder,
            self._args.tex_notex_libs, self._args.frag_libs,
            self._args.tex_notex, self._args.replicate_tex,
            self._args.replicate_frag, self._args.table_best,
            self._args.min_loop_length, self._args.max_loop_length,
            self._args.min_stem_length, self._args.max_stem_length,
            self._args.min_u_tail, self._args.miss_rate,
            self._args.mutation_u_tail, self._args.keep_multi_term,
            self._args.window_size, self._args.window_shift)
        terminator = Terminator(args_term)
        terminator.run_terminator(args_term)

    def transcript(self):
        """Run Transcript detection.

        Checks the annotation, TSS and terminator inputs, creates the
        transcript output subfolders, then builds the argument container
        and starts transcript detection.
        """
        print("Running transcript detection")
        self.check_multi_files(
                [self._args.annotation_files, self._args.tss_files,
                 self._args.terminator_files],
                ["--annotation_files", "--tss_files", "--terminator_files"])
        project_creator.create_subfolders(
            self._paths.required_folders("transcript"))
        # Argument order must match container_transcript exactly.
        args_tran = self.args_container.container_transcript(
            self._args.tex_notex, self._args.modify_transcript,
            self._args.length, self._args.annotation_files,
            self._args.height, self._args.width,
            self._args.tolerance, self._args.tolerance_coverage,
            self._args.replicate_tex, self._args.replicate_frag,
            self._paths.transcript_output_folder,
            self._args.tss_files, self._args.tss_tolerance,
            self._args.tex_notex_libs, self._args.frag_libs,
            self._args.compare_feature_genome, self._args.table_best,
            self._args.terminator_files, self._args.terminator_tolerance,
            self._args.max_length_distribution)
        transcript = TranscriptDetection(args_tran)
        transcript.run_transcript(args_tran)

    def utr_detection(self):
        """Run UTR detection.

        Checks the annotation, terminator, transcript and TSS inputs,
        creates the UTR output subfolders, then builds the argument
        container and starts 5'/3' UTR detection.
        """
        print("Running UTR detection")
        self.check_multi_files(
            [self._args.annotation_files, self._args.terminator_files,
             self._args.transcript_files, self._args.tss_files],
            ["--annotation_files", "--terminator_files",
             "--transcript_files", "--tss_files"])
        project_creator.create_subfolders(self._paths.required_folders("utr"))
        # Argument order must match container_utr exactly.
        args_utr = self.args_container.container_utr(
                self._args.tss_files, self._args.annotation_files,
                self._args.transcript_files, self._args.terminator_files,
                self._args.terminator_tolerance, self._paths.utr_folder,
                self._args.tss_source, self._args.base_5utr,
                self._args.utr_length, self._args.base_3utr,
                self._args.tolerance_3utr, self._args.tolerance_5utr)
        utr = UTRDetection(args_utr)
        utr.run_utr_detection(args_utr)

    def _check_filter_input(self, files, info, filters):
        if files is None:
            print("Error: The {0} has to be provided "
                  "if \"{1}\" in --filter_info!".format(info, filters))
            sys.exit()

    def _check_database(self, database, flag, info):
        wrong = False
        if database is None:
            wrong = True
        elif not os.path.isfile(database):
            if (os.path.isfile(database + ".fa")) or (
                    os.path.isfile(database + ".fna")) or (
                    os.path.isfile(database + ".fasta")):
                wrong = False
            else:
                wrong = True
        if wrong:
            print("Error: {0} is required if {1} is in --filter_info. "
                  "But the assignment of {0} is empty or wrong. "
                  "Please check the {0} or remove {1} from "
                  "--filter_info!".format(flag, info))
            sys.exit()

    def srna_detection(self):
        """Validate all inputs for sRNA prediction and run sRNADetection.

        Every keyword in --filter_info pulls in extra requirements
        (executables, databases or feature files); each one is checked
        here before the argument container is built and the pipeline
        is started.
        """
        print("Running sRNA prediction")
        self.check_multi_files(
                [self._args.annotation_files, self._args.transcript_files,
                 self._args.fasta_files, self._args.sorf_files,
                 self._args.terminator_files, self._args.promoter_tables,
                 self._args.processing_site_files],
                ["--annotation_files", "--transcript_files",
                 "--fasta_files", "--sorf_files", "--terminator_files",
                 "--promoter_tables", "--processing_site_files"])
        # Each filter keyword requires its own supporting inputs.
        for info in self._args.filter_info:
            if "sec_str" == info:
                # Secondary-structure filtering needs the fasta files and
                # the Vienna RNA executables.
                if not self._args.compute_sec_structures:
                    print("Error: --compute_sec_structures is not switch on, "
                          "but sec_str is still in --filter_info.")
                    sys.exit()
                self._check_filter_input(
                        self._args.fasta_files, "fasta file", "sec_str")
                for prop in ("rnafold_path", "relplot_path",
                             "mountain_path"):
                    setattr(self._args, prop,
                            self.check_execute_file(getattr(self._args, prop)))
            elif ("blast_nr" == info) or (
                    "blast_srna"== info):
                # BLAST-based filters need the BLAST+ executables plus the
                # matching database.
                for prop in ("blastn_path", "blastx_path", "makeblastdb_path"):
                    setattr(self._args, prop,
                            self.check_execute_file(getattr(self._args, prop)))
                if ("blast_nr" == info):
                    self._check_database(self._args.nr_database_path,
                                         "--nr_database_path", "blast_nr")
                if ("blast_srna" == info):
                    self._check_database(self._args.srna_database_path,
                                         "--srna_database_path", "blast_srna")
            elif "sorf" == info:
                self._check_filter_input(
                        self._args.sorf_files, "sORF", "sorf")
            elif "term" == info:
                self._check_filter_input(self._args.terminator_files,
                                         "terminator", "term")
            elif "promoter" == info:
                self._check_filter_input(self._args.promoter_tables,
                                         "Promoter", "promoter")
            elif "tss" == info:
                self._check_filter_input(self._args.tss_files,
                                         "TSS", "tss")
            else:
                # Besides the keywords above, only "none" (any case) is a
                # valid filter entry.
                if "none" != info.lower():
                    print("Error: Please check the --filter_info, "
                          "invalid value was assigned!")
                    sys.exit()
        # UTR-derived sRNA detection relies on TSS information.
        if self._args.utr_derived_srna:
            if self._args.tss_files is None:
                print("Error: The TSS has to be provided "
                      "if you want to compute UTR-derived sRNA!")
                sys.exit()
        # Searching for a poly-U tail needs the genome sequences.
        if self._args.search_poly_u != 0:
            if self._args.fasta_files is None:
                print("Error: The fasta files have to be provided "
                      "if you want to extend 3'end of sRNA by "
                      "searching poly U tail!")
                sys.exit()
        project_creator.create_subfolders(self._paths.required_folders("srna"))
        # NOTE: container_srna takes its arguments positionally; keep this
        # order in sync with args_container.container_srna.
        args_srna = self.args_container.container_srna(
                self._args.rnafold_path, self._args.relplot_path,
                self._args.mountain_path, self._args.blastn_path,
                self._args.blastx_path, self._args.makeblastdb_path,
                self._paths.srna_folder, self._args.utr_derived_srna,
                self._args.annotation_files, self._args.tss_files,
                self._args.transcript_files,
                self._args.tss_intergenic_antisense_tolerance,
                self._args.tss_5utr_tolerance, self._args.tss_3utr_tolerance,
                self._args.tss_intercds_tolerance, self._args.filter_info,
                self._args.processing_site_files, self._args.fasta_files,
                self._args.mountain_plot, self._args.nr_format,
                self._args.srna_format, self._args.srna_database_path,
                self._args.nr_database_path, self._args.cutoff_energy,
                self._args.parallel_blast,
                self._args.min_intergenic_tex_coverage,
                self._args.min_intergenic_notex_coverage,
                self._args.min_intergenic_fragmented_coverage,
                self._args.min_complete_5utr_transcript_coverage,
                self._args.min_antisense_tex_coverage,
                self._args.min_antisense_notex_coverage,
                self._args.min_antisense_fragmented_coverage,
                self._args.min_utr_tex_coverage,
                self._args.min_utr_notex_coverage,
                self._args.min_utr_fragmented_coverage,
                self._args.max_length, self._args.min_length,
                self._args.tex_notex_libs, self._args.frag_libs,
                self._args.replicate_tex, self._args.replicate_frag,
                self._args.tex_notex, self._args.blast_e_nr,
                self._args.blast_e_srna, self._args.detect_srna_in_cds,
                self._args.table_best, self._args.decrease_intergenic_antisense,
                self._args.decrease_utr, self._args.tolerance_intergenic_antisense,
                self._args.tolerance_utr, self._args.cutoff_nr_hit,
                self._args.sorf_files, self._args.overlap_percent_cds,
                self._args.terminator_files,
                self._args.terminator_tolerance_in_srna,
                self._args.terminator_tolerance_out_srna,
                self._args.ignore_hypothetical_protein, self._args.tss_source,
                self._args.min_all_utr_coverage, self._args.promoter_tables,
                self._args.ranking_time_promoter, self._args.promoter_names,
                self._args.compute_sec_structures, self._args.search_poly_u,
                self._args.min_u_poly_u, self._args.mutation_poly_u)
        srna = sRNADetection(args_srna)
        srna.run_srna_detection(args_srna)

    def sorf_detection(self):
        """Validate the inputs for sORF prediction and run sORFDetection.

        Checks the multi-file inputs, creates the output sub-folders and
        launches the sORF detection pipeline.
        """
        print("Running sORF prediction")
        self.check_multi_files(
                [self._args.transcript_files, self._args.annotation_files,
                 self._args.fasta_files, self._args.srna_files,
                 self._args.tss_files],
                ["--transcript_files", "--annotation_files",
                 "--fasta_files", "--srna_files", "--tss_files"])
        project_creator.create_subfolders(
            self._paths.required_folders("sorf"))
        # NOTE: container_sorf takes its arguments positionally; keep this
        # order in sync with args_container.container_sorf.
        args_sorf = self.args_container.container_sorf(
            self._paths.sorf_folder, self._args.utr_derived_sorf,
            self._args.transcript_files,
            self._args.annotation_files,
            self._args.tss_files, self._args.utr_length,
            self._args.min_length, self._args.max_length,
            self._args.cutoff_intergenic_coverage,
            self._args.cutoff_antisense_coverage,
            self._args.cutoff_5utr_coverage,
            self._args.cutoff_3utr_coverage,
            self._args.cutoff_intercds_coverage,
            self._args.fasta_files, self._args.tex_notex_libs,
            self._args.frag_libs, self._args.tex_notex,
            self._args.replicate_tex, self._args.replicate_frag,
            self._args.table_best, self._args.srna_files,
            self._args.start_codon, self._args.stop_codon,
            self._args.cutoff_base_coverage, self._args.tolerance_rbs,
            self._args.rbs_not_after_tss, self._args.print_all_combination,
            self._args.best_no_srna, self._args.best_no_tss,
            self._args.ignore_hypothetical_protein,
            self._args.min_rbs_distance, self._args.max_rbs_distance,
            self._args.tolerance_3end, self._args.tolerance_5end)
        sorf = sORFDetection(args_sorf)
        sorf.run_sorf_detection(args_sorf)

    def meme(self):
        """Validate inputs and run promoter motif detection (MEME/GLAM2).

        Verifies the TSS/fasta inputs, checks the motif-finder executables
        required by the chosen --program, then builds the argument
        container and launches the promoter detection pipeline.
        """
        print("Running promoter detection")
        self.check_multi_files(
                [self._args.tss_files, self._args.fasta_files],
                ["--tss_files", "--fasta_files"])
        if not self._args.tss_source:
            self.check_multi_files([self._args.annotation_files],
                                   ["--annotation_files"])
        # Bug fix: this used to be "if ... elif ...", so when --program was
        # "both" the first branch matched and --glam2_path was never
        # validated. Two independent checks ensure "both" verifies both
        # executables.
        if (self._args.program == "both") or (
                self._args.program == "meme"):
            self._args.meme_path = self.check_execute_file(self._args.meme_path)
        if (self._args.program == "both") or (
                self._args.program == "glam2"):
            self._args.glam2_path = self.check_execute_file(self._args.glam2_path)
        project_creator.create_subfolders(
            self._paths.required_folders("promoter"))
        # NOTE: container_promoter takes its arguments positionally; keep
        # this order in sync with args_container.container_promoter.
        args_pro = self.args_container.container_promoter(
            self._args.meme_path, self._args.glam2_path,
            self._paths.promoter_output_folder, self._args.tex_libs,
            self._args.tss_files, self._args.fasta_files,
            self._args.num_motifs, self._args.nt_before_tss,
            self._args.motif_width, self._args.tss_source,
            self._args.annotation_files, self._args.end_run,
            self._args.combine_all, self._args.e_value,
            self._args.parallels, self._args.program)
        meme = MEME(args_pro)
        meme.run_meme(args_pro)

    def operon(self):
        """Validate the inputs for operon detection and run OperonDetection.

        Checks the feature-file inputs, creates the output sub-folders and
        launches the operon detection pipeline.
        """
        print("Running operon detection")
        self.check_multi_files(
                [self._args.tss_files, self._args.annotation_files,
                 self._args.transcript_files, self._args.utr5_files,
                 self._args.utr3_files, self._args.terminator_files],
                ["--tss_files", "--annotation_files",
                 "--transcript_files", "--utr5_files",
                 "--utr3_files", "--terminator_files"])
        project_creator.create_subfolders(
            self._paths.required_folders("operon"))
        # NOTE: container_operon takes its arguments positionally; keep
        # this order in sync with args_container.container_operon.
        args_op = self.args_container.container_operon(
            self._args.tss_files, self._args.annotation_files,
            self._args.transcript_files, self._args.utr5_files,
            self._args.utr3_files, self._args.terminator_files,
            self._args.tss_tolerance, self._args.terminator_tolerance,
            self._args.min_length, self._paths.operon_output_folder,
            self._paths.operon_statistics_folder)
        operon = OperonDetection(args_op)
        operon.run_operon(args_op)

    def circrna(self):
        """Validate the inputs for circRNA prediction and run CircRNADetection.

        Verifies the mapping/analysis executables (segemehl is only needed
        when raw reads are given), checks the genome and annotation files,
        then launches the circular RNA detection pipeline.
        """
        print("Running circular RNA prediction")
        # segemehl is only required when mapping has to be done from reads.
        if self._args.read_files:
            self._args.segemehl_path = self.check_execute_file(
                    self._args.segemehl_path)
        for prop in ("testrealign_path", "samtools_path"):
            setattr(self._args, prop,
                    self.check_execute_file(getattr(self._args, prop)))
        self.check_multi_files(
                [self._args.fasta_files, self._args.annotation_files],
                ["--fasta_files", "--annotation_files"])
        project_creator.create_subfolders(
            self._paths.required_folders("circrna"))
        # NOTE: container_circrna takes its arguments positionally; keep
        # this order in sync with args_container.container_circrna.
        args_circ = self.args_container.container_circrna(
            self._args.parallels, self._args.fasta_files,
            self._args.annotation_files, self._args.bam_files,
            self._args.read_files, self._paths.circrna_stat_folder,
            self._args.support_reads, self._args.segemehl_path,
            self._args.testrealign_path, self._args.samtools_path,
            self._args.start_ratio, self._args.end_ratio,
            self._args.ignore_hypothetical_protein,
            self._paths.circrna_output_folder)
        circ = CircRNADetection(args_circ)
        circ.run_circrna(args_circ)

    def goterm(self):
        """Map annotated genes to GO terms and produce the GO-term output."""
        print("Running GO term mapping")
        # Multi-file inputs.
        self.check_multi_files(
                [self._args.annotation_files, self._args.transcript_files],
                ["--annotation_files", "--transcript_files"])
        # Single-file inputs; all of them must exist.
        self.check_file([self._args.uniprot_id, self._args.go_obo,
                         self._args.goslim_obo],
                        ["--uniprot_id", "--go.obo", "--goslim_obo"], True)
        project_creator.create_subfolders(
            self._paths.required_folders("go_term"))
        go_args = self.args_container.container_goterm(
            self._args.annotation_files,
            self._paths.goterm_output_folder, self._args.uniprot_id,
            self._args.go_obo, self._args.goslim_obo,
            self._args.transcript_files)
        finder = GoTermFinding(go_args)
        finder.run_go_term(go_args)

    def srna_target(self):
        """Validate inputs and run sRNA-target interaction prediction.

        Checks the sequence/feature inputs, verifies the executables of
        every tool selected in --program (RNAup, RNAplex, IntaRNA), then
        launches the target prediction pipeline.
        """
        print("Running sRNA target prediction")
        self.check_multi_files(
                [self._args.fasta_files, self._args.srna_files,
                 self._args.annotation_files],
                ["--fasta_files", "--srna_files",
                 "--annotation_files"])
        # Only the executables of the selected programs are verified.
        if "RNAup" in self._args.program:
            self._args.rnaup_path = self.check_execute_file(
                    self._args.rnaup_path)
        if "RNAplex" in self._args.program:
            for prop in ("rnaplfold_path", "rnaplex_path"):
                setattr(self._args, prop,
                        self.check_execute_file(getattr(self._args, prop)))
        if "IntaRNA" in self._args.program:
            self._args.intarna_path = self.check_execute_file(
                    self._args.intarna_path)
            # IntaRNA additionally needs an explicit prediction mode.
            if self._args.mode_intarna is None:
                print("Error: --mode_IntaRNA need to be assigned!")
                sys.exit()
        project_creator.create_subfolders(
            self._paths.required_folders("srna_target"))
        # NOTE: container_srna_target takes its arguments positionally;
        # keep this order in sync with args_container.container_srna_target.
        args_tar = self.args_container.container_srna_target(
            self._args.rnaplfold_path, self._args.rnaplex_path,
            self._args.rnaup_path, self._args.intarna_path,
            self._args.annotation_files,
            self._args.fasta_files, self._args.srna_files,
            self._args.query_srnas, self._args.program,
            self._args.interaction_length,
            self._args.window_size_target_rnaplex,
            self._args.span_target_rnaplex,
            self._args.window_size_srna_rnaplfold,
            self._args.span_srna_rnaplfold,
            self._args.unstructured_region_rnaplex_target,
            self._args.unstructured_region_rnaplex_srna,
            self._args.unstructured_region_rnaup,
            self._args.energy_threshold_rnaplex,
            self._args.duplex_distance_rnaplex, self._args.top,
            self._paths.starget_output_folder, self._args.parallels_rnaplex,
            self._args.parallels_rnaup, self._args.parallels_intarna,
            self._args.continue_rnaup,
            self._args.slide_window_size_srna_intarna,
            self._args.max_loop_length_srna_intarna,
            self._args.slide_window_size_target_intarna,
            self._args.max_loop_length_target_intarna,
            self._args.mode_intarna, self._args.potential_target_start,
            self._args.potential_target_end, self._args.target_feature)
        srnatarget = sRNATargetPrediction(args_tar)
        srnatarget.run_srna_target_prediction(args_tar)

    def snp(self):
        """Validate inputs and run SNP/mutation calling.

        Checks the genome fasta files and the categorical options
        (--bam_type, --ploidy, --caller), verifies the samtools/bcftools
        executables, then builds the argument container and starts the
        SNP calling pipeline.
        """
        print("Running SNP/mutations calling")
        self.check_multi_files(
                [self._args.fasta_files],
                ["--fasta_files"])
        if (self._args.bam_type != "related_genome") and (
                self._args.bam_type != "reference_genome"):
            print("Error: Please assign \"related_genome\" or"
                  " \"reference_genome\" to --bam_type!")
            sys.exit()
        if (self._args.ploidy != "haploid") and (
                self._args.ploidy != "diploid"):
            print("Error: Please assign \"haploid\" or"
                  " \"diploid\" to --chromosome_type!")
            # Bug fix: execution previously continued with an invalid
            # --ploidy value; abort like the --bam_type check above.
            sys.exit()
        if (self._args.caller != "c") and (
                self._args.caller != "m"):
            print("Error: Please assign \"c\" or"
                  " \"m\" to --caller!")
            # Bug fix: execution previously continued with an invalid
            # --caller value; abort like the --bam_type check above.
            sys.exit()
        for prop in ("bcftools_path", "samtools_path"):
            setattr(self._args, prop,
                    self.check_execute_file(getattr(self._args, prop)))
        project_creator.create_subfolders(self._paths.required_folders("snp"))
        # NOTE: container_snp takes its arguments positionally; keep this
        # order in sync with args_container.container_snp.
        args_snp = self.args_container.container_snp(
            self._args.samtools_path, self._args.bcftools_path,
            self._args.bam_type,
            self._args.program, self._args.fasta_files,
            self._args.bam_files,
            self._args.quality, self._args.read_depth_range,
            self._paths.snp_output_folder, self._args.indel_fraction,
            self._args.ploidy, self._args.rg_tag, self._args.caller,
            self._args.filter_tag_info, self._args.dp4_cutoff)
        snp = SNPCalling(args_snp)
        snp.run_snp_calling(args_snp)

    def ppi(self):
        """Retrieve protein-protein interaction networks for the query strains."""
        print("Running protein-protein interaction networks prediction")
        # Annotation gff files are the only multi-file input here.
        self.check_multi_files([self._args.annotation_files],
                               ["--annotation_files"])
        # Both the strains of interest and the species string are mandatory.
        self.check_parameter([self._args.query_strains,
                              self._args.species_string],
                             ["--query_strains", "--species_string"])
        project_creator.create_subfolders(
            self._paths.required_folders("ppi_network"))
        ppi_args = self.args_container.container_ppi(
            self._args.annotation_files, self._args.query_strains,
            self._args.without_strain_pubmed, self._args.species_string,
            self._args.score, self._paths.ppi_output_folder,
            self._args.node_size, self._args.query)
        network = PPINetwork(self._paths.ppi_output_folder)
        network.retrieve_ppi_network(ppi_args)

    def sublocal(self):
        """Validate inputs and run subcellular localization prediction.

        Checks the annotation/fasta/transcript inputs and the Gram-stain
        type, verifies the PSORTb executable, then launches the
        localization prediction pipeline.
        """
        print("Running subcellular localization prediction")
        self.check_multi_files(
                [self._args.annotation_files, self._args.fasta_files,
                 self._args.transcript_files],
                ["--annotation_files", "--fasta_files",
                 "--transcript_files"])
        # PSORTb requires the Gram-stain type of the organism.
        if (self._args.bacteria_type != "positive") and (
                self._args.bacteria_type != "negative"):
            print("Error: Please assign \"positive\" or"
                  " \"negative\" to --bacteria_type!")
            sys.exit()
        self._args.psortb_path = self.check_execute_file(self._args.psortb_path)
        project_creator.create_subfolders(
            self._paths.required_folders("subcellular_localization"))
        # NOTE: container_sublocal takes its arguments positionally; keep
        # this order in sync with args_container.container_sublocal.
        args_sub = self.args_container.container_sublocal(
            self._args.psortb_path, self._args.annotation_files,
            self._args.fasta_files, self._args.bacteria_type,
            self._args.difference_multi,
            self._paths.sublocal_output_folder, self._args.transcript_files)
        sublocal = SubLocal(args_sub)
        sublocal.run_sub_local(args_sub)

    def ribos(self):
        """Validate inputs and run riboswitch / RNA thermometer prediction.

        Depending on --program ("riboswitch", "thermometer" or "both"),
        the corresponding ID files are checked, the output folders are
        created and the respective output paths are selected (None when a
        feature type is not requested). Infernal's cmscan/cmpress are
        verified before the pipeline is launched.
        """
        print("Running riboswitch and RNA thermometer prediction")
        self.check_multi_files(
                [self._args.annotation_files, self._args.fasta_files,
                 self._args.tss_files, self._args.transcript_files],
                ["--annotation_files", "--fasta_files", "--tss_files",
                 "--transcript_files"])
        if (self._args.program == "both"):
            # Both feature types: need both ID files and both output trees.
            self.check_file([self._args.riboswitch_id_file, self._args.rfam_path],
                            ["--riboswitch_id_file", "--rfam_path"], True)
            self.check_file([self._args.rna_thermometer_id_file,
                             self._args.rfam_path],
                            ["--rna_thermometer_id_file", "--rfam_path"], True)
            project_creator.create_subfolders(
                    self._paths.required_folders("riboswitch"))
            project_creator.create_subfolders(
                    self._paths.required_folders("thermometer"))
            ribos_path = self._paths.ribos_output_folder
            thermo_path = self._paths.thermo_output_folder
        elif (self._args.program == "thermometer"):
            self.check_file([self._args.rna_thermometer_id_file,
                             self._args.rfam_path],
                            ["--thermometer_id_file", "--rfam_path"], True)
            project_creator.create_subfolders(
                    self._paths.required_folders("thermometer"))
            # No riboswitch output is produced in this mode.
            ribos_path = None
            thermo_path = self._paths.thermo_output_folder
        elif (self._args.program == "riboswitch"):
            self.check_file([self._args.riboswitch_id_file, self._args.rfam_path],
                            ["--riboswitch_id_file", "--rfam_path"], True)
            project_creator.create_subfolders(
                    self._paths.required_folders("riboswitch"))
            ribos_path = self._paths.ribos_output_folder
            # No thermometer output is produced in this mode.
            thermo_path = None
        else:
            print("Error: Please assign \"thermometer\", \"riboswitch\" "
                  "or \"both\" in --program!")
            sys.exit()
        # Infernal executables used for the Rfam scan.
        self._args.cmscan_path = self.check_execute_file(self._args.cmscan_path)
        self._args.cmpress_path = self.check_execute_file(self._args.cmpress_path)
        # NOTE: container_ribos takes its arguments positionally; keep
        # this order in sync with args_container.container_ribos.
        args_ribo = self.args_container.container_ribos(
            self._args.program, self._args.rna_thermometer_id_file,
            self._args.cmscan_path, self._args.cmpress_path,
            self._args.riboswitch_id_file,
            self._args.annotation_files, self._args.fasta_files,
            self._args.tss_files, self._args.transcript_files,
            self._args.rfam_path, ribos_path,
            thermo_path, self._args.e_value,
            self._args.output_all, self._paths.database_folder,
            self._args.tolerance,
            self._args.tolerance_rbs, self._args.utr_length)
        ribos = Ribos(args_ribo)
        ribos.run_ribos(args_ribo)

    def crispr(self):
        """Validate inputs and run CRISPR detection (CRT).

        Checks the genome and annotation inputs, verifies the CRT
        executable and launches the CRISPR prediction pipeline.
        """
        print("Running CRISPR prediction")
        self.check_multi_files(
                [self._args.fasta_files, self._args.annotation_files],
                ["--fasta_files", "--annotation_files"])
        self._args.crt_path = self.check_execute_file(self._args.crt_path)
        project_creator.create_subfolders(
            self._paths.required_folders("crispr"))
        # NOTE: container_cris takes its arguments positionally; keep this
        # order in sync with args_container.container_cris.
        args_cris = self.args_container.container_cris(
            self._args.fasta_files, self._args.annotation_files,
            self._args.crt_path, self._args.window_size,
            self._args.min_number_repeats, self._args.min_length_repeat,
            self._args.Max_length_repeat, self._args.min_length_spacer,
            self._args.Max_length_spacer, self._paths.crispr_output_folder,
            self._args.ignore_hypothetical_protein)
        cris = Crispr(args_cris)
        cris.run_crispr(args_cris)

    def merge(self):
        """Merge every predicted feature into one combined gff file."""
        print("Merging all features to one gff file")
        out_dir = os.path.join(self._paths.output_folder,
                               "merge_all_features")
        self.helper.check_make_folder(out_dir)
        extra_features = self._args.other_features_files
        # The transcript gff plus all feature gffs must exist.
        self.check_file([self._args.transcript_file] + extra_features,
                        ["--transcript_file", "--other_features_files"],
                        False)
        self.check_parameter([self._args.output_prefix], ["--output_prefix"])
        run_merge(out_dir, self._args.transcript_file,
                  self._args.other_features_files,
                  self._args.terminator_tolerance, self._args.tss_tolerance,
                  os.path.join(out_dir, self._args.output_prefix))

    def screen(self):
        """Validate inputs and generate IGV screenshots.

        Checks the main gff/fasta inputs, every optional side gff, the
        output folder and the track presentation mode, then launches the
        screenshot generation.
        """
        print("Running screenshot generation")
        self.check_file([self._args.main_gff, self._args.fasta_file],
                        ["--main_gff", "--fasta_file"], True)
        # Every side gff (additional track) must exist.
        if self._args.side_gffs is not None:
            for gff in (self._args.side_gffs):
                gff = gff.strip()
                if not os.path.isfile(gff):
                    print("Error: The --side_gffs do not exist!")
                    sys.exit()
        if self._args.output_folder is None:
            print("Error: Please assign --output_folder!")
            sys.exit()
        # Track display mode must be one of the IGV presentation styles.
        if (self._args.present != "expand") and (
                self._args.present != "collapse") and (
                self._args.present != "squish"):
            print("Error: Please assign \"expand\" or "
                  "\"collapse\" or \"squish\" to --present!")
            sys.exit()
        args_sc = self.args_container.container_screen(
            self._args.main_gff, self._args.side_gffs,
            self._args.fasta_file, self._args.height,
            self._args.tex_notex_libs, self._args.frag_libs,
            self._args.present, self._args.output_folder)
        screen = Screen(args_sc)
        screen.screenshot(args_sc)
class Terminator(object):
    '''detection of terminator'''
    def __init__(self, args_term):
        """Set up helper objects and the output/temporary folder layout."""
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # "tmp" sub-folders are produced by the multiparser for each input.
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        # Top-level output folders: gff files and csv tables.
        self.outfolder = {
            "term": os.path.join(args_term.out_folder, "gffs"),
            "csv": os.path.join(args_term.out_folder, "tables")
        }
        # Per-category gff output folders.
        self.terms = {
            "all": os.path.join(self.outfolder["term"], "all_candidates"),
            "express": os.path.join(self.outfolder["term"],
                                    "expressed_candidates"),
            "best": os.path.join(self.outfolder["term"], "best_candidates"),
            "non": os.path.join(self.outfolder["term"],
                                "non_expressed_candidates")
        }
        # Per-category csv table output folders (mirrors self.terms).
        self.csvs = {
            "all": os.path.join(self.outfolder["csv"], "all_candidates"),
            "express": os.path.join(self.outfolder["csv"],
                                    "expressed_candidates"),
            "best": os.path.join(self.outfolder["csv"], "best_candidates"),
            "non": os.path.join(self.outfolder["csv"],
                                "non_expressed_candidates")
        }
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Temporary working files/folders (created under the CWD).
        self.tmps = {
            "transterm": os.path.join(os.getcwd(), "tmp_transterm"),
            "hp": "transtermhp",
            "hp_gff": "transtermhp.gff",
            "hp_path": "tmp_transterm/tmp",
            "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
            "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
            "gff": "tmp.gff",
            "folder": os.path.join(os.getcwd(), "tmp")
        }
        # Filename suffixes of the generated result files.
        self.suffixs = {
            "gff": "term.gff",
            "csv": "term.csv",
            "allgff": "term_all.gff"
        }
        # sRNA input is optional; srna_path is None when not provided.
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        """Generate rnt/ptt tables from every gff file in ``gff_path``.

        For each genome the matching fasta file (and, when ``sRNAs`` is
        given, the matching sRNA gff) is located and handed to the
        converter; missing fasta files abort the run.

        Returns:
            tuple: ``(file_types, prefixs)`` where ``file_types`` maps each
            genome prefix to "srna" or "normal" (whether sRNA data was
            included) and ``prefixs`` lists all processed genome prefixes.
        """
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                # Strip the ".gff" suffix to obtain the genome prefix.
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(fasta_path, ".fa", prefix,
                                                     None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    # Split the sRNA gff input per genome, then look up the
                    # file that belongs to this prefix.
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        # No sRNA file for this genome: convert without it.
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(gff_file, fasta,
                                                      ptt_file, rnt_file, None,
                                                      None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [
                    os.path.join(gff_path, prefix + ".ptt"),
                    os.path.join(gff_path, prefix + ".rnt")
                ]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [
                    os.path.join(gff_path, prefix + ".ptt"),
                    os.path.join(gff_path, prefix + ".rnt"),
                    os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"]))
                ]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        """Invoke TransTermHP on one genome and store its raw output files.

        The human-readable report goes to ``out`` (stdout), while the t2t
        and bag files are written into ``out_path``.
        """
        t2t_file = os.path.join(
            out_path, "_".join([
                prefix,
                "terminators_within_robust_tail-to-tail_regions.t2t"]))
        bag_file = os.path.join(out_path, "_".join(
            [prefix, "best_terminator_after_gene.bag"]))
        command = [args_term.TransTermHP_path,
                   "-p", args_term.expterm_path,
                   fasta,
                   os.path.join(self.combine_path, file_),
                   "--t2t-perf", t2t_file,
                   "--bag-output", bag_file]
        call(command, stdout=out)

    def _run_TransTermHP(self, args_term):
        """Run TransTermHP on every combined ptt file.

        For each genome a sub-folder of ``args_term.hp_folder`` is created
        and the TransTermHP report is written there; the temporary
        combined-annotation folder is removed afterwards.
        """
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                                     prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                # TransTermHP's stdout is captured into the report file.
                out = open(
                    os.path.join(out_path,
                                 "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        """Convert every TransTermHP ".bag" output into gff format.

        The per-genome gff files are written into the temporary transterm
        folder and finally combined per input genome by the multiparser.
        """
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        if (args_term.tex_wigs is not None) and (args_term.frag_wigs
                                                 is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information'''
        # Without sRNA input the plain gff folder is used unchanged.
        if sRNAs is None:
            return gff_path
        self.multiparser.parser_gff(sRNAs, "sRNA")
        self.helper.check_make_folder(self.tmps["merge"])
        tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
        for prefix in prefixs:
            # Start from a clean scratch file for every genome.
            if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                os.remove(tmp_gff)
            sources = (
                os.path.join(gff_path, prefix + ".gff"),
                os.path.join(self.srna_path,
                             "_".join([prefix, "sRNA.gff"])))
            for source in sources:
                self.helper.merge_file(source, tmp_gff)
            self.helper.sort_gff(
                tmp_gff, os.path.join(self.tmps["merge"], prefix + ".gff"))
            os.remove(tmp_gff)
        return self.tmps["merge"]

    def _move_file(self, term_outfolder, csv_outfolder):
        # Sort every per-genome "*_term.gff" in place, then fold the
        # terminators of all genomes into the "all candidates" gff and csv
        # outputs (one pair of files per input gff prefix).
        # NOTE(review): ``csv_outfolder`` is never used in this body; the csv
        # destination comes from ``self.csvs["all"]`` instead.
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                # Sort via a scratch file and move the result back in place.
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(
                    self.terms["all"],
                    "_".join([prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                    os.path.join(self.csvs["all"],
                                 "_".join([prefix, self.suffixs["csv"]])))
                # Seed the merged gff with a fresh gff-version header.
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(self.terms["all"],
                                 "_".join([prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                # Recreate the csv with only the header row before appending
                # the per-strain raw tables below.
                if ("_".join([prefix, self.suffixs["csv"]])
                        in os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join([
                    "Genome", "Name", "Start", "End", "Strand", "Detect",
                    "Coverage_decrease", "Coverage_detail"
                ]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                # Append the raw coverage table of each strain exactly once,
                # in the order strains first appear in the merged gff
                # (assumes entries of one strain are contiguous — TODO
                # confirm against gff_parser output ordering).
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(
                            os.path.join(
                                self.tmps["term_table"],
                                "_".join([entry.seq_id, "term_raw.csv"])),
                            os.path.join(
                                self.csvs["all"],
                                "_".join([prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        """Run RNAfold on *tmp_seq*, writing structures to *tmp_sec*.

        RNAfold drops its .ps plots into the current directory, so the run
        happens inside a throw-away folder that is removed afterwards; the
        input/output paths are therefore addressed relative to the parent.
        """
        print("Computing secondary structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        try:
            # Explicit file handles with subprocess replace the original
            # os.system shell string ("RNAfold < in > out"): no shell
            # injection via unusual paths, and I/O errors surface properly.
            with open(os.path.join("..", tmp_seq)) as in_fh, open(
                    os.path.join("..", tmp_sec), "w") as out_fh:
                call([RNAfold_path], stdin=in_fh, stdout=out_fh)
        finally:
            # Always restore the working directory and drop the scratch
            # folder, even if RNAfold fails.
            os.chdir(pre_cwd)
            shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(self, prefixs, merge_path,
                                              wig_path, merge_wigs, args_term):
        '''Search gene-converged-region terminators.

        For every genome: extract intergenic sequences, fold them with
        RNAfold, call poly-T candidates, then validate candidates against
        coverage and write the per-genome gff and raw csv table.
        '''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            # Fixed the original's duplicated "tmp_cand = tmp_cand = ..."
            # assignment (harmless but confusing).
            tmp_cand = os.path.join(
                args_term.out_folder, "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path,
                                            prefix + ".fa"), tran_file,
                               gff_file, tmp_seq, tmp_index, args_term)
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix)
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
            # NOTE(review): detect_coverage also runs when tran_file is
            # missing, in which case tmp_cand was never generated — confirm
            # whether this call should live inside the branch above.
            print("Detecting terminators for " + prefix)
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path,
                             "_".join([prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"],
                             "_".join([prefix, self.tmps["hp_gff"]])),
                merge_wigs,
                os.path.join(self.outfolder["term"],
                             "_".join([prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"],
                             "_".join([prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete every intermediate folder and scratch file of the run."""
        for input_folder in (args_term.gffs, args_term.fastas):
            self.helper.remove_tmp_dir(input_folder)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        # The merged wig folder only exists when both library kinds were
        # combined by _combine_wigs.
        if (args_term.tex_wigs is not None) and (args_term.frag_wigs
                                                 is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        for scratch in ("transterm", "term_table"):
            shutil.rmtree(self.tmps[scratch])
        # Per-genome scratch files created during candidate detection.
        self.helper.remove_all_content(args_term.out_folder, "inter_seq_",
                                       "file")
        self.helper.remove_all_content(self.outfolder["term"], "_term.gff",
                                       "file")
        self.helper.remove_all_content(args_term.out_folder, "inter_sec_",
                                       "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        # Renumber every terminator in the "all candidates" gff files with
        # sequential ID/Name attributes, then run the statistics helper and
        # distribute the best/expressed/non-expressed outputs into their
        # final folders.
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # Zero-padded five-digit counter used for the Name
                    # attribute; the ID uses the unpadded counter.
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (entry.seq_id + "_terminator" +
                                              str(num))
                    entry.attributes["Name"] = "_".join(["terminator_" + name])
                    # Rebuild the raw attribute string from the (updated)
                    # attribute dict.
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in entry.attributes.items()
                    ])
                    out_tmp.write("\t".join([
                        entry.info_without_attributes, entry.attribute_string
                    ]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                # Renumbered file replaces the "_term_all" gff under the
                # final gff suffix.
                shutil.move(
                    self.tmps["gff"],
                    os.path.join(self.terms["all"],
                                 "_".join([new_prefix, self.suffixs["gff"]])))
        stat_path = os.path.join(args_term.out_folder, "statistics")
        for prefix in new_prefixs:
            # stat_term writes the statistics file and splits the candidates
            # into the best/express/non folders (project helper).
            stat_term(
                os.path.join(self.terms["all"],
                             "_".join([prefix, self.suffixs["gff"]])),
                os.path.join(self.csvs["all"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])),
                os.path.join(self.terms["best"], "_".join([prefix, "term"])),
                os.path.join(self.terms["express"], "_".join([prefix,
                                                              "term"])),
                os.path.join(self.terms["non"], "_".join([prefix, "term"])))
            # Move the csv outputs from the gff folders into the csv folders.
            shutil.move(
                os.path.join(self.terms["best"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["best"],
                             "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(
                os.path.join(self.terms["express"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["express"],
                             "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(
                os.path.join(self.terms["non"],
                             "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["non"],
                             "_".join([prefix, self.suffixs["csv"]])))
            # The raw "_term_all" gff is no longer needed.
            os.remove(
                os.path.join(self.terms["all"],
                             "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs):
        '''searching the associated terminator to transcript'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path, None,
                                     "transcript")
        # NOTE: the incoming ``prefixs`` is intentionally discarded and
        # rebuilt from the transcript files actually present, so only
        # genomes with transcript data are compared (the parameter is kept
        # for caller compatibility).
        prefixs = [file_.replace("_transcript.gff", "")
                   for file_ in os.listdir(self.tran_path)
                   if file_.endswith("_transcript.gff")]
        print("Comparing terminators with transcripts now")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            # Rename the generic statistics file so each candidate type
            # keeps its own copy.
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix +
                        ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics", "_".join([
                            "stat_compare_terminator_transcript", prefix,
                            type_ + ".csv"
                        ])))

    def run_terminator(self, args_term):
        """Execute the full terminator-detection pipeline.

        Validates inputs, runs TransTermHP, detects coverage-based
        candidates, computes statistics, compares the results with
        transcripts and finally removes all temporary files.
        """
        # Validate the mandatory inputs up front; the original checked only
        # after parsing the fasta files, so a missing input could crash
        # before the error message was printed.
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files " "and fasta files!")
            sys.exit()
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        file_types, prefixs = self._convert_gff2rntptt(self.gff_path,
                                                       self.fasta_path,
                                                       args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(prefixs, merge_path,
                                                   args_term.wig_path,
                                                   args_term.merge_wigs,
                                                   args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term, prefixs)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
# Beispiel #57
# 0
class GoTermFinding(object):
    '''Retrieving the GO term'''

    def __init__(self, args_go):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Two output trees: one for every CDS, one restricted to CDSs that
        # overlap a detected transcript ("expressed").
        self.out_all = os.path.join(args_go.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_go.out_folder, "expressed_CDSs")
        self.result_all_path = os.path.join(self.out_all, "GO_term_results")
        self.result_express_path = os.path.join(self.out_express,
                                                "GO_term_results")
        self.gff_path = os.path.join(args_go.gffs, "tmp")
        if args_go.trans is not None:
            self.tran_path = os.path.join(args_go.trans, "tmp")
        else:
            self.tran_path = None
        self.stat_all_path = os.path.join(self.out_all, "statistics")
        self.stat_express_path = os.path.join(self.out_express,
                                              "statistics")
        # Filename of the merged per-genome uniprot table.
        self.all_strain = "all_genomes_uniprot.csv"

    def _retrieve_go(self, uniprot, out_path, type_, log):
        """Extract GO terms from the UniProt table for every genome gff."""
        prefixs = []
        log.write("Running gene_ontology.py to retrieve GO terms.\n")
        for gff in os.listdir(self.gff_path):
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(out_path, prefix))
            out_file = os.path.join(out_path, prefix,
                                    "_".join([prefix, "uniprot.csv"]))
            print("Extracting GO terms of {0} from UniProt".format(prefix))
            # Transcript information is optional; without it every CDS is
            # treated the same by retrieve_uniprot.
            if self.tran_path is not None:
                tran_file = os.path.join(self.tran_path,
                                         "_".join([prefix, "transcript.gff"]))
            else:
                tran_file = None
            retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff),
                             out_file, tran_file, type_)
            log.write("\t" + out_file + " is generated.\n")

    def _remove_header(self, out_all):
        """Rewrite *out_all* so it contains exactly one header row."""
        # "with" guarantees both handles are closed even on error (the
        # original used manual open/close).
        with open(out_all + "_tmp", "w") as out, open(out_all, "r") as fh:
            out.write("\t".join(["Genome", "Strand", "Start", "End",
                                 "Protein_id", "Go_term"]) + "\n")
            for row in csv.reader(fh, delimiter='\t'):
                # Skip blank lines (guards the row[0] access) and repeated
                # header rows left over from merging.
                if row and row[0] != "Genome":
                    out.write("\t".join(row) + "\n")
        shutil.move(out_all + "_tmp", out_all)

    def _merge_files(self, gffs, out_path, out_folder, log):
        '''merge the files according to the input genome folder'''
        folders = []
        log.write("Merging the output files based on the input genome "
                  "information.\n")
        for folder in os.listdir(gffs):
            if folder.endswith("gff_folder"):
                folder_prefix = folder.replace(".gff_folder", "")
                folder_path = os.path.join(out_folder, folder_prefix)
                self.helper.check_make_folder(folder_path)
                folders.append(folder_path)
                filenames = []
                for gff in os.listdir(os.path.join(gffs, folder)):
                    if gff.endswith(".gff"):
                        filenames.append(gff.replace(".gff", ""))
                out_all = os.path.join(folder_path, self.all_strain)
                if len(filenames) > 1:
                    # Several genomes in one folder: concatenate their
                    # per-genome tables, deduplicating the header each time.
                    if self.all_strain in os.listdir(folder_path):
                        os.remove(out_all)
                    for filename in filenames:
                        csv_file = "_".join([filename, "uniprot.csv"])
                        self.helper.merge_file(os.path.join(out_path,
                                               filename, csv_file), out_all)
                        self._remove_header(out_all)
                        shutil.copy(os.path.join(out_path, filename, csv_file),
                                    folder_path)
                else:
                    # NOTE(review): assumes at least one .gff per gff_folder;
                    # an empty folder would raise IndexError here.
                    shutil.copyfile(os.path.join(out_path, filenames[0],
                                    "_".join([filenames[0], "uniprot.csv"])),
                                    out_all)
        # Replace the per-genome layout with the merged per-folder layout.
        self.helper.remove_all_content(out_path, None, "dir")
        self.helper.remove_all_content(out_path, None, "file")
        for folder in folders:
            folder_prefix = folder.split("/")[-1]
            shutil.move(folder, os.path.join(out_path, folder_prefix))
            for file_ in os.listdir(os.path.join(out_path, folder_prefix)):
                log.write("\t" + os.path.join(out_path, folder_prefix, file_) +
                          " is generated.\n")

    def _stat(self, out_path, stat_path, go, goslim, out_folder, log):
        """Map GO terms to GOslim, compute statistics and collect figures."""
        log.write("Running gene_ontology.py to Retrieve GOslim terms and "
                  "do statistics.\n")
        log.write("The following files are generated:\n")
        for folder in os.listdir(out_path):
            strain_stat_path = os.path.join(stat_path, folder)
            self.helper.check_make_folder(strain_stat_path)
            fig_path = os.path.join(strain_stat_path, "figs")
            # Bug fix: the original tested for "fig", which never matches
            # the "figs" folder created here, so os.mkdir could fail with
            # FileExistsError on re-runs.
            if "figs" not in os.listdir(strain_stat_path):
                os.mkdir(fig_path)
            stat_file = os.path.join(strain_stat_path,
                                     "_".join(["stat", folder + ".csv"]))
            map2goslim(goslim, go,
                       os.path.join(out_path, folder, self.all_strain),
                       stat_file, out_folder)
            log.write("\t" + stat_file + "\n")
            # map2goslim drops its plots into out_folder; sort them into the
            # per-genome figure folder.
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_three_roots.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_molecular_function.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_cellular_component.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_biological_process.png"])
            for file_ in os.listdir(fig_path):
                log.write("\t" + os.path.join(fig_path, file_) + "\n")

    def run_go_term(self, args_go, log):
        """Full GO-term pipeline: retrieve, merge, map to GOslim, clean up."""
        for gff in os.listdir(args_go.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_go.gffs, gff))
        self.multiparser.parser_gff(args_go.gffs, None)
        if args_go.trans is not None:
            self.multiparser.parser_gff(args_go.trans, "transcript")
        print("Computing all CDSs")
        log.write("Retrieving GO terms for all CDSs.\n")
        self._retrieve_go(args_go.uniprot, self.result_all_path, "all", log)
        self._merge_files(args_go.gffs, self.result_all_path, self.out_all, log)
        self._stat(self.result_all_path, self.stat_all_path, args_go.go,
                   args_go.goslim, self.out_all, log)
        if args_go.trans is not None:
            # Repeat the analysis restricted to expressed CDSs.
            log.write("Retrieving GO terms only for expressed CDSs.\n")
            print("Computing express CDSs")
            self._retrieve_go(args_go.uniprot, self.result_express_path,
                              "express", log)
            self._merge_files(args_go.gffs, self.result_express_path,
                              self.out_express, log)
            self._stat(self.result_express_path, self.stat_express_path,
                       args_go.go, args_go.goslim, self.out_express, log)
        self.helper.remove_tmp_dir(args_go.gffs)
        if args_go.trans is not None:
            self.helper.remove_tmp_dir(args_go.trans)
# Beispiel #58
# 0
class Ribos(object):
    """Riboswitch detection: scan genome sequences against the riboswitch
    covariance models extracted from Rfam using infernal's cmscan."""

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" sub-folders are produced by Multiparser for each input set.
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        self.stat_folder = os.path.join(args_ribo.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs")
        self.table_folder = os.path.join(args_ribo.out_folder, "tables")
        self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam")
        # Covariance-model file holding only the riboswitch families.
        self.ribos_rfam = os.path.join(args_ribo.database,
                                       "Rfam_riboswitch.cm")
        # Scratch folders; removed again by _remove_tmp at the end of a run.
        self.tmp_files = {
            "fasta": os.path.join(args_ribo.out_folder, "tmp_fasta"),
            "scan": os.path.join(args_ribo.out_folder, "tmp_scan"),
            "table": os.path.join(args_ribo.out_folder, "tmp_table")
        }
        # Filename suffixes of the pre-scan and re-scan outputs.
        self.suffixs = {
            "csv": "riboswitch.csv",
            "txt": "riboswitch_prescan.txt",
            "re_txt": "riboswitch_scan.txt",
            "re_csv": "riboswitch_scan.csv"
        }

    def _run_infernal(self, args_ribo, seq, type_, prefix):
        # Run infernal's cmscan on *seq* and return the path of the report
        # written for this prefix/scan type.
        scan_file = os.path.join(self.tmp_files["scan"],
                                 "_".join([prefix, self.suffixs[type_]]))
        scan = open(scan_file, "w")
        call([
            os.path.join(args_ribo.infernal_path, "cmscan"), "--incE",
            str(args_ribo.e_value), "--acc", self.ribos_rfam, seq
        ],
             stdout=scan)
        scan.close()
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo):
        # Two-pass scan per genome: extract candidate sequences near TSS/
        # transcripts, pre-scan them, regenerate a refined sequence set from
        # the hits, scan again and fold the results into one table.
        # Returns the list of processed genome prefixes.
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(self.tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("extracting seq of riboswitch candidates of {0}".format(
                    prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.tss_path, prefix + "_TSS.gff"),
                    os.path.join(self.tran_path, prefix + "_transcript.gff"),
                    first_seq, args_ribo)
                print("pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_infernal(args_ribo, first_seq,
                                                     "txt", prefix)
                sec_seq = os.path.join(self.tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["csv"]]))
                regenerate_seq(first_scan_file, first_seq, first_table,
                               sec_seq)
                print("scanning of {0}".format(prefix))
                sec_scan_file = self._run_infernal(args_ribo, sec_seq,
                                                   "re_txt", prefix)
                sec_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                # The refined table replaces the pre-scan table.
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo):
        # Merge the per-strain tables and scan reports into per-genome
        # outputs, then convert the final table to gff with statistics.
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merge results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(
                    os.path.join(self.scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                # Each newly seen seq_id contributes its table/scan files
                # exactly once (assumes the entries of one strain are
                # contiguous in the gff — TODO confirm).
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            # First strain: start the merged table as a copy.
                            shutil.copyfile(
                                os.path.join(
                                    self.tmp_files["table"], "_".join(
                                        [entry.seq_id, self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix, self.suffixs["csv"]])))
                        else:
                            # Later strains: append to the merged table.
                            self.helper.merge_file(
                                os.path.join(
                                    self.tmp_files["table"], "_".join(
                                        [entry.seq_id, self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix, self.suffixs["csv"]])))
                        shutil.copy(
                            os.path.join(
                                self.tmp_files["scan"],
                                "_".join([entry.seq_id, self.suffixs["txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        shutil.copy(
                            os.path.join(
                                self.tmp_files["scan"], "_".join(
                                    [entry.seq_id, self.suffixs["re_txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                    self.stat_folder,
                    "_".join(["stat", prefix, "riboswitch.txt"]))
                print("compute statistics of {0}".format(prefix))
                stat_and_covert2gff(
                    os.path.join(self.table_folder,
                                 "_".join([prefix, self.suffixs["csv"]])),
                    args_ribo.ribos_id,
                    os.path.join(self.gff_outfolder,
                                 "_".join([prefix, "riboswitch.gff"])),
                    args_ribo.fuzzy, out_stat)
                fh.close()

    def _remove_tmp(self, args_ribo):
        # Drop the Multiparser scratch folders and all tmp_* work folders.
        self.helper.remove_tmp(args_ribo.gffs)
        self.helper.remove_tmp(args_ribo.fastas)
        self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir")

    def _remove_overlap(self, gff_path):
        # Filter table entries that overlap annotated features.
        # NOTE(review): the nested os.path.join call is redundant (single
        # argument) — kept as-is.
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(
                    os.path.join(
                        os.path.join(
                            self.tmp_files["table"], "_".join(
                                [gff.replace(".gff", ""),
                                 self.suffixs["csv"]]))),
                    os.path.join(gff_path, gff))

    def run_ribos(self, args_ribo):
        """Full riboswitch pipeline: validate inputs, build and compress the
        Rfam riboswitch models, scan every genome and merge the results."""
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_ribo.gffs, gff))
        # Extract the riboswitch covariance models from the full Rfam file.
        rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam, self.ribos_rfam)
        print("compressing Rfam...")
        call([
            os.path.join(args_ribo.infernal_path, "cmpress"), "-F",
            self.ribos_rfam
        ])
        prefixs = []
        self.helper.check_make_folder(self.tmp_files["fasta"])
        self.helper.check_make_folder(self.tmp_files["scan"])
        self.helper.check_make_folder(self.tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo)
        self._remove_overlap(self.gff_path)
        self._merge_results(args_ribo)
        mapping_ribos(self.table_folder, args_ribo.ribos_id)
        self._remove_tmp(args_ribo)
# Beispiel #59
# 0
class CircRNADetection(object):
    '''Detection of circRNA'''

    def __init__(self, args_circ):
        '''Prepare helper objects and the output/temporary folder layout.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        out_folder = args_circ.output_folder
        # result folders under the main output directory
        self.alignment_path = os.path.join(out_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(out_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(out_folder, "circRNA_tables")
        self.gff_folder = os.path.join(out_folder, "gffs")
        # "tmp" subfolders created by the multiparser
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        # fixed segemehl output names and the fragments used for matching
        self.splices = {"file": "splicesites.bed",
                        "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})   
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        log.write(" ".join([segemehl_path,
                  "-x", os.path.join(fasta_path, index),
                  "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write("Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                                self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join([read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                                args_circ, index, fasta, read,
                                sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
                self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                  os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(os.path.join(
                   self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log):
        log.write(" ".join([samtools_path, "view",
                            "-bS", pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files, log):
        '''Convert every SAM file in *sub_alignment_path* to BAM.

        Returns (bam_files, convert_ones, remove_ones):
          bam_files    - all BAM paths available after this pass,
          convert_ones - BAMs created here that are NOT part of this
                         run's alignments (deleted later by the caller),
          remove_ones  - SAM files that ARE part of this run's
                         alignments (also deleted later).
        Stray ".log" files in the folder are removed on the spot.
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                # classify by whether the SAM stem belongs to this run
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                # NOTE(review): os.listdir order is arbitrary, so whether a
                # pre-existing .bam is seen before or after its sibling .sam
                # changes how these membership tests resolve — confirm this
                # ordering dependence is acceptable
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) + "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]) + "\n")
                os.system(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(out_folder,
                  "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join([samtools_path, "sort",
                      "-o", sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                      sort_sample.replace(".bam", ".sam"), sort_sample]) + "\n")
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
        log.write("Done!\n")
        log.write("\t" + sort_sample.replace(".bam", ".sam") + " is generated.\n")

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path,
            out_folder, convert_ones, tmp_reads, remove_ones, prefix, log):
        '''Assemble per-sample BAM lists and run merge/sort/convert.

        Three cases: only reads given (derive BAM paths from the read
        names), both BAMs and reads given (append read-derived BAMs to
        the matching sample), or only BAMs given (use them as-is).
        Afterwards the leftover conversion artifacts in convert_ones /
        remove_ones are deleted.
        NOTE(review): tmp_reads is accepted but never used here — confirm
        whether it can be dropped from the signature.
        '''
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    # strip a compression suffix first so the second split
                    # below removes the actual read-file extension
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            # only BAM files were provided (read_datas is None)
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. If it does not "
                  "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix + "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(
                    output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(
                                splice_path, header, splice),
                                os.path.join(
                                bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                    bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                    bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''Compute statistics and emit GFF/CSV results per genome and sample.

        For each genome prefix the merged BED files are copied into the
        splice result folder, then detect_circrna produces the statistics
        and candidate table, and the converter derives "all" and "best"
        GFF files from the candidate CSV.
        '''
        log.write("Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.splice_path,
                                                       prefix))
            # copy the merged (non-"tmp_") BED files for this genome
            for bed in os.listdir(os.path.join(
                args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                                          self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                # all paths follow the <prefix><sample><fixed name> scheme
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(args_circ.stat_folder,
                               "".join(["stat_", prefix, sample,
                                        "circRNA.csv"]))
                csv_all = os.path.join(self.candidate_path, prefix,
                               "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(self.candidate_path, prefix,
                               "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(self.gff_folder, prefix,
                                "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(self.gff_folder, prefix,
                                  "".join([prefix, sample, "circRNA_best.gff"]))
                # statistics + full candidate table, then GFF conversion
                detect_circrna(splice_file, os.path.join(
                               self.gff_path, prefix + ".gff"), csv_all,
                               args_circ, stat_file)
                self.converter.convert_circ2gff(
                     os.path.join(self.candidate_path, prefix,
                                  "".join([prefix, sample, "circRNA_all.csv"])),
                     args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''Entry point of the circRNA detection pipeline.

        Validates the inputs, optionally aligns raw reads with segemehl,
        converts/merges/sorts the alignments per genome, runs
        testrealign.x, merges the resulting BED files and finally emits
        statistics and GFF output. Requires --bam_files and/or
        --read_files plus a segemehl path.
        '''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        # split multi-record inputs into per-genome files under tmp folders
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            # uncompress archives, then align everything with segemehl
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
        # NOTE(review): the prefixs returned by _align (if any) are
        # discarded here and rebuilt from the fasta folder — presumably
        # equivalent; confirm
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            if fasta.endswith(".fa"):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                sub_alignment_path, args_circ.samtools_path, align_files, log)
            else:
                # nothing was aligned, so there is nothing to convert
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder,
                convert_ones, tmp_reads, remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # drop the uncompressed copies created from .gz/.bz2 archives
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)