def extract_inter_seq(inter, cds, seq, fuzzy, inters):
    """Extract the sequence of a region extended by ``fuzzy`` and store it.

    Args:
        inter: Dict with the integer keys ``"start"`` and ``"end"``.
        cds: Object providing ``seq_id`` and ``strand``.
        seq: Mapping from sequence id to the sequence itself.
        fuzzy: Number of positions added on both sides of the region.
        inters: List which receives the result of ``import_data``.
    """
    helper = Helper()
    genome = seq[cds.seq_id]
    # Clamp the extended region to the boundaries of the sequence that is
    # actually used. The upper bound has to be the length of this sequence,
    # not the number of entries stored in ``seq``.
    start = max(1, inter["start"] - fuzzy)
    end = min(len(genome), inter["end"] + fuzzy)
    # Every strand value other than "+" is handled as the reverse strand.
    strand = "+" if cds.strand == "+" else "-"
    inter_seq = helper.extract_gene(genome, start, end, strand)
    inters.append(import_data(inter_seq, cds, inter["start"], inter["end"]))
class TargetFasta(object):
    """Create target fasta files from reference fasta files.

    The sequence editing itself is delegated to ``SeqEditer.modify_seq``
    (presumably it applies the mutation table -- confirm in SeqEditer).
    """

    def __init__(self, tar_folder, ref_folder):
        """Create the helper objects and store the temporary folder paths.

        Args:
            tar_folder: Folder which receives the target fasta files.
            ref_folder: Folder of the reference fasta files.
        """
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp"),
                        "tmp_ref": os.path.join(ref_folder, "tmp")}

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        """Generate the target fasta files and remove the temporary folders.

        Args:
            mut_table: Mutation table handed over to ``modify_seq``.
            tar_folder: Folder which receives the final fasta files.
            ref_folder: Folder of the reference fasta files.
            output: ``None`` or an iterable of strings of the form
                ``"<filename>:<strain>_and_<strain>..."``. Every entry
                produces ``<filename>.fa`` which contains the listed strains.
                With ``None`` all generated ``.fa`` files are moved to
                ``tar_folder``.
        """
        tmp_tar = self.folders["tmp_tar"]
        self.multiparser.parser_fasta(ref_folder)
        # Always start with an empty temporary target folder.
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(tmp_tar)
        os.mkdir(tmp_tar)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   tmp_tar)
        print("transfer to target fasta...")
        if output is not None:
            # The temporary folder is not modified while merging, so one
            # directory listing is enough.
            available = os.listdir(tmp_tar)
            for file_ in output:
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                first = True
                # The context manager closes the file even if copying fails.
                with open(os.path.join(tar_folder, filename + ".fa"),
                          "w") as out:
                    for strain in strains:
                        if strain + ".fa" not in available:
                            print("Error:no fasta information of "
                                  "{0}.fa".format(strain))
                            continue
                        # Separate the merged strains by a newline.
                        if first:
                            first = False
                        else:
                            out.write("\n")
                        with open(os.path.join(
                                tmp_tar, strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
        else:
            self.helper.move_all_content(tmp_tar, tar_folder, [".fa"])
        shutil.rmtree(tmp_tar)
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
 def __init__(self):
     """Create the helper objects and set the temporary file names."""
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     # Names of the temporary fasta and gff files.
     self.tmp_fa, self.tmp_gff = "tmp.fa", "tmp.gff"
     # Names of the temporary wiggle files, one per strand.
     self.tmp_wig_forward, self.tmp_wig_reverse = (
         "tmp_forward.wig", "tmp_reverse.wig")
Exemple #4
0
 def _print_tssfile(self, nums, tss_features, tss, tss_pro, strain, method,
                    out, tss_libs):
     '''Write one TSS entry as a tab-separated gff line to ``out``.

     The attribute column is assembled from the comma-joined lists in
     ``tss_features`` and ``tss_libs``; start and end are both
     ``tss.super_pos``.
     '''
     joined_types = ",".join(tss_features["tss_types"])
     joined_utrs = ",".join(tss_features["utr_lengths"])
     joined_tags = ",".join(tss_features["locus_tags"])
     joined_libs = ",".join(tss_libs)
     strand_name = Helper().get_strand_name(tss.super_strand)
     position = str(tss.super_pos)
     # Key/value pairs of the attribute column in output order.
     pairs = (
         ("Name", "".join([tss_pro, ":", position, "_", strand_name])),
         ("ID", tss_pro.lower() + str(nums["tss_uni"])),
         ("type", joined_types),
         ("utr_length", str(joined_utrs)),
         ("associated_gene", joined_tags),
         ("libs", joined_libs),
         ("method", "TSSpredator"),
     )
     attribute_string = ";".join("=".join(pair) for pair in pairs)
     columns = [strain, method, tss_pro, position, position, ".",
                tss.super_strand, ".", attribute_string]
     out.write("\t".join(columns) + "\n")
Exemple #5
0
def get_upstream(seq, tss, out, name, nt_before):
    """Write the sequence of ``nt_before`` positions ending at a TSS.

    For the "+" strand the region ends at ``tss.start``; for every other
    strand it begins at ``tss.start``. The region is clamped to the range
    1..len(seq). ``name`` and the extracted sequence are written to ``out``
    as two lines.
    """
    if tss.strand == "+":
        first = max(1, tss.start - nt_before + 1)
        last = tss.start
    else:
        first = tss.start
        last = min(len(seq), tss.start + nt_before - 1)
    fasta = Helper().extract_gene(seq, first, last, tss.strand)
    out.write("{0}\n{1}\n".format(name, fasta))
Exemple #6
0
 def __init__(self, args_go):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_go: Argument container; ``out_folder``, ``gffs`` and
             ``trans`` are read.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     out_folder = args_go.out_folder
     self.out_all = os.path.join(out_folder, "all_CDSs")
     self.out_express = os.path.join(out_folder, "expressed_CDSs")
     self.result_all_path = os.path.join(self.out_all, "GO_term_results")
     self.result_express_path = os.path.join(self.out_express,
                                             "GO_term_results")
     self.gff_path = os.path.join(args_go.gffs, "tmp")
     # The transcript folder is optional.
     self.tran_path = (None if args_go.trans is None
                       else os.path.join(args_go.trans, "tmp"))
     self.stat_all_path = os.path.join(self.out_all, "statistics")
     self.stat_express_path = os.path.join(self.out_express, "statistics")
     self.all_strain = "all_genomes_uniprot.csv"
Exemple #7
0
 def __init__(self, args_circ):
     """Create the helper objects and derive the paths and file names.

     Args:
         args_circ: Argument container; ``output_folder``, ``gffs``,
             ``align`` and ``fastas`` are read.

     The program is terminated via ``sys.exit()`` when alignment is
     requested but no fasta folder is given.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     out_folder = args_circ.output_folder
     self.alignment_path = os.path.join(out_folder, "segemehl_align")
     self.splice_path = os.path.join(out_folder, "segemehl_splice")
     self.candidate_path = os.path.join(out_folder, "circRNA_tables")
     self.gff_folder = os.path.join(out_folder, "gffs")
     self.gff_path = os.path.join(args_circ.gffs, "tmp")
     self.splices = {
         "all_file": "splicesites_all.bed",
         "file": "splicesites.bed",
         "all": "splicesites_all",
         "splice": "splicesites",
     }
     self.trans = {
         "all_file": "transrealigned_all.bed",
         "file": "transrealigned.bed",
         "all": "transrealigned_all",
         "trans": "transrealigned",
     }
     self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
     # The fasta folder is only checked when an alignment is requested.
     if args_circ.align and args_circ.fastas is None:
         print("Error: There is no genome fasta file!!!")
         sys.exit()
     self.fasta_path = os.path.join(args_circ.fastas, "tmp")
Exemple #8
0
 def __init__(self, args_snp):
     """Create the helper objects and derive the output paths.

     Args:
         args_snp: Argument container; ``types`` and ``out_folder`` are
             read. ``types`` selects the name of the result sub-folder.

     The folder of the figures is created via ``check_make_folder``.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     file_type = ("compare_related_and_reference_genomes"
                  if args_snp.types == "related_genome"
                  else "mutations_of_reference_genomes")
     out_folder = args_snp.out_folder
     result_folder = os.path.join(out_folder, file_type)
     self.seq_path = os.path.join(result_folder, "seqs")
     self.stat_path = os.path.join(result_folder, "statistics")
     self.fig_path = os.path.join(self.stat_path, "figs")
     self.helper.check_make_folder(self.fig_path)
     self.outputs = {
         "table": os.path.join(result_folder, "SNP_tables"),
         "raw": os.path.join(result_folder, "SNP_raw_outputs"),
         "tmp": os.path.join(out_folder, "tmp_bcf"),
         "depth": os.path.join(out_folder, "tmp_depth"),
     }
     self.bams = {
         "whole": os.path.join(out_folder, "whole_reads.bam"),
         "sort": os.path.join(out_folder, "whole_reads_sorted.bam"),
         "bams": [],
     }
     self.header = os.path.join(out_folder, "header")
     self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                  "extend": "extend_BAQ"}
def deal_cds_reverse(cdss_r, target_folder, fasta, genes, tar_start, tar_end):
    '''Extract the target regions of the CDSs of the reverse strand.

    For every CDS the region around ``cds.end`` is extracted and appended
    to ``<seq_id>_target.fa`` in ``target_folder``.

    Args:
        cdss_r: Iterable of CDS entries (``seq_id``, ``start``, ``end`` and
            ``strand`` are read).
        target_folder: Folder of the output fasta files.
        fasta: Sequence from which the regions are extracted.
        genes: Gene entries handed over to ``check_parent_gene``.
        tar_start: Number of positions added beyond ``cds.end``.
        tar_end: Number of positions taken before ``cds.end``.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_r:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching.
                if out is not None:
                    out.close()
                out = open(os.path.join(target_folder,
                           "_".join([cds.seq_id, "target.fa"])), "a")
                pre_id = cds.seq_id
            if (len(fasta) - cds.end) > tar_start:
                end = cds.end + tar_start
            else:
                end = len(fasta)
            if ((cds.end - tar_end) > 1) and (
                    (cds.end - cds.start) >= tar_end):
                start = cds.end - tar_end - 1
            elif cds.end - tar_end < 1:
                start = 1
            elif (cds.end - cds.start) < tar_end:
                start = cds.start
            else:
                # cds.end - tar_end == 1 and the CDS is long enough: the
                # computed start would be 0, so use the first position.
                start = 1
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target_gene = check_parent_gene(cds, genes)
            print_fasta(cds, seq, out, target_gene)
    finally:
        # Release the handle even if the extraction fails.
        if out is not None:
            out.close()
def deal_cds_forward(cdss_f, target_folder, fasta, genes, tar_start, tar_end):
    '''Extract the target regions of the CDSs of the forward strand.

    For every CDS the region around ``cds.start`` is extracted and written
    to ``<seq_id>_target.fa`` in ``target_folder``.

    Args:
        cdss_f: Iterable of CDS entries (``seq_id``, ``start``, ``end`` and
            ``strand`` are read).
        target_folder: Folder of the output fasta files.
        fasta: Sequence from which the regions are extracted.
        genes: Gene entries handed over to ``check_parent_gene``.
        tar_start: Number of positions taken before ``cds.start``.
        tar_end: Number of positions taken from ``cds.start`` onwards.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_f:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching.
                if out is not None:
                    out.close()
                out = open(os.path.join(target_folder,
                           "_".join([cds.seq_id, "target.fa"])), "w")
                pre_id = cds.seq_id
            if cds.start > tar_start:
                start = cds.start - tar_start
            else:
                start = 1
            if ((cds.start + tar_end) < len(fasta)) and (
                    (cds.end - cds.start) >= tar_end):
                end = cds.start + tar_end - 1
            elif cds.start + tar_end >= len(fasta):
                end = len(fasta)
            else:
                # The CDS is shorter than tar_end: stop at its end.
                end = cds.end
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target_gene = check_parent_gene(cds, genes)
            print_fasta(cds, seq, out, target_gene)
    finally:
        # Release the handle even if the extraction fails.
        if out is not None:
            out.close()
Exemple #11
0
 def __init__(self, args_snp):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_snp: Argument container; ``types``, ``out_folder`` and
             ``fastas`` are read. ``types`` selects the name of the result
             sub-folder.

     If ``whole_reads.bam`` exists in the output folder,
     ``remove_all_content`` is called for the "whole_read" files.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     file_type = ("compare_reference" if args_snp.types == "reference"
                  else "validate_target")
     out_folder = args_snp.out_folder
     result_folder = os.path.join(out_folder, file_type)
     self.seq_path = os.path.join(result_folder, "seqs")
     self.stat_path = os.path.join(result_folder, "statistics")
     self.fasta_path = os.path.join(args_snp.fastas, "tmp")
     self.outputs = {
         "table": os.path.join(result_folder, "SNP_table"),
         "raw": os.path.join(result_folder, "SNP_raw_outputs"),
         "tmp": os.path.join(out_folder, "tmp_bcf"),
     }
     # Remove the merged BAM files of a previous run.
     if "whole_reads.bam" in os.listdir(out_folder):
         self.helper.remove_all_content(out_folder, "whole_read", "file")
     self.bams = {
         "whole": os.path.join(out_folder, "whole_reads.bam"),
         "sort": os.path.join(out_folder, "whole_reads_sorted.bam"),
     }
     self.header = os.path.join(out_folder, "header")
     self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                  "extend": "extend_BAQ"}
Exemple #12
0
 def __init__(self, args_ribo):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_ribo: Argument container; ``gffs``, ``tsss``, ``trans``,
             ``fastas``, ``program``, ``database`` and the output folders
             are read.

     The ``ribos_*`` attributes are only set for the programs "both" and
     "riboswitch", the ``thermo_*`` attributes only for "both" and
     "thermometer".
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.gff_parser = Gff3Parser()
     self.gff_path = os.path.join(args_ribo.gffs, "tmp")
     # The TSS folder is optional.
     self.tss_path = (None if args_ribo.tsss is None
                      else os.path.join(args_ribo.tsss, "tmp"))
     self.tran_path = os.path.join(args_ribo.trans, "tmp")
     self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
     program = args_ribo.program
     if program in ("both", "riboswitch"):
         ribos = self._create_out_folders(
             args_ribo.ribos_out_folder, "riboswitch", args_ribo.database)
         (self.ribos_stat_folder, self.ribos_gff_outfolder,
          self.ribos_table_folder, self.ribos_scan_folder,
          self.ribos_tmp_files, self.ribos_rfam,
          self.ribos_suffixs) = ribos
     if program in ("both", "thermometer"):
         thermo = self._create_out_folders(
             args_ribo.thermo_out_folder, "RNA_thermometer",
             args_ribo.database)
         (self.thermo_stat_folder, self.thermo_gff_outfolder,
          self.thermo_table_folder, self.thermo_scan_folder,
          self.thermo_tmp_files, self.thermo_rfam,
          self.thermo_suffixs) = thermo
Exemple #13
0
def compare_sorf_srna(sorfs, srnas, srna_gff):
    """Store the overlapping sRNAs of every sORF in ``sorf["srna"]``.

    Args:
        sorfs: List of dicts with the keys "strain", "strand", "start" and
            "end"; the key "srna" is set by this function.
        srnas: sRNA entries (``seq_id``, ``strand``, ``start``, ``end`` and
            ``attributes["ID"]`` are read).
        srna_gff: If ``None`` no comparison is done.

    sORFs without an overlapping sRNA on the same sequence and strand (or
    all sORFs if ``srna_gff`` is ``None``) get the value ``["NA"]``.
    """
    if srna_gff is None:
        for sorf in sorfs:
            sorf["srna"] = ["NA"]
        return
    for sorf in sorfs:
        hits = sorf["srna"] = []
        for srna in srnas:
            if sorf["strain"] != srna.seq_id:
                continue
            if sorf["strand"] != srna.strand:
                continue
            begin, finish = sorf["start"], sorf["end"]
            # The four overlap situations of the two regions.
            srna_covers_sorf = (srna.start <= begin) and (srna.end >= finish)
            sorf_covers_srna = (srna.start >= begin) and (srna.end <= finish)
            srna_ends_inside = ((srna.start <= begin) and
                                (srna.end >= begin) and
                                (srna.end <= finish))
            srna_starts_inside = ((srna.start >= begin) and
                                  (srna.start <= finish) and
                                  (srna.end >= finish))
            if not (srna_covers_sorf or sorf_covers_srna or
                    srna_ends_inside or srna_starts_inside):
                continue
            strand = Helper().get_strand_name(srna.strand)
            hits.append(srna.attributes["ID"] + ":" + str(srna.start) +
                        "-" + str(srna.end) + "_" + strand)
        if len(hits) == 0:
            sorf["srna"] = ["NA"]
Exemple #14
0
def assign_sorf(sorf, starts, ends, fasta):
    """Store the positions and the sequence of a sORF in ``sorf``.

    ``sorf["start"]`` is the smallest value of ``starts`` and
    ``sorf["end"]`` the largest value of ``ends`` (both converted to int);
    the sequence of this region is extracted from
    ``fasta[sorf["strain"]]``.
    """
    sorf["starts"] = starts
    sorf["ends"] = ends
    sorf["start"] = min(int(position) for position in starts)
    sorf["end"] = max(int(position) for position in ends)
    genome = fasta[sorf["strain"]]
    sorf["seq"] = Helper().extract_gene(genome, sorf["start"], sorf["end"],
                                        sorf["strand"])
Exemple #15
0
 def __init__(self, args_tar):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_tar: Argument container; ``out_folder``, ``srnas``,
             ``fastas`` and ``gffs`` are read.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.fixer = FormatFixer()
     self.gff_parser = Gff3Parser()
     out_folder = args_tar.out_folder
     self.target_seq_path = os.path.join(out_folder, "target_seqs")
     self.srna_seq_path = os.path.join(out_folder, "sRNA_seqs")
     self.rnaplex_path = os.path.join(out_folder, "RNAplex_results")
     self.rnaup_path = os.path.join(out_folder, "RNAup_results")
     self.intarna_path = os.path.join(out_folder, "IntaRNA_results")
     self.merge_path = os.path.join(out_folder, "merged_results")
     self.srna_path = os.path.join(args_tar.srnas, "tmp")
     self.fasta_path = os.path.join(args_tar.fastas, "tmp")
     self.gff_path = os.path.join(args_tar.gffs, "tmp")
     # Names and name patterns of the temporary files.
     self.tmps = {
         "tmp": "tmp_srna_target",
         "rnaup": "tmp_rnaup",
         "log": "tmp_log",
         "all_fa": "tmp*.fa",
         "all_txt": "tmp*.txt",
     }
Exemple #16
0
def get_feature(cds):
    '''Return a name for the entry.

    The first available attribute of "locus_tag" and "protein_id" is
    returned. Otherwise the name is built from the "ID" attribute (or
    ``cds.feature`` if there is no "ID"), the positions and the strand name.
    '''
    attributes = cds.attributes
    for key in ("locus_tag", "protein_id"):
        if key in attributes.keys():
            return attributes[key]
    if "ID" in attributes.keys():
        prefix = attributes["ID"]
    else:
        prefix = cds.feature
    strand = Helper().get_strand_name(cds.strand)
    return "".join([prefix, ":", str(cds.start), "-", str(cds.end),
                    "_", strand])
Exemple #17
0
def merge_libs(input_libs, wig_folder, program):
    """Merge the wiggle files of the libraries per strand.

    Old ``merge_forward.wig`` / ``merge_reverse.wig`` files of the current
    working directory are removed first. Every entry of ``input_libs`` is a
    ":"-separated string; field 0 is the file name, field 1 the library type
    and field 4 the strand. Only the libraries of type "tex" (program "TSS")
    or "notex" (program "processing") are merged.
    """
    for merged_name in ("merge_forward.wig", "merge_reverse.wig"):
        if merged_name in os.listdir(os.getcwd()):
            os.remove(merged_name)
    if program == "TSS":
        type_ = "tex"
    elif program == "processing":
        type_ = "notex"
    for lib in input_libs:
        datas = lib.split(":")
        if datas[1] != type_:
            continue
        strand = datas[4]
        if strand == "+":
            merged_name = "merge_forward.wig"
        elif strand == "-":
            merged_name = "merge_reverse.wig"
        else:
            continue
        Helper().merge_file(os.path.join(wig_folder, datas[0]),
                            os.path.join(os.getcwd(), merged_name))
Exemple #18
0
def srna_sorf_comparison(sRNA_file, sORF_file, sRNA_out, sORF_out):
    """Cross-reference overlapping sRNAs and sORFs and write both gff files.

    Entries of the same sequence and strand which overlap are linked: the
    sRNA gets a "sORF" attribute and the sORF gets a "sRNA" attribute, each
    a list of strings of the form ``<ID>:<start>-<end>_<strand name>``.

    Args:
        sRNA_file: Path of the input gff file of the sRNAs.
        sORF_file: Path of the input gff file of the sORFs.
        sRNA_out: Path of the output gff file of the sRNAs.
        sORF_out: Path of the output gff file of the sORFs.
    """
    def by_position(entry):
        return (entry.seq_id, entry.start, entry.end, entry.strand)

    # All handles are closed by the context managers, also in case of errors.
    with open(sRNA_out, "w") as out_r, open(sORF_out, "w") as out_o:
        out_r.write("##gff-version 3\n")
        out_o.write("##gff-version 3\n")
        srnas = []
        with open(sRNA_file) as srna_fh:
            for entry in Gff3Parser().entries(srna_fh):
                entry.attributes = del_attributes("sORF", entry)
                srnas.append(entry)
        srnas = sorted(srnas, key=by_position)
        sorfs = []
        with open(sORF_file) as sorf_fh:
            for entry in Gff3Parser().entries(sorf_fh):
                entry.attributes = del_attributes("sRNA", entry)
                sorfs.append(entry)
        sorfs = sorted(sorfs, key=by_position)
        for srna in srnas:
            for sorf in sorfs:
                if (srna.seq_id != sorf.seq_id) or (
                        srna.strand != sorf.strand):
                    continue
                if not (((srna.start <= sorf.start) and
                         (srna.end >= sorf.end)) or
                        ((srna.start >= sorf.start) and
                         (srna.end <= sorf.end)) or
                        ((srna.start <= sorf.start) and
                         (srna.end >= sorf.start) and
                         (srna.end <= sorf.end)) or
                        ((srna.start >= sorf.start) and
                         (srna.start <= sorf.end) and
                         (srna.end >= sorf.end))):
                    continue
                # The strand name is computed for every link. It must not
                # depend on whether the attribute list already exists.
                strand = Helper().get_strand_name(sorf.strand)
                srna.attributes.setdefault("sORF", []).append("".join([
                    sorf.attributes["ID"], ":",
                    str(sorf.start), "-",
                    str(sorf.end), "_", strand
                ]))
                strand = Helper().get_strand_name(srna.strand)
                sorf.attributes.setdefault("sRNA", []).append("".join([
                    srna.attributes["ID"], ":",
                    str(srna.start), "-",
                    str(srna.end), "_", strand
                ]))
        print_file(sorfs, out_o, "sRNA")
        print_file(srnas, out_r, "sORF")
Exemple #19
0
 def __init__(self, args_sorf):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_sorf: Argument container; ``tsss``, ``srnas``,
             ``out_folder``, ``trans`` and ``fastas`` are read.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     # The TSS and sRNA folders are optional.
     self.tss_path = (None if args_sorf.tsss is None
                      else os.path.join(args_sorf.tsss, "tmp"))
     self.srna_path = (None if args_sorf.srnas is None
                       else os.path.join(args_sorf.srnas, "tmp"))
     out_folder = args_sorf.out_folder
     self.gff_output = os.path.join(out_folder, "gffs")
     self.table_output = os.path.join(out_folder, "tables")
     self.tran_path = os.path.join(args_sorf.trans, "tmp")
     self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
     self.all_cand, self.best = "all_candidates", "best"
Exemple #20
0
 def __init__(self, args_ratt):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_ratt: Argument container; ``ref_embls``, ``output_path``,
             ``tar_fastas``, ``ref_fastas`` and ``gff_outfolder`` are read.
     """
     self.multiparser = Multiparser()
     self.converter = Converter()
     self.format_fixer = FormatFixer()
     self.helper = Helper()
     embl_folder = args_ratt.ref_embls
     self.gbk = os.path.join(embl_folder, "gbk_tmp")
     self.gbk_tmp = os.path.join(self.gbk, "tmp")
     self.embl = os.path.join(embl_folder, "embls")
     self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
     gff_out = args_ratt.gff_outfolder
     # Temporary folders and files.
     self.tmp_files = {
         "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
         "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
         "out_gff": os.path.join(gff_out, "tmp"),
         "gff": os.path.join(gff_out, "tmp.gff"),
         "ptt": os.path.join(gff_out, "tmp.ptt"),
         "rnt": os.path.join(gff_out, "tmp.rnt"),
     }
Exemple #21
0
def get_gene_info(cds):
    """Return the locus tag of the entry or a position-based name.

    Without a "locus_tag" attribute the name has the form
    ``<feature>:<start>-<end>_<strand name>``.
    """
    if "locus_tag" in cds.attributes.keys():
        return cds.attributes["locus_tag"]
    strand = Helper().get_strand_name(cds.strand)
    parts = [cds.feature, ":", str(cds.start), "-", str(cds.end), "_",
             strand]
    return "".join(parts)
Exemple #22
0
 def __init__(self, args_cris):
     """Create the helper objects and the output folders.

     Args:
         args_cris: Argument container; ``gffs``, ``fastas`` and
             ``out_folder`` are read.

     ``check_make_folder`` is called for the folders of all candidates,
     best candidates, the CRT results and the statistics.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.gff_parser = Gff3Parser()
     self.gff_path = os.path.join(args_cris.gffs, "tmp")
     self.fasta_path = os.path.join(args_cris.fastas, "tmp")
     out_folder = args_cris.out_folder
     self.stat_folder = os.path.join(out_folder, "statistics")
     self.gff_out = os.path.join(out_folder, "gffs")
     self.all_out = os.path.join(out_folder, "gffs", "all_candidates")
     self.best_out = os.path.join(out_folder, "gffs", "best_candidates")
     self.data_folder = os.path.join(out_folder, "CRT_results")
     for folder in (self.all_out, self.best_out, self.data_folder,
                    self.stat_folder):
         self.helper.check_make_folder(folder)
Exemple #23
0
 def __init__(self, args_term):
     """Create the helper objects and derive the paths and file names.

     Args:
         args_term: Argument container; ``gffs``, ``fastas``, ``trans``,
             ``out_folder`` and ``srnas`` are read.

     ``self._make_gff_folder()`` is called after all attributes are set.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gff_parser = Gff3Parser()
     self.gff_path = os.path.join(args_term.gffs, "tmp")
     self.fasta_path = os.path.join(args_term.fastas, "tmp")
     self.tran_path = os.path.join(args_term.trans, "tmp")
     gff_out = os.path.join(args_term.out_folder, "gffs")
     table_out = os.path.join(args_term.out_folder, "tables")
     self.outfolder = {"term": gff_out, "csv": table_out}
     # The gff and table folders share the same sub-folder layout.
     subfolders = (("all", "all_candidates"), ("express", "express"),
                   ("best", "best"), ("non", "non_express"))
     self.terms = {key: os.path.join(gff_out, name)
                   for key, name in subfolders}
     self.csvs = {key: os.path.join(table_out, name)
                  for key, name in subfolders}
     self.combine_path = os.path.join(self.gff_path, "combine")
     cwd = os.getcwd()
     # Temporary folders and files.
     self.tmps = {
         "transterm": os.path.join(cwd, "tmp_transterm"),
         "hp": "transtermhp",
         "hp_gff": "transtermhp.gff",
         "hp_path": "tmp_transterm/tmp",
         "term_table": os.path.join(cwd, "tmp_term_table"),
         "merge": os.path.join(cwd, "tmp_merge_gff"),
         "gff": "tmp.gff",
         "folder": os.path.join(cwd, "tmp"),
     }
     self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                     "allgff": "term_all.gff"}
     # The sRNA folder is optional.
     self.srna_path = (os.path.join(args_term.srnas, "tmp")
                       if args_term.srnas else None)
     self._make_gff_folder()
Exemple #24
0
def import_to_tss(tss_type, cds_pos, tss, locus_tag, tss_entry):
    """Build the attribute string and dict of a TSS.

    Args:
        tss_type: Type of the TSS, e.g. "Primary".
        cds_pos: Position of the associated CDS or "NA".
        tss: TSS entry (``start`` and ``strand`` are read).
        locus_tag: Name of the associated gene.
        tss_entry: Empty, or a sequence whose second item is the attribute
            dict of a previous call; its "&"-separated values are extended.

    Returns:
        Tuple of the ";"-joined attribute string and the attribute dict.
    """
    if cds_pos == "NA":
        distance = "NA"
    else:
        distance = str(int(math.fabs(cds_pos - tss.start)))
    utr = "_".join([tss_type, distance])
    if len(tss_entry) == 0:
        merged_type, merged_utr, merged_tag = tss_type, utr, locus_tag
    else:
        previous = tss_entry[1]
        types = previous["type"].split("&")
        utrs = previous["UTR_length"].split("&")
        tags = previous["associated_gene"].split("&")
        if tss_type == "Primary" and ("Primary" in previous["type"]):
            # Keep one primary entry only: replace the stored one if the
            # new CDS is closer to the TSS.
            for index, known_type in enumerate(types):
                if "Primary" not in known_type:
                    continue
                known_length = utrs[index].split("_")
                if math.fabs(cds_pos - tss.start) < int(known_length[1]):
                    utrs[index] = utr
                    tags[index] = locus_tag
        else:
            types.append(tss_type)
            utrs.append(utr)
            tags.append(locus_tag)
        merged_type = "&".join(types)
        merged_utr = "&".join(utrs)
        merged_tag = "&".join(tags)
    strand = Helper().get_strand_name(tss.strand)
    tss_dict = {
        "Name": "_".join(["TSS:" + str(tss.start), strand]),
        "type": merged_type,
        "UTR_length": merged_utr,
        "associated_gene": merged_tag
    }
    tss_string = ";".join(
        "=".join([key, tss_dict[key]])
        for key in ("UTR_length", "associated_gene", "type", "Name"))
    return (tss_string, tss_dict)
Exemple #25
0
def detect_start_stop(inters, seq, args_sorf):
    '''Search the regions for pairs of start and stop codons.

    The positions of every region are clamped to the sequence first. A pair
    is used if the stop codon lies behind the start codon, their distance
    is a multiple of three and lies within ``args_sorf.min_len`` and
    ``args_sorf.max_len``, and ``detect_rbs_site`` does not return
    ``["NA"]``. Such pairs are handed over to ``check_terminal_seq`` which
    receives the list of the sORFs.

    Returns:
        The list ``sorfs`` passed to ``check_terminal_seq``.
    '''
    sorfs = []
    for inter in inters:
        if inter.start <= 0:
            inter.start = 1
        genome = seq[inter.seq_id]
        if inter.end >= len(genome):
            inter.end = len(genome)
        fasta = Helper().extract_gene(genome, inter.start,
                                      inter.end, inter.strand)
        starts = []
        stops = []
        for frame in range(0, 3):
            for index in range(frame, len(fasta), 3):
                codon = fasta[index:index + 3]
                if codon in args_sorf.start_codon:
                    starts.append(index)
                elif codon in args_sorf.stop_codon:
                    stops.append(index)
        for start in starts:
            for stop in stops:
                length = stop - start
                if (length <= 0) or ((length % 3) != 0):
                    continue
                if (length > args_sorf.max_len) or (
                        length < args_sorf.min_len):
                    continue
                rbs = detect_rbs_site(fasta, start, inter, args_sorf)
                if (len(rbs) == 1) and (rbs[0] == "NA"):
                    continue
                if (inter.source == "intergenic") or (
                        inter.source == "antisense"):
                    sorf_type = inter.source
                elif inter.source == "UTR_derived":
                    sorf_type = inter.attributes["UTR_type"]
                else:
                    # Other sources are ignored.
                    continue
                if inter.strand == "+":
                    begin = inter.start + start
                    finish = inter.start + stop + 2
                else:
                    # Positions within ``fasta`` are counted from the other
                    # end of the region for the reverse strand.
                    begin = inter.start + (len(fasta) - stop - 3)
                    finish = inter.start + (len(fasta) - start - 1)
                check_terminal_seq(genome, begin, finish, args_sorf,
                                   sorf_type, inter, sorfs, rbs)
    return sorfs
Exemple #26
0
 def __init__(self, args_utr):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_utr: Argument container; ``tsss``, ``trans`` and
             ``out_folder`` are read.
     """
     self.helper = Helper()
     self.multiparser = Multiparser()
     self.tss_path = os.path.join(args_utr.tsss, "tmp")
     self.tran_path = os.path.join(args_utr.trans, "tmp")
     out_folder = args_utr.out_folder
     self.utr5_path = os.path.join(out_folder, "5UTR")
     self.utr3_path = os.path.join(out_folder, "3UTR")
     # Folders of the statistics, one for each UTR type.
     self.utr5_stat_path = os.path.join(self.utr5_path, "statistics")
     self.utr3_stat_path = os.path.join(self.utr3_path, "statistics")
Exemple #27
0
 def _merge_wigs(self, wig_folder, prefix, libs):
     """Merge the wiggle files of the libraries per strand.

     A file of ``wig_folder`` is merged if its name contains ``prefix`` and
     the name of a library (first ":"-separated field without its last four
     characters). The last field of the library is the strand and selects
     ``tmp/merge_forward.wig`` or ``tmp/merge_reverse.wig``.
     """
     self.helper.check_make_folder(
         os.path.join(os.getcwd(), self.tmps["tmp"]))
     merged_names = {"+": "merge_forward.wig", "-": "merge_reverse.wig"}
     for wig_file in os.listdir(wig_folder):
         wig_path = os.path.join(wig_folder, wig_file)
         for lib in libs:
             info = lib.split(":")
             if info[0][:-4] not in wig_file:
                 continue
             merged_name = merged_names.get(info[-1])
             if merged_name is None:
                 continue
             if (prefix in wig_file) and os.path.isfile(wig_path):
                 Helper().merge_file(wig_path,
                                     os.path.join("tmp", merged_name))
Exemple #28
0
 def __init__(self, args_tss):
     """Create the helper objects and derive the input/output paths.

     Args:
         args_tss: Argument container; ``out_folder``, ``ta_files``,
             ``gffs``, ``manual``, ``wig_folder`` and ``fastas`` are read.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.master = os.path.join(args_tss.out_folder, "MasterTables")
     self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                  "tmp_tss", "tmp": "tmp"}
     # The transcript folder is optional.
     if args_tss.ta_files is not None:
         self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
     else:
         self.tmps["ta"] = None
     self.gff_path = os.path.join(args_tss.gffs, "tmp")
     # The folder of the manually detected TSSs is optional as well. The
     # attribute is always defined, as it is done for self.tmps["ta"], so
     # reading it can not raise an AttributeError.
     if args_tss.manual is not None:
         self.manual_path = os.path.join(args_tss.manual, "tmp")
     else:
         self.manual_path = None
     self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
     self.fasta_path = os.path.join(args_tss.fastas, "tmp")
     self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
     self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
Exemple #29
0
 def __init__(self, args_sc):
     """Create the output folders of the screenshots.

     Args:
         args_sc: Argument container; ``output_folder`` and ``fasta`` are
             read and ``output_folder`` is replaced by its "screenshots"
             sub-folder.

     The program is terminated via ``sys.exit()`` if the "screenshots"
     folder already exists.
     """
     self.helper = Helper()
     out_folder = os.path.join(args_sc.output_folder, "screenshots")
     if os.path.exists(out_folder):
         print("Error: The {0} already exist!".format(out_folder))
         sys.exit()
     os.mkdir(out_folder)
     args_sc.output_folder = out_folder
     # The name of the fasta file without its last extension.
     filename = args_sc.fasta.rpartition("/")[2]
     self.strain = ".".join(filename.split(".")[0:-1])
     strain_folder = os.path.join(args_sc.output_folder, self.strain)
     self.helper.check_make_folder(strain_folder)
     self.forward_file = os.path.join(strain_folder, "forward")
     self.reverse_file = os.path.join(strain_folder, "reverse")
     for strand_folder in (self.forward_file, self.reverse_file):
         os.mkdir(strand_folder)
Exemple #30
0
 def __init__(self, out_folder):
     """Create the helper objects and derive the output paths.

     Args:
         out_folder: Folder below which all result paths are located.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gffparser = Gff3Parser()

     def in_out_folder(name):
         return os.path.join(out_folder, name)

     self.tmp_id = in_out_folder("tmp_id_list")
     self.all_result = in_out_folder("all_results")
     self.best_result = in_out_folder("best_results")
     self.fig = in_out_folder("figures")
     self.with_strain, self.without_strain = (
         "with_strain", "without_strain")
     # Temporary files; the first three are plain file names.
     self.tmp_files = {
         "log": "tmp_log",
         "action": "tmp_action.log",
         "pubmed": "tmp_pubmed.log",
         "specific": in_out_folder("tmp_specific"),
         "nospecific": in_out_folder("tmp_nospecific"),
         "wget_action": in_out_folder("tmp_action"),
     }
Exemple #31
0
def get_feature(cds):
    """Return a display name for ``cds``.

    ``protein_id`` is preferred, then ``locus_tag``; when neither attribute
    is present the name is built as ``<feature>:<start>-<end>_<strand name>``.
    """
    for key in ("protein_id", "locus_tag"):
        if key in cds.attributes.keys():
            return cds.attributes[key]
    # No identifier attribute: fall back to a coordinate-based name.
    strand = Helper().get_strand_name(cds.strand)
    return (cds.feature + ":" + str(cds.start) +
            "-" + str(cds.end) + "_" + strand)
def get_attributes(tss, cds):
    """Record ``cds`` as a gene associated with ``tss`` (modifies ``tss``).

    The gene label is the ``locus_tag`` of ``cds`` when present, otherwise
    ``<feature>:<start>-<end>_<strand name>``.  An ``associated_gene`` value
    of ``"orphan"`` is replaced by the label; any other value gets the label
    appended with ``&``.
    """
    current = tss.attributes["associated_gene"]
    if "locus_tag" in cds.attributes.keys():
        label = cds.attributes["locus_tag"]
    else:
        strand = Helper().get_strand_name(cds.strand)
        label = (cds.feature + ":" + str(cds.start) + "-" +
                 str(cds.end) + "_" + strand)
    if current == "orphan":
        tss.attributes["associated_gene"] = label
    else:
        tss.attributes["associated_gene"] = "&".join([current, label])
Exemple #33
0
 def __init__(self, args_tran):
     """Prepare helper objects and the path/file names used for transcripts.

     All locations are derived from ``args_tran.out_folder``; nothing is
     created on disk here.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
     self.tran_path = os.path.join(self.gff_outfolder, "tmp")
     self.stat_path = os.path.join(args_tran.out_folder, "statistics")
     # "gff" and "merge" are bare names; "tran" lives in the output folder.
     tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
             "tran": os.path.join(args_tran.out_folder, "tmp_tran")}
     # The remaining temporary names live inside the gff output folder.
     for key, name in (("tss_ta", "tmp_tss_ta"),
                       ("ta_tss", "tmp_ta_tss"),
                       ("ta_gff", "tmp_ta_gff"),
                       ("gff_ta", "tmp_gff_ta"),
                       ("uni", "tmp_uni"),
                       ("overlap", "tmp_overlap")):
         tmps[key] = os.path.join(self.gff_outfolder, name)
     self.tmps = tmps
     self.frag = "transcript_fragment.gff"
     self.tex = "transcript_tex_notex.gff"
     self.endfix_tran = "transcript.gff"
Exemple #34
0
 def __init__(self, args_srna):
     """Prepare helper objects and the paths used for sRNA detection.

     Output locations are derived from ``args_srna.out_folder``.  The TSS,
     processing-site, sORF and terminator inputs are passed through
     ``self._check_folder_exist``, whose result is stored as the path.
     """
     self.args_container = ArgsContainer()
     self.helper = Helper()
     self.multiparser = Multiparser()
     out_folder = args_srna.out_folder
     self.gff_output = os.path.join(out_folder, "gffs")
     self.table_output = os.path.join(out_folder, "tables")
     self.stat_path = os.path.join(out_folder, "statistics")
     self.tss_path = self._check_folder_exist(args_srna.tss_folder)
     self.pro_path = self._check_folder_exist(args_srna.pro_folder)
     self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
     self.fasta_path = os.path.join(args_srna.fastas, "tmp")
     self.tran_path = os.path.join(args_srna.trans, "tmp")
     self.term_path = self._check_folder_exist(args_srna.terms)
     self.merge_wigs = os.path.join(out_folder, "merge_wigs")
     # Prefixes of intermediate files, all located in the output folder.
     self.prefixs = {
         "merge": os.path.join(out_folder, "tmp_merge"),
         "utr": os.path.join(out_folder, "tmp_utrsrna"),
         "normal": os.path.join(out_folder, "tmp_normal"),
         "in_cds": os.path.join(out_folder, "tmp_incds"),
         "merge_table": os.path.join(out_folder, "tmp_merge_table"),
         "utr_table": os.path.join(out_folder, "tmp_utrsrna_table"),
         "normal_table": os.path.join(out_folder, "tmp_normal_table"),
         "in_cds_table": os.path.join(out_folder, "tmp_incds_table"),
         "basic": os.path.join(out_folder, "tmp_basic"),
         "energy": os.path.join(out_folder, "tmp_energy"),
     }
     self.tmps = {
         "nr": os.path.join(out_folder, "tmp_nr"),
         "srna": os.path.join(out_folder, "tmp_sRNA"),
     }
     # ``table_output`` and ``stat_path`` were previously assigned a second
     # time here with identical values; the duplicates have been dropped.
     self.best_table = os.path.join(self.table_output, "best")
     self.all_best = {
         "all_gff": os.path.join(self.gff_output, "all_candidates"),
         "best_gff": os.path.join(self.gff_output, "best"),
         "all_table": os.path.join(self.table_output, "all_candidates"),
         "best_table": os.path.join(self.table_output, "best"),
     }
def check_overlap(table_file, gff_file):
    """Rewrite ``table_file`` with strand-specific overlap columns.

    Every data row of the tab-separated table (rows whose fourth column is
    ``"Start"`` are treated as headers and skipped) is compared with the
    features of ``gff_file`` accepted by ``Helper.feature_without_notgene``.
    For each overlapping feature the overlap length and its share of the row's
    span are recorded, separately for ``+`` and non-``+`` strand features.
    The result is written to ``table_file + "tmp"`` with a fresh header line
    and then moved over ``table_file``.

    All files are handled with ``with`` blocks, so they are closed (and the
    temporary file fully flushed) before the move takes place.

    NOTE(review): features are matched by coordinates only; the genome column
    of the row is not compared with the feature's sequence id here.
    """
    helper = Helper()
    with open(gff_file, "r") as gff_f:
        gffs = [entry for entry in Gff3Parser().entries(gff_f)
                if helper.feature_without_notgene(entry)]
    header = [
        "Rank", "Genome", "Name", "Start", "End", "Strand",
        "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates",
        "Lib_type", "Best_avg_coverage", "Track/Coverage",
        "Normalized_secondary_energy_change(by_length)", "sRNA_types",
        "Conflict_sORF", "nr_hit_number", "sRNA_hit_number",
        "nr_hit_top3|ID|e-value|score", "sRNA_hit|e-value|score",
        "Overlap_CDS_forward", "Overlap_nts_forward", "Overlap_CDS_reverse",
        "Overlap_nts_reverse", "End_with_terminator", "Associated_promoter",
        "sRNA_length"
    ]
    tmp_file = table_file + "tmp"
    with open(table_file, "r") as fh, open(tmp_file, "w") as out:
        out.write("\t".join(header) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[3] == "Start":
                # Header row of the input table; replaced by ``header``.
                continue
            overlaps = {"forward": [], "reverse": [], "CDS_f": [], "CDS_r": []}
            start = int(row[3])
            end = int(row[4])
            for gff in gffs:
                # Overlap test kept exactly as before: partial overlap on
                # either side, feature covering the row, or row covering
                # the feature.
                if ((gff.end < end) and (gff.end > start) and
                    (gff.start <= start)) or (
                        (gff.start > start) and (gff.start < end) and
                        (gff.end >= end)) or ((gff.end >= end) and
                                              (gff.start <= start)) or (
                                                  (gff.end <= end) and
                                                  (gff.start >= start)):
                    overlap = min(gff.end, end) - max(gff.start, start) + 1
                    percent = "{0:.0f}%".format(
                        (float(overlap) / float(end - start + 1)) * 100)
                    detail = str(overlap) + "(" + str(percent) + ")"
                    if gff.strand == "+":
                        overlaps["forward"].append(detail)
                        overlaps["CDS_f"].append(import_cds(gff))
                    else:
                        overlaps["reverse"].append(detail)
                        overlaps["CDS_r"].append(import_cds(gff))
            if len(overlaps["forward"]) == 0:
                overlaps["forward"] = ["NA"]
                overlaps["CDS_f"] = ["NA"]
            if len(overlaps["reverse"]) == 0:
                overlaps["reverse"] = ["NA"]
                overlaps["CDS_r"] = ["NA"]
            # Columns 19-20 of the input row are replaced by the four
            # strand-specific overlap columns.
            out.write("\t".join(row[0:19] + [
                ";".join(overlaps["CDS_f"]), ";".join(overlaps["forward"]),
                ";".join(overlaps["CDS_r"]), ";".join(overlaps["reverse"])
            ] + row[21:]) + "\n")
    shutil.move(tmp_file, table_file)
Exemple #36
0
def check_terminal_seq(seq, start, end, args_sorf, source, inter, sorfs, rbs):
    """Look for a start/stop codon pair at or near ``start``/``end``.

    The window is tried at offsets 0, +1, -1, +2 and -2.  An offset matches
    when the extracted sequence begins with one of ``args_sorf.start_codon``
    and ends with one of ``args_sorf.stop_codon``.  If any offset matches,
    ``import_sorf`` is called with the shifted coordinates.

    NOTE(review): all five offsets are always tried and the LAST matching
    one is used, not the first - confirm this is intended.
    """
    matching_shifts = []
    for shift in (0, 1, -1, 2, -2):
        fasta = Helper().extract_gene(seq, start + shift, end + shift,
                                      inter.strand)
        has_start = fasta[:3] in args_sorf.start_codon
        if has_start and (fasta[-3:] in args_sorf.stop_codon):
            matching_shifts.append(shift)
    if matching_shifts:
        detect = matching_shifts[-1]
        import_sorf(inter, sorfs, start + detect, end + detect,
                    source, seq, rbs)
Exemple #37
0
def assign_parent(gff, tran, feature):
    """Link ``gff`` and ``tran`` to each other (both are modified in place).

    * The ``ID`` of ``tran`` is stored in, or appended with ``,`` to, the
      ``Parent`` attribute of ``gff``.
    * A label for ``gff`` is stored in, or appended with ``,`` to, the
      ``associated_<feature>`` attribute of ``tran``.  The label is the first
      available of ``locus_tag``, ``protein_id`` and ``Name``; otherwise
      ``<feature>:<start>-<end>_<strand name>`` of ``gff``.
    """
    tran_id = tran.attributes["ID"]
    if "Parent" in gff.attributes.keys():
        gff.attributes["Parent"] = ",".join(
            [gff.attributes["Parent"], tran_id])
    else:
        gff.attributes["Parent"] = tran_id

    assoc_key = "_".join(["associated", feature])
    for tag in ("locus_tag", "protein_id", "Name"):
        if tag in gff.attributes.keys():
            label = gff.attributes[tag]
            break
    else:
        # None of the identifier attributes exist: use the coordinates.
        strand = Helper().get_strand_name(gff.strand)
        label = "".join([gff.feature, ":", str(gff.start),
                         "-", str(gff.end), "_", strand])

    if assoc_key in tran.attributes.keys():
        tran.attributes[assoc_key] = ",".join(
            [tran.attributes[assoc_key], label])
    else:
        tran.attributes[assoc_key] = label
Exemple #38
0
def read_gff(gff_file):
    """Read ``gff_file`` and return ``(cdss, genes)``.

    ``cdss`` holds the entries accepted by ``Helper.feature_without_notgene``
    and ``genes`` the entries whose feature is ``"gene"``.  Both lists are
    sorted by sequence id, start, end and strand.  The file is opened in a
    ``with`` block so the handle is closed once parsing is done.
    """
    cdss = []
    genes = []
    helper = Helper()
    with open(gff_file, "r") as g_f:
        for entry in Gff3Parser().entries(g_f):
            if helper.feature_without_notgene(entry):
                cdss.append(entry)
            if entry.feature == "gene":
                genes.append(entry)
    cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return cdss, genes
Exemple #39
0
 def __init__(self, args_op):
     """Prepare helper objects and the input/output paths for ``args_op``.

     The terminator input is optional: when ``args_op.terms`` is given it is
     first passed to ``self._check_gff`` and then its ``tmp`` sub-folder is
     stored; otherwise ``term_path`` is ``None``.
     """
     def tmp_of(folder):
         # Inputs are read from the ``tmp`` sub-folder of each input folder.
         return os.path.join(folder, "tmp")

     self.multiparser = Multiparser()
     self.helper = Helper()
     self.tss_path = tmp_of(args_op.tsss)
     self.tran_path = tmp_of(args_op.trans)
     self.utr5_path = tmp_of(args_op.utr5s)
     self.utr3_path = tmp_of(args_op.utr3s)
     self.table_path = os.path.join(args_op.output_folder, "tables")
     if args_op.terms is None:
         self.term_path = None
     else:
         self._check_gff(args_op.terms, "term")
         self.term_path = tmp_of(args_op.terms)
Exemple #40
0
 def __init__(self, args_sc, out_folder):
     """Create the per-genome ``forward``/``reverse`` folders in ``out_folder``.

     ``args_sc.output_folder`` is overwritten with ``out_folder``.  The genome
     name is the file name of ``args_sc.fasta`` without its last extension.
     """
     self.helper = Helper()
     # The caller's argument object is updated in place.
     args_sc.output_folder = out_folder
     fasta_name = args_sc.fasta.rpartition("/")[2]
     # Everything before the last "." (empty when there is no ".").
     self.strain = fasta_name.rpartition(".")[0]
     strain_dir = os.path.join(args_sc.output_folder, self.strain)
     self.helper.check_make_folder(strain_dir)
     self.forward_file = os.path.join(strain_dir, "forward")
     self.reverse_file = os.path.join(strain_dir, "reverse")
     for strand_dir in (self.forward_file, self.reverse_file):
         os.mkdir(strand_dir)
Exemple #41
0
 def setUp(self):
     """Write the example GFF and sequence into ``test_folder`` for a test."""
     def dump(path, text):
         # Write one fixture file, replacing any previous content.
         with open(path, "w") as handle:
             handle.write(text)

     self.example = ExampleData()
     self.helper = Helper()
     self.gff_out = self.example.gff_out
     # The reverse sequence is kept as one line without newlines.
     self.rev_seq = "".join(self.example.rev_seq.split("\n"))
     self.test_folder = "test_folder"
     if not os.path.exists(self.test_folder):
         os.mkdir(self.test_folder)
     self.gff_file = os.path.join(self.test_folder, "test.gff")
     dump(self.gff_file, self.example.gff_file)
     self.seq_file = os.path.join(self.test_folder, "test.fa")
     dump(self.seq_file, self.example.seq)
Exemple #42
0
 def __init__(self, gffs):
     """Recreate an empty ``for_libs`` work area inside ``gffs``.

     An existing ``for_libs`` folder is deleted and rebuilt with
     ``statistics`` and ``gffs`` sub-folders.  An existing ``merge_wigs``
     folder inside ``gffs`` is deleted and not recreated here.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.out_folder = os.path.join(gffs, "for_libs")
     # Start from a clean output folder.
     if os.path.exists(self.out_folder):
         shutil.rmtree(self.out_folder)
     os.mkdir(self.out_folder)
     self.stat = os.path.join(self.out_folder, "statistics")
     os.mkdir(self.stat)
     self.gff_folder = os.path.join(self.out_folder, "gffs")
     os.mkdir(self.gff_folder)
     self.merge_wigs = os.path.join(gffs, "merge_wigs")
     stale_merge = os.path.exists(self.merge_wigs)
     if stale_merge:
         shutil.rmtree(self.merge_wigs)
 def __init__(self, args_tar):
     """Prepare helper objects and the paths used for sRNA target prediction.

     Output locations are sub-folders of ``args_tar.out_folder``; inputs are
     read from the ``tmp`` sub-folder of each input folder.
     """
     def in_out(name):
         return os.path.join(args_tar.out_folder, name)

     def tmp_of(folder):
         return os.path.join(folder, "tmp")

     self.multiparser = Multiparser()
     self.helper = Helper()
     self.fixer = FormatFixer()
     self.gff_parser = Gff3Parser()
     self.target_seq_path = in_out("target_seqs")
     self.srna_seq_path = in_out("sRNA_seqs")
     self.rnaplex_path = in_out("RNAplex")
     self.rnaup_path = in_out("RNAup")
     self.merge_path = in_out("merge")
     self.srna_path = tmp_of(args_tar.srnas)
     self.fasta_path = tmp_of(args_tar.fastas)
     self.gff_path = tmp_of(args_tar.gffs)
     # Bare names / glob patterns of temporary files.
     self.tmps = {
         "tmp": "tmp",
         "rnaup": "tmp_rnaup",
         "log": "tmp_log",
         "all_fa": "tmp*.fa",
         "all_txt": "tmp*.txt",
     }
 def __init__(self, args_tss):
     """Prepare helper objects and the paths used for TSS prediction.

     ``self.tmps["ta"]`` is the ``tmp`` sub-folder of ``args_tss.ta_files``,
     or ``None`` when no such folder was given.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     out_folder = args_tss.out_folder
     self.master = os.path.join(out_folder, "MasterTables")
     ta_files = args_tss.ta_files
     self.tmps = {
         "tss": "tmp_TSS",
         "ta_tss": "tmp_ta_tss",
         "tss_ta": "tmp_tss",
         "tmp": "tmp",
         "ta": (None if ta_files is None
                else os.path.join(ta_files, "tmp")),
     }
     self.gff_path = os.path.join(args_tss.gffs, "tmp")
     self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
     self.fasta_path = os.path.join(args_tss.fastas, "tmp")
     self.stat_outfolder = os.path.join(out_folder, "statistics")
     self.gff_outfolder = os.path.join(out_folder, "gffs")
Exemple #45
0
 def __init__(self, args_go):
     """Prepare helper objects and the paths used for GO term retrieval.

     Results are split into an ``all_CDSs`` and an ``expressed_CDSs`` branch
     below ``args_go.out_folder``.  ``tran_path`` is ``None`` when
     ``args_go.trans`` is ``None``.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     out_folder = args_go.out_folder
     self.out_all = os.path.join(out_folder, "all_CDSs")
     self.out_express = os.path.join(out_folder, "expressed_CDSs")
     self.result_all_path = os.path.join(self.out_all, "GO_term_results")
     self.result_express_path = os.path.join(
         self.out_express, "GO_term_results")
     self.gff_path = os.path.join(args_go.gffs, "tmp")
     trans = args_go.trans
     self.tran_path = (None if trans is None
                       else os.path.join(trans, "tmp"))
     self.stat_all_path = os.path.join(self.out_all, "statistics")
     self.stat_express_path = os.path.join(self.out_express, "statistics")
     self.all_strain = "all_genomes_uniprot.csv"
Exemple #46
0
 def __init__(self, args_circ):
     """Prepare helper objects and the paths used for circRNA detection.

     Output locations are sub-folders of ``args_circ.output_folder``.
     """
     def in_out(name):
         return os.path.join(args_circ.output_folder, name)

     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.alignment_path = in_out("segemehl_alignment_files")
     self.splice_path = in_out("segemehl_splice_results")
     self.candidate_path = in_out("circRNA_tables")
     self.gff_folder = in_out("gffs")
     self.gff_path = os.path.join(args_circ.gffs, "tmp")
     # File name and prefix of the two kinds of result tracks.
     self.splices = {
         "file": "splicesites.bed",
         "splice": "splicesites",
     }
     self.trans = {
         "file": "transrealigned.bed",
         "trans": "transrealigned",
     }
     self.fasta_path = os.path.join(args_circ.fastas, "tmp")
Exemple #47
0
 def __init__(self, args_sorf):
     """Prepare helper objects and the paths used for sORF detection.

     The TSS and sRNA inputs are optional; their paths are ``None`` when the
     corresponding argument is ``None``.
     """
     def optional_tmp(folder):
         # ``None`` stays ``None``; otherwise point at the ``tmp`` sub-folder.
         if folder is None:
             return None
         return os.path.join(folder, "tmp")

     self.multiparser = Multiparser()
     self.helper = Helper()
     self.tss_path = optional_tmp(args_sorf.tsss)
     self.srna_path = optional_tmp(args_sorf.srnas)
     out_folder = args_sorf.out_folder
     self.gff_output = os.path.join(out_folder, "gffs")
     self.table_output = os.path.join(out_folder, "tables")
     self.tran_path = os.path.join(args_sorf.trans, "tmp")
     self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
     self.all_cand = "all_candidates"
     self.best = "best_candidates"
Exemple #48
0
 def __init__(self, args_srna):
     """Prepare helper objects and the paths used for sRNA detection.

     Output locations are derived from ``args_srna.out_folder``.  The TSS,
     processing-site, sORF and terminator inputs are passed through
     ``self._check_folder_exist``, whose result is stored as the path.
     """
     def in_out(name):
         return os.path.join(args_srna.out_folder, name)

     self.args_container = ArgsContainer()
     self.helper = Helper()
     self.multiparser = Multiparser()
     self.gff_output = in_out("gffs")
     self.table_output = in_out("tables")
     self.stat_path = in_out("statistics")
     self.tss_path = self._check_folder_exist(args_srna.tss_folder)
     self.pro_path = self._check_folder_exist(args_srna.pro_folder)
     self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
     self.fasta_path = os.path.join(args_srna.fastas, "tmp")
     self.tran_path = os.path.join(args_srna.trans, "tmp")
     self.term_path = self._check_folder_exist(args_srna.terms)
     self.merge_wigs = in_out("merge_wigs")
     # Prefixes of intermediate files, all located in the output folder.
     self.prefixs = {"merge": in_out("tmp_merge"),
                     "utr": in_out("tmp_utrsrna"),
                     "normal": in_out("tmp_normal"),
                     "in_cds": in_out("tmp_incds"),
                     "merge_table": in_out("tmp_merge_table"),
                     "utr_table": in_out("tmp_utrsrna_table"),
                     "normal_table": in_out("tmp_normal_table"),
                     "in_cds_table": in_out("tmp_incds_table"),
                     "basic": in_out("tmp_basic"),
                     "energy": in_out("tmp_energy")}
     self.tmps = {"nr": in_out("tmp_nr"),
                  "srna": in_out("tmp_sRNA")}
     # ``table_output`` and ``stat_path`` were previously assigned a second
     # time here with identical values; the duplicates have been dropped.
     self.best_table = os.path.join(self.table_output, "best")
     self.all_best = {"all_gff": os.path.join(
                          self.gff_output, "all_candidates"),
                      "best_gff": os.path.join(self.gff_output, "best"),
                      "all_table": os.path.join(
                          self.table_output, "all_candidates"),
                      "best_table": os.path.join(self.table_output, "best")}
Exemple #49
0
 def __init__(self, out_folder):
     """Instantiate the helper objects and name the paths below ``out_folder``.

     Only path strings are computed here; nothing is created on disk.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gffparser = Gff3Parser()
     self.tmp_id = os.path.join(out_folder, "tmp_id_list")
     self.all_result = os.path.join(out_folder, "all_results")
     self.best_result = os.path.join(out_folder, "best_results")
     self.fig = os.path.join(out_folder, "figures")
     self.with_strain = "with_strain"
     self.without_strain = "without_strain"
     # Log names are bare file names ...
     tmp_files = {"log": "tmp_log", "action": "tmp_action.log",
                  "pubmed": "tmp_pubmed.log"}
     # ... while the remaining temporaries are placed in ``out_folder``.
     for key, name in (("specific", "tmp_specific"),
                       ("nospecific", "tmp_nospecific"),
                       ("wget_action", "tmp_action")):
         tmp_files[key] = os.path.join(out_folder, name)
     self.tmp_files = tmp_files
Exemple #50
0
 def __init__(self, args_ratt):
     """Prepare helper objects and the paths used for annotation transfer.

     GenBank/EMBL locations are placed below ``args_ratt.ref_embls`` and the
     temporary annotation files below ``args_ratt.gff_outfolder``.
     """
     self.multiparser = Multiparser()
     self.converter = Converter()
     self.format_fixer = FormatFixer()
     self.helper = Helper()
     self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
     self.gbk_tmp = os.path.join(self.gbk, "tmp")
     self.embl = os.path.join(args_ratt.ref_embls, "embls")
     self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
     tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                  "ref": os.path.join(args_ratt.ref_fastas, "tmp")}
     # The remaining temporaries all live in the gff output folder.
     for key, name in (("out_gff", "tmp"),
                       ("gff", "tmp.gff"),
                       ("ptt", "tmp.ptt"),
                       ("rnt", "tmp.rnt")):
         tmp_files[key] = os.path.join(args_ratt.gff_outfolder, name)
     self.tmp_files = tmp_files
Exemple #51
0
 def __init__(self, args_tran):
     """Prepare helper objects and the path/file names used for transcripts.

     All locations are derived from ``args_tran.out_folder``; nothing is
     created on disk here.
     """
     def in_gffs(name):
         # Must only be called after ``self.gff_outfolder`` has been set.
         return os.path.join(self.gff_outfolder, name)

     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     out_folder = args_tran.out_folder
     self.gff_outfolder = os.path.join(out_folder, "gffs")
     self.tran_path = in_gffs("tmp")
     self.stat_path = os.path.join(out_folder, "statistics")
     self.tmps = {
         "gff": "tmp.gff",
         "merge": "tmp_merge",
         "tran": os.path.join(out_folder, "tmp_tran"),
         "tss_ta": in_gffs("tmp_tss_ta"),
         "ta_tss": in_gffs("tmp_ta_tss"),
         "ta_gff": in_gffs("tmp_ta_gff"),
         "gff_ta": in_gffs("tmp_gff_ta"),
         "uni": in_gffs("tmp_uni"),
         "overlap": in_gffs("tmp_overlap"),
     }
     self.frag = "transcript_fragment.gff"
     self.tex = "transcript_tex_notex.gff"
     self.endfix_tran = "transcript.gff"
Exemple #52
0
 def __init__(self, args_term):
     """Prepare helper objects and the paths used for terminator prediction.

     Output folders for GFF and table results each get the same four
     candidate sub-folders.  Several temporary locations are placed in the
     current working directory.  ``srna_path`` is ``None`` when
     ``args_term.srnas`` is falsy.  ``self._make_gff_folder()`` is called
     last.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()
     self.gff_parser = Gff3Parser()
     self.gff_path = os.path.join(args_term.gffs, "tmp")
     self.fasta_path = os.path.join(args_term.fastas, "tmp")
     self.tran_path = os.path.join(args_term.trans, "tmp")
     self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                       "csv": os.path.join(args_term.out_folder, "tables")}
     # The GFF and table outputs share the same candidate sub-folders.
     candidate_dirs = (("all", "all_candidates"),
                       ("express", "expressed_candidates"),
                       ("best", "best_candidates"),
                       ("non", "non_expressed_candidates"))
     self.terms = {}
     for key, name in candidate_dirs:
         self.terms[key] = os.path.join(self.outfolder["term"], name)
     self.csvs = {}
     for key, name in candidate_dirs:
         self.csvs[key] = os.path.join(self.outfolder["csv"], name)
     self.combine_path = os.path.join(self.gff_path, "combine")
     cwd = os.getcwd()
     self.tmps = {
         "transterm": os.path.join(cwd, "tmp_transterm"),
         "hp": "transtermhp",
         "hp_gff": "transtermhp.gff",
         "hp_path": "tmp_transterm/tmp",
         "term_table": os.path.join(cwd, "tmp_term_table"),
         "merge": os.path.join(cwd, "tmp_merge_gff"),
         "gff": "tmp.gff",
         "folder": os.path.join(cwd, "tmp"),
     }
     self.suffixs = {
         "gff": "term.gff",
         "csv": "term.csv",
         "allgff": "term_all.gff",
     }
     self.srna_path = (os.path.join(args_term.srnas, "tmp")
                       if args_term.srnas else None)
     self._make_gff_folder()
Exemple #53
0
 def __init__(self, args_sc):
     """Create the ``screenshots`` output tree for the genome in ``args_sc.fasta``.

     The process exits if ``<output_folder>/screenshots`` already exists.
     Otherwise the folder is created, ``args_sc.output_folder`` is redirected
     to it, and ``forward``/``reverse`` sub-folders are made below a folder
     named after the fasta file (its name without the last extension).
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     screenshot_dir = os.path.join(args_sc.output_folder, "screenshots")
     # An existing folder is treated as an error rather than being reused.
     if os.path.exists(screenshot_dir):
         print("Error: The {0} already exist!!!".format(screenshot_dir))
         sys.exit()
     os.mkdir(screenshot_dir)
     # The caller's argument object is updated in place.
     args_sc.output_folder = screenshot_dir
     fasta_name = args_sc.fasta.rpartition("/")[2]
     # Everything before the last "." (empty when there is no ".").
     self.strain = fasta_name.rpartition(".")[0]
     strain_dir = os.path.join(args_sc.output_folder, self.strain)
     self.helper.check_make_folder(strain_dir)
     self.forward_file = os.path.join(strain_dir, "forward")
     self.reverse_file = os.path.join(strain_dir, "reverse")
     for strand_dir in (self.forward_file, self.reverse_file):
         os.mkdir(strand_dir)
Exemple #54
0
 def __init__(self, args_pro):
     """Prepare helper objects and the paths used for promoter detection.

     ``gff_path`` is ``None`` when ``args_pro.gffs`` is ``None``.  Per-class
     fasta files are placed in a ``tmp`` folder below the current working
     directory; ``all_no_orph`` and ``all`` are bare file names.
     """
     def in_tmp(name):
         # Must only be called after ``self.tmp_folder`` has been set.
         return os.path.join(self.tmp_folder, name)

     self.multiparser = Multiparser()
     self.helper = Helper()
     self.tss_path = os.path.join(args_pro.tsss, "tmp")
     gffs = args_pro.gffs
     self.gff_path = None if gffs is None else os.path.join(gffs, "tmp")
     self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
     self.tmp_folder = os.path.join(os.getcwd(), "tmp")
     self.fastas = {
         "pri": in_tmp("primary.fa"),
         "sec": in_tmp("secondary.fa"),
         "inter": in_tmp("internal.fa"),
         "anti": in_tmp("antisense.fa"),
         "orph": in_tmp("orphan.fa"),
         "all_no_orph": "without_orphan.fa",
         "all": "all_type.fa",
         "tmp_fa": in_tmp("tmp.fa"),
         "tmp_all": in_tmp("tmp_all.fa"),
     }
     self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
     self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")
Exemple #55
0
 def __init__(self, args_sub):
     """Prepare helper objects and paths for subcellular localization.

     Results are split into an ``all_CDS`` and an ``expressed_CDS`` branch
     below ``args_sub.out_folder``, each with the same sub-folders.
     ``tran_path`` is ``None`` when ``args_sub.trans`` is ``None``.
     ``self._make_folder()`` is called last.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.fixer = FormatFixer()
     self.gff_path = os.path.join(args_sub.gffs, "tmp")
     self.fasta_path = os.path.join(args_sub.fastas, "tmp")
     trans = args_sub.trans
     self.tran_path = (None if trans is None
                       else os.path.join(trans, "tmp"))
     self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
     self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
     all_dir = self.out_all
     express_dir = self.out_express
     # Both branches get the same four sub-folders.
     self.all_tmp_path = os.path.join(all_dir, "tmp")
     self.express_tmp_path = os.path.join(express_dir, "tmp")
     self.all_stat_path = os.path.join(all_dir, "statistics")
     self.express_stat_path = os.path.join(express_dir, "statistics")
     self.all_tmp_result = os.path.join(all_dir, "tmp_results")
     self.express_tmp_result = os.path.join(express_dir, "tmp_results")
     self.all_result = os.path.join(all_dir, "psortb_results")
     self.express_result = os.path.join(express_dir, "psortb_results")
     self.endfix_table = "table.csv"
     self.endfix_raw = "raw.txt"
     self._make_folder()
Exemple #56
0
 def __init__(self, args_ribo):
     """Prepare helper objects and the paths used for riboswitch detection.

     Inputs are read from the ``tmp`` sub-folder of each input folder and
     outputs are placed below ``args_ribo.out_folder``.
     """
     def tmp_of(folder):
         return os.path.join(folder, "tmp")

     def in_out(name):
         return os.path.join(args_ribo.out_folder, name)

     self.multiparser = Multiparser()
     self.helper = Helper()
     self.gff_parser = Gff3Parser()
     self.gff_path = tmp_of(args_ribo.gffs)
     self.tss_path = tmp_of(args_ribo.tsss)
     self.tran_path = tmp_of(args_ribo.trans)
     self.fasta_path = tmp_of(args_ribo.fastas)
     self.stat_folder = in_out("statistics")
     self.gff_outfolder = in_out("gffs")
     self.table_folder = in_out("tables")
     self.scan_folder = in_out("scan_Rfam")
     self.ribos_rfam = os.path.join(args_ribo.database,
                                    "Rfam_riboswitch.cm")
     self.tmp_files = {
         "fasta": in_out("tmp_fasta"),
         "scan": in_out("tmp_scan"),
         "table": in_out("tmp_table"),
     }
     self.suffixs = {
         "csv": "riboswitch.csv",
         "txt": "riboswitch_prescan.txt",
         "re_txt": "riboswitch_scan.txt",
         "re_csv": "riboswitch_scan.csv",
     }
 def __init__(self):
     """Create the ``Multiparser`` and ``Helper`` objects used by this class."""
     self.multiparser, self.helper = Multiparser(), Helper()
class ArgsContainer(object):

    def __init__(self):
        """Attach the parser and helper objects the container relies on."""
        for attr_name, factory in (("multiparser", Multiparser),
                                   ("helper", Helper)):
            setattr(self, attr_name, factory())

    def _check_replicates(self, replicates_tex, replicates_frag):
        if (replicates_tex is not None) and (replicates_frag is not None):
            replicates = {"tex": int(replicates_tex),
                          "frag": int(replicates_frag)}
        elif replicates_tex is not None:
            replicates = {"tex": int(replicates_tex), "frag": -1}
        elif replicates_frag is not None:
            replicates = {"tex": -1, "frag": int(replicates_frag)}
        else:
            print("Error:No replicates number assign!!!")
            sys.exit()
        return replicates

    def _check_libs(self, tex_notex_libs, frag_libs):
        if (tex_notex_libs is None) and (frag_libs is None):
            print("Error: please input proper libraries!!")
        if (tex_notex_libs is not None) and (frag_libs is not None):
            libs = tex_notex_libs + frag_libs
        elif (tex_notex_libs is not None):
            libs = tex_notex_libs
        elif (frag_libs is not None):
            libs = frag_libs
        return libs

    def _parser_combine_wigs(self, subcommand):
        self.tex_path = None
        self.frag_path = None
        self.multiparser.parser_gff(self.gffs, None)
        if subcommand == "terminator":
            gff_path = os.path.join(self.gffs, "tmp")
            self.multiparser.parser_gff(gff_path, None)
        else:
            gff_path = self.gffs
        if self.tex_wigs is not None:
            self.tex_path = os.path.join(self.tex_wigs, "tmp")
            self.multiparser.parser_wig(self.tex_wigs)
            self.multiparser.combine_wig(gff_path, self.tex_path,
                                         None, self.libs)
            self.merge_wigs = self.tex_wigs
            self.wig_path = self.tex_path
        if self.frag_wigs is not None:
            self.frag_path = os.path.join(self.frag_wigs, "tmp")
            self.multiparser.parser_wig(self.frag_wigs)
            self.multiparser.combine_wig(gff_path, self.frag_path,
                                         None, self.libs)
            self.merge_wigs = self.frag_wigs
            self.wig_path = self.frag_path
        if (self.tex_path is not None) and (
                self.frag_path is not None):
            self = self._merge_wig()
        if (self.tex_path is None) and (
                self.frag_path is None):
            print("Error: There is no proper wig files assigned!!")
            sys.exit()
        return self

    def _merge_wig(self):
        self.merge_wigs = os.path.join(self.out_folder, "merge_wigs")
        if (self.tex_wigs is not None) and (
                self.frag_wigs is not None):
            self.helper.check_make_folder(self.merge_wigs)
            self.wig_path = os.path.join(self.merge_wigs, "tmp")
            self.helper.check_make_folder(self.wig_path)
            for wig in os.listdir(self.tex_wigs):
                if os.path.isfile(os.path.join(self.tex_wigs, wig)):
                    shutil.copy(os.path.join(self.tex_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.frag_wigs):
                if os.path.isfile(os.path.join(self.frag_wigs, wig)):
                    shutil.copy(os.path.join(self.frag_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.tex_path):
                if os.path.isfile(os.path.join(self.tex_path, wig)):
                    shutil.copy(os.path.join(self.tex_path, wig),
                                self.wig_path)
            for wig in os.listdir(self.frag_path):
                if os.path.isfile(os.path.join(self.frag_path, wig)):
                    self.helper.merge_file(os.path.join(self.frag_path, wig),
                                           os.path.join(self.wig_path, wig))
        elif (self.tex_wigs is not None):
            self.merge_wigs = self.tex_wigs
        elif (self.frag_wigs is not None):
            self.merge_wigs = self.frag_wigs
        return self

    def _deal_multi_inputs(self, inputs, file_type, num, command):
        if inputs is not None:
            datas = inputs.split(",")
            if num is not None:
                if (len(datas) != num):
                    print("Error: the amount of {0} is not correct!!".format(
                        command))
            new_inputs = []
            for data in datas:
                if file_type == "float":
                    new_inputs.append(float(data.strip()))
                elif file_type == "int":
                    new_inputs.append(int(data.strip()))
                else:
                    new_inputs.append(data)
            return new_inputs
        else:
            return inputs

    def container_ratt(self, ratt_path, element, transfer_type,
                       ref_embl_gbk, target_fasta, ref_fasta, ratt_folder,
                       convert_to_gff_rnt_ptt, tar_annotation_folder,
                       compare_pair):
        self.ratt_path = ratt_path
        self.element = element
        self.transfer_type = transfer_type
        self.ref_embls = ref_embl_gbk
        self.tar_fastas = target_fasta
        self.ref_fastas = ref_fasta
        self.output_path = ratt_folder
        self.convert = convert_to_gff_rnt_ptt
        self.gff_outfolder = tar_annotation_folder
        self.pairs = self._deal_multi_inputs(compare_pair, "str", None, None)
        return self

    def container_tsspredator(self, TSSpredator_path, compute_program,
                              fasta_folder, annotation_folder, wig_folder, lib,
                              output_prefix, height, height_reduction, factor,
                              factor_reduction, base_height, enrichment_factor,
                              processing_factor, replicate_match, out_folder,
                              statistics, validate_gene, merge_manual,
                              compare_transcript_assembly, fuzzy, utr_length,
                              cluster, length, re_check_orphan,
                              overlap_feature, reference_gff_folder,
                              remove_low_expression):
        self.tsspredator_path = TSSpredator_path
        self.program = compute_program
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.wig_folder = wig_folder
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.output_prefixs = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.height = height
        self.height_reduction = height_reduction
        self.factor = factor
        self.factor_reduction = factor_reduction
        self.base_height = base_height
        self.enrichment_factor = enrichment_factor
        self.processing_factor = processing_factor
        self.repmatch = replicate_match
        self.out_folder = out_folder
        self.stat = statistics
        self.validate = validate_gene
        self.manual = merge_manual
        self.ta_files = compare_transcript_assembly
        self.fuzzy = fuzzy
        self.utr_length = utr_length
        self.cluster = cluster
        self.nt_length = length
        self.check_orphan = re_check_orphan
        self.overlap_feature = overlap_feature
        self.references = reference_gff_folder
        self.remove_low_expression = remove_low_expression
        return self

    def container_optimize(self, TSSpredator_path, fasta_file, annotation_file,
                           wig_folder, manual, out_folder, strain_name,
                           max_height, max_height_reduction, max_factor,
                           max_factor_reduction, max_base_height,
                           max_enrichment_factor, max_processing_factor,
                           utr_length, lib, output_prefix, cluster, length,
                           core, program, replicate_match, steps):
        self.tsspredator_path = TSSpredator_path
        self.fastas = fasta_file
        self.gffs = annotation_file
        self.wigs = wig_folder
        self.manual = manual
        self.output_folder = out_folder
        self.project_strain = strain_name
        self.height = max_height
        self.height_reduction = max_height_reduction
        self.factor = max_factor
        self.factor_reduction = max_factor_reduction
        self.base_height = max_base_height
        self.enrichment = max_enrichment_factor
        self.processing = max_processing_factor
        self.utr = utr_length
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.replicate_name = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.cluster = cluster
        self.length = length
        self.cores = core
        self.program = program
        self.replicate = replicate_match
        self.steps = steps
        return self

    def container_terminator(
            self, TransTermHP_path, expterm_path, RNAfold_path, out_folder,
            fasta_folder, annotation_folder, transcript_folder, srna,
            statistics, tex_wig_folder, frag_wig_folder, decrease,
            highest_coverage, fuzzy_detect_coverage, fuzzy_within_transcript,
            fuzzy_downstream_transcript, fuzzy_within_gene,
            fuzzy_downstream_gene, transtermhp_folder, tex_notex_libs,
            frag_libs, tex_notex, replicates_tex, replicates_frag, table_best,
            min_loop_length, max_loop_length, min_stem_length, max_stem_length,
            min_AT_tail_length, miss_rate, range_u):
        self.TransTermHP_path = TransTermHP_path
        self.expterm_path = expterm_path
        self.RNAfold_path = RNAfold_path
        self.out_folder = out_folder
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.srnas = srna
        self.stat = statistics
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.decrease = decrease
        self.cutoff_coverage = highest_coverage
        self.fuzzy = fuzzy_detect_coverage
        self.fuzzy_up_ta = fuzzy_within_transcript
        self.fuzzy_down_ta = fuzzy_downstream_transcript
        self.fuzzy_up_gene = fuzzy_within_gene
        self.fuzzy_down_gene = fuzzy_downstream_gene
        self.hp_folder = transtermhp_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.table_best = table_best
        self.min_loop = min_loop_length
        self.max_loop = max_loop_length
        self.min_stem = min_stem_length
        self.max_stem = max_stem_length
        self.at_tail = min_AT_tail_length
        self.miss_rate = miss_rate
        self.range_u = range_u
        self = self._parser_combine_wigs("terminator")
        return self

    def container_transcript(
            self, frag_wig_path, tex_wig_path, tex_notex, length,
            annotation_folder, height, width, tolerance, tolerance_coverage,
            replicates_tex, replicates_frag, transcript_assembly_output_folder,
            compare_TSS, compare_genome_annotation, TSS_fuzzy,
            tex_treated_libs, fragmented_libs, compare_feature_genome,
            table_best, terminator_folder, fuzzy_term):
        self.frag_wigs = frag_wig_path
        self.tex_wigs = tex_wig_path
        self.tex = tex_notex
        self.length = length
        self.gffs = annotation_folder
        self.height = height
        self.width = width
        self.tolerance = tolerance
        self.low_cutoff = tolerance_coverage
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.out_folder = transcript_assembly_output_folder
        self.compare_tss = compare_TSS
        self.compare_cds = compare_genome_annotation
        self.fuzzy = TSS_fuzzy
        self.tlibs = self._deal_multi_inputs(tex_treated_libs, "str", None,
                                             None)
        self.flibs = self._deal_multi_inputs(fragmented_libs, "str", None,
                                             None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.c_feature = self._deal_multi_inputs(compare_feature_genome, "str",
                                                 None, None)
        self.table_best = table_best
        self.terms = terminator_folder
        self.fuzzy_term = fuzzy_term
        self = self._parser_combine_wigs("transcript")
        return self

    def container_utr(self, tss_folder, annotation_folder,
                      transcript_assembly_folder, terminator_folder,
                      terminator_fuzzy, utr_folder, tss_source, base_5utr,
                      length, base_3utr):
        self.tsss = tss_folder
        self.gffs = annotation_folder
        self.trans = transcript_assembly_folder
        self.terms = terminator_folder
        self.fuzzy = terminator_fuzzy
        self.out_folder = utr_folder
        self.source = tss_source
        self.base_5utr = base_5utr
        self.base_3utr = base_3utr
        self.length = length
        return self

    def container_srna(
            self, Vienna_folder, Vienna_utils, blast_plus_folder,
            ps2pdf14_path, srna_folder, UTR_derived_sRNA, annotation_folder,
            TSS_folder, transcript_assembly_folder, TSS_intergenic_fuzzy,
            TSS_5UTR_fuzzy, TSS_3UTR_fuzzy, TSS_interCDS_fuzzy, import_info,
            tex_wig_folder, frag_wig_folder, processing_site_folder,
            fasta_folder, mountain_plot, nr_format, srna_format,
            sRNA_database_path, nr_database_path, cutoff_energy,
            run_intergenic_TEX_coverage, run_intergenic_noTEX_coverage,
            run_intergenic_fragmented_coverage, run_antisense_TEX_coverage,
            run_antisense_noTEX_coverage, run_antisense_fragmented_coverage,
            intergenic_tolerance, run_utr_TEX_coverage, run_utr_noTEX_coverage,
            run_utr_fragmented_coverage, max_length, min_length,
            tex_notex_libs, frag_libs, replicates_tex, replicates_frag,
            tex_notex, blast_e_nr, blast_e_srna, detect_sRNA_in_CDS,
            table_best, decrease_intergenic, decrease_utr, fuzzy_intergenic,
            fuzzy_utr, cutoff_nr_hit, sORF, best_with_all_sRNAhit,
            best_without_sORF_candidate, overlap_percent_CDS,
            terminator_folder, terminator_fuzzy_in_CDS,
            terminator_fuzzy_out_CDS, best_with_terminator,
            ignore_hypothetical_protein, TSS_source, min_utr_coverage,
            promoter_table, best_with_promoter, ranking_promoter,
            promoter_name):
        self.vienna_path = Vienna_folder
        self.vienna_util = Vienna_utils
        self.blast_path = blast_plus_folder
        self.ps2pdf14_path = ps2pdf14_path
        self.out_folder = srna_folder
        self.utr_srna = UTR_derived_sRNA
        self.gffs = annotation_folder
        self.tss_folder = TSS_folder
        self.trans = transcript_assembly_folder
        self.fuzzy_inter_tss = TSS_intergenic_fuzzy
        self.fuzzy_5utr_tss = TSS_5UTR_fuzzy
        self.fuzzy_3utr_tss = TSS_3UTR_fuzzy
        self.fuzzy_intercds_tss = TSS_interCDS_fuzzy
        self.fuzzy_tsss = {"5utr": self.fuzzy_5utr_tss,
                           "3utr": self.fuzzy_3utr_tss,
                           "interCDS": self.fuzzy_intercds_tss,
                           "inter": self.fuzzy_inter_tss}
        self.import_info = self._deal_multi_inputs(import_info, "str",
                                                   None, None)
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.pro_folder = processing_site_folder
        self.fastas = fasta_folder
        self.mountain = mountain_plot
        self.nr_format = nr_format
        self.srna_format = srna_format
        self.srna_database = sRNA_database_path
        self.nr_database = nr_database_path
        self.energy = cutoff_energy
        self.coverage_tex = self._deal_multi_inputs(
                run_intergenic_TEX_coverage, "float", 5,
                "--run_intergenic_TEX_coverage")
        self.coverage_notex = self._deal_multi_inputs(
                run_intergenic_noTEX_coverage, "float", 5,
                "--run_intergenic_noTEX_coverage")
        self.coverage_frag = self._deal_multi_inputs(
                run_intergenic_fragmented_coverage, "float", 5,
                "--run_intergenic_fragmented_coverage")
        self.anti_cover_tex = self._deal_multi_inputs(
                run_antisense_TEX_coverage, "float", 5,
                "--run_antisense_TEX_coverage")
        self.anti_cover_notex = self._deal_multi_inputs(
                run_antisense_noTEX_coverage, "float", 5,
                "--run_antisense_noTEX_coverage")
        self.anti_cover_frag = self._deal_multi_inputs(
                run_antisense_fragmented_coverage, "float", 5,
                "--run_antisense_fragmented_coverage")
        self.tolerance = intergenic_tolerance
        self.utr_tex_cover = self._deal_multi_inputs(
                run_utr_TEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        self.utr_notex_cover = self._deal_multi_inputs(
                run_utr_noTEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        self.utr_frag_cover = self._deal_multi_inputs(
                run_utr_fragmented_coverage, "str", 3,
                "--run_utr_fragmented_coverage")
        self.max_len = max_length
        self.min_len = min_length
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.tex_notex = tex_notex
        self.e_nr = blast_e_nr
        self.e_srna = blast_e_srna
        self.in_cds = detect_sRNA_in_CDS
        self.table_best = table_best
        self.decrease_inter = decrease_intergenic
        self.decrease_utr = decrease_utr
        self.fuzzy_inter = fuzzy_intergenic
        self.fuzzy_utr = fuzzy_utr
        self.nr_hits_num = cutoff_nr_hit
        self.sorf_file = sORF
        self.all_hit = best_with_all_sRNAhit
        self.best_sorf = best_without_sORF_candidate
        self.cutoff_overlap = overlap_percent_CDS
        self.terms = terminator_folder
        self.fuzzy_b = terminator_fuzzy_in_CDS
        self.fuzzy_a = terminator_fuzzy_out_CDS
        self.best_term = best_with_terminator
        self.hypo = ignore_hypothetical_protein
        self.tss_source = TSS_source
        self.min_utr = min_utr_coverage
        self.promoter_table = promoter_table
        self.best_promoter = best_with_promoter
        if ranking_promoter < 1:
            print("Error: --ranking_time_promoter must larger than 1...")
            sys.exit()
        self.rank_promoter = ranking_promoter
        self.promoter_name = self._deal_multi_inputs(promoter_name, "str",
                                                     None, None)
        self = self._parser_combine_wigs("srna")
        return self

    def container_intersrna(self, file_type, files, args_srna, prefix,
                            gff_file, tran_file, tss_file, pro_file, fuzzy):
        args_srna.file_type = file_type
        args_srna.gff_file = gff_file
        args_srna.tran_file = tran_file
        args_srna.tss_file = tss_file
        args_srna.pro_file = pro_file
        args_srna.fuzzy = fuzzy
        args_srna.prefix = prefix
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.cutoffs = args_srna.coverage_frag
            args_srna.tss_source = True
            args_srna.cut_notex = None
            args_srna.anti_notex_cutoff = None
        else:
            args_srna.wig_f_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.cutoffs = args_srna.coverage_tex
            args_srna.tss_source = args_srna.tss_source
            args_srna.cut_notex = args_srna.coverage_notex
            args_srna.anti_notex_cutoff = args_srna.anti_cover_notex
        return args_srna

    def container_utrsrna(self, gff, tran, tss, files, pro, fasta, file_type,
                          prefix, args_srna):
        args_srna.file_type = file_type
        args_srna.gff_file = gff
        args_srna.ta_file = tran
        args_srna.tss_file = tss
        args_srna.pro_file = pro
        args_srna.prefix = prefix
        args_srna.seq_file = fasta
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.utr_coverages = args_srna.utr_frag_cover
            args_srna.notex = None
        else:
            args_srna.wig_f_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.utr_coverages = args_srna.utr_tex_cover
            args_srna.notex = args_srna.utr_notex_cover
        args_srna.coverages = {"5utr": args_srna.utr_coverages[0],
                               "3utr": args_srna.utr_coverages[1],
                               "interCDS": args_srna.utr_coverages[2]}
        if args_srna.notex is not None:
            args_srna.cover_notex = {"5utr": args_srna.notex[0],
                                     "3utr": args_srna.notex[1],
                                     "interCDS": args_srna.notex[2]}
        else:
            args_srna.cover_notex = None
        return args_srna

    def extend_inter_container(self, args_srna, tsss, pros, wigs_f, wigs_r,
                               nums, output, out_table, texs, detects,
                               cutoff_coverage, notex):
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wigs_f = wigs_f
        args_srna.wigs_r = wigs_r
        args_srna.nums = nums
        args_srna.output = output
        args_srna.out_table = out_table
        args_srna.texs = texs
        args_srna.detects = detects
        args_srna.cutoff_coverage = cutoff_coverage
        args_srna.notex = notex
        return args_srna

    def extend_utr_container(self, args_srna, cdss, tsss, pros, wig_fs, wig_rs,
                             out, out_t, texs):
        args_srna.cdss = cdss
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wig_fs = wig_fs
        args_srna.wig_rs = wig_rs
        args_srna.out = out
        args_srna.out_t = out_t
        args_srna.texs = texs
        args_srna.utrs = []
        args_srna.srnas = []
        return args_srna

    def container_sorf(self, sorf_folder, UTR_derived_sORF, transcript_folder,
                       annotation_folder, TSS_folder, utr_length, min_length,
                       max_length, tex_wig_folder, frag_wig_folder,
                       cutoff_intergenic_coverage, cutoff_antisense_coverage,
                       cutoff_5utr_coverage, cutoff_3utr_coverage,
                       cutoff_interCDS_coverage, fasta_folder, tex_notex_libs,
                       frag_libs, tex_notex, replicates_tex, replicates_frag,
                       table_best, sRNA_folder, start_codon, stop_codon,
                       cutoff_background, fuzzy_rbs, rbs_not_after_TSS,
                       print_all_combination, best_no_sRNA, best_no_TSS,
                       ignore_hypothetical_protein, min_rbs_distance,
                       max_rbs_distance):
        self.out_folder = sorf_folder
        self.utr_detect = UTR_derived_sORF
        self.trans = transcript_folder
        self.gffs = annotation_folder
        self.tsss = TSS_folder
        self.utr_length = utr_length
        self.min_len = min_length
        self.max_len = max_length
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.cutoff_inter = cutoff_intergenic_coverage
        self.cutoff_anti = cutoff_antisense_coverage
        self.cutoff_5utr = cutoff_5utr_coverage
        self.cutoff_3utr = cutoff_3utr_coverage
        self.cutoff_intercds = cutoff_interCDS_coverage
        self.fastas = fasta_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.table_best = table_best
        self.srnas = sRNA_folder
        self.start_codon = self._deal_multi_inputs(start_codon, "str",
                                                   None, None)
        self.stop_codon = self._deal_multi_inputs(stop_codon, "str",
                                                  None, None)
        self.background = cutoff_background
        self.fuzzy_rbs = fuzzy_rbs
        self.noafter_tss = rbs_not_after_TSS
        self.print_all = print_all_combination
        self.no_srna = best_no_sRNA
        self.no_tss = best_no_TSS
        self.hypo = ignore_hypothetical_protein
        self.min_rbs = min_rbs_distance
        self.max_rbs = max_rbs_distance
        self = self._parser_combine_wigs("sorf")
        return self

    def container_srna_target(self, Vienna_folder, annotation_path, fasta_path,
                              sRNA_path, query_sRNA, program,
                              interaction_length, window_size_target,
                              span_target, window_size_srna, span_srna,
                              unstructured_region_RNAplex_target,
                              unstructured_region_RNAplex_srna,
                              unstructured_region_RNAup, energy_threshold,
                              duplex_distance, top, starget_output_folder,
                              process_rnaplex, process_rnaup, continue_rnaup,
                              potential_target_start, potential_target_end,
                              target_feature):
        self.vienna_path = Vienna_folder
        self.gffs = annotation_path
        self.fastas = fasta_path
        self.srnas = sRNA_path
        self.query = self._deal_multi_inputs(query_sRNA, "str", None, None)
        self.program = program
        self.inter_length = interaction_length
        self.win_size_t = window_size_target
        self.span_t = span_target
        self.win_size_s = window_size_srna
        self.span_s = span_srna
        self.unstr_region_rnaplex_t = unstructured_region_RNAplex_target
        self.unstr_region_rnaplex_s = unstructured_region_RNAplex_srna
        self.unstr_region_rnaup = unstructured_region_RNAup
        self.energy = energy_threshold
        self.duplex_dist = duplex_distance
        self.top = top
        self.out_folder = starget_output_folder
        self.core_plex = process_rnaplex
        self.core_up = process_rnaup
        self.continue_rnaup = continue_rnaup
        self.tar_start = potential_target_start
        self.tar_end = potential_target_end
        self.features = self._deal_multi_inputs(target_feature, "str",
                                                None, None)
        return self

    def container_goterm(self, annotation_path, goterm_output_folder,
                         UniProt_id, go_obo, goslim_obo, transcript_path):
        self.gffs = annotation_path
        self.out_folder = goterm_output_folder
        self.uniprot = UniProt_id
        self.go = go_obo
        self.goslim = goslim_obo
        self.trans = transcript_path
        return self

    def container_sublocal(self, Psortb_path, gff_path, fasta_path,
                           bacteria_type, difference_multi, merge_to_gff,
                           sublocal_output_folder, transcript_path):
        self.psortb_path = Psortb_path
        self.gffs = gff_path
        self.fastas = fasta_path
        self.gram = bacteria_type
        self.fuzzy = difference_multi
        self.merge = merge_to_gff
        self.out_folder = sublocal_output_folder
        self.trans = transcript_path
        return self

    def container_ppi(self, gff_path, proteinID_strains, without_strain_pubmed,
                      species_STRING, score, ppi_output_folder, node_size,
                      query):
        self.ptts = gff_path
        self.strains = self._deal_multi_inputs(proteinID_strains, "str",
                                               None, None)
        self.no_specific = without_strain_pubmed
        self.species = species_STRING
        self.score = score
        self.out_folder = ppi_output_folder
        self.size = node_size
        self.querys = self._deal_multi_inputs(query, "str", None, None)
        return self

    def container_promoter(self, MEME_path, promoter_output_folder, tex_libs,
                           TSS_folder, fasta_folder, num_motif, nt_before_TSS,
                           motif_width, TSS_source, tex_wig_path,
                           annotation_folder, combine_all, e_value):
        self.meme_path = MEME_path
        self.output_folder = promoter_output_folder
        self.input_libs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.tsss = TSS_folder
        self.fastas = fasta_folder
        self.num_motif = num_motif
        self.nt_before = nt_before_TSS
        self.widths = self._deal_multi_inputs(motif_width, "str", None, None)
        self.source = TSS_source
        self.wigs = tex_wig_path
        self.gffs = annotation_folder
        self.combine = combine_all
        self.e_value = e_value
        return self

    def container_operon(self, TSS_folder, annotation_folder,
                         transcript_folder, UTR5_folder, UTR3_folder,
                         term_folder, TSS_fuzzy, term_fuzzy, min_length,
                         statistics, operon_output_folder, combine_gff,
                         operon_statistics_folder):
        '''store the settings of the operon detection module

        Every argument is copied unchanged onto the attribute named in
        the table below. Returns self.
        '''
        settings = (
            ("tsss", TSS_folder),
            ("gffs", annotation_folder),
            ("trans", transcript_folder),
            ("utr5s", UTR5_folder),
            ("utr3s", UTR3_folder),
            ("terms", term_folder),
            ("tss_fuzzy", TSS_fuzzy),
            ("term_fuzzy", term_fuzzy),
            ("length", min_length),
            ("statistics", statistics),
            ("output_folder", operon_output_folder),
            ("combine", combine_gff),
            ("stat_folder", operon_statistics_folder),
        )
        for attribute, value in settings:
            setattr(self, attribute, value)
        return self

    def container_snp(self, samtools_path, bcftools_path, bam_type, program,
                      fasta_path, tex_bam_path, frag_bam_path, quality,
                      read_depth, snp_output_folder, indel_fraction, chrom):
        '''store the settings of the SNP calling module

        program is passed through
        self._deal_multi_inputs(program, "str", None, None). chrom is
        stored as "1" when it equals "haploid", as "2" when it equals
        "diploid", and unchanged otherwise. All other arguments are
        stored as given. Returns self.
        '''
        programs = self._deal_multi_inputs(program, "str", None, None)
        # translate the ploidy keyword into the stored code
        if chrom == "haploid":
            ploidy = "1"
        else:
            ploidy = "2" if chrom == "diploid" else chrom
        # external tools
        self.samtools_path = samtools_path
        self.bcftools_path = bcftools_path
        # inputs
        self.types = bam_type
        self.program = programs
        self.fastas = fasta_path
        self.normal_bams = tex_bam_path
        self.frag_bams = frag_bam_path
        # filters and output
        self.quality = quality
        self.depth = read_depth
        self.out_folder = snp_output_folder
        self.fraction = indel_fraction
        self.chrom = ploidy
        return self

    def container_circrna(self, align, process, fasta_path, annotation_path,
                          tex_bam_path, fragmented_bam_path, read_folder,
                          circrna_stat_folder, support_reads,
                          segemehl_folder, samtools_path, start_ratio,
                          end_ratio, ignore_hypothetical_protein, out_folder):
        '''store the settings of the circular RNA detection module

        Every argument is copied unchanged onto the attribute named in
        the table below. Returns self.
        '''
        settings = (
            ("align", align),
            ("cores", process),
            ("fastas", fasta_path),
            ("gffs", annotation_path),
            ("normal_bams", tex_bam_path),
            ("frag_bams", fragmented_bam_path),
            ("read_folder", read_folder),
            ("stat_folder", circrna_stat_folder),
            ("support", support_reads),
            ("segemehl_path", segemehl_folder),
            ("samtools_path", samtools_path),
            ("start_ratio", start_ratio),
            ("end_ratio", end_ratio),
            ("hypo", ignore_hypothetical_protein),
            ("output_folder", out_folder),
        )
        for attribute, value in settings:
            setattr(self, attribute, value)
        return self

    def container_ribos(self, infernal_path, riboswitch_ID, gff_path,
                        fasta_path, tss_path, transcript_path, Rfam,
                        ribos_output_folder, e_value, output_all,
                        database_folder, fuzzy, start_codon, min_dist_rbs,
                        max_dist_rbs, fuzzy_rbs, UTR_length):
        '''store the settings of the riboswitch detection module

        start_codon is passed through
        self._deal_multi_inputs(start_codon, "str", None, None); every
        other argument is stored unchanged. Returns self.
        '''
        codons = self._deal_multi_inputs(start_codon, "str", None, None)
        # program, database and identifiers
        self.infernal_path = infernal_path
        self.ribos_id = riboswitch_ID
        self.rfam = Rfam
        self.database = database_folder
        # input folders
        self.gffs = gff_path
        self.fastas = fasta_path
        self.tsss = tss_path
        self.trans = transcript_path
        # output options
        self.out_folder = ribos_output_folder
        self.output_all = output_all
        # search parameters
        self.e_value = e_value
        self.fuzzy = fuzzy
        self.start_codons = codons
        self.start_rbs = min_dist_rbs
        self.end_rbs = max_dist_rbs
        self.fuzzy_rbs = fuzzy_rbs
        self.utr = UTR_length
        return self

    def container_screen(self, main_gff, side_gffs, fasta, frag_wig_folder,
                         tex_wig_folder, height, tex_libs, frag_libs, present,
                         output_folder):
        '''store the settings of the screenshot module

        side_gffs, tex_libs and frag_libs are passed through
        self._deal_multi_inputs(value, "str", None, None); every other
        argument is stored unchanged. Returns self.
        '''
        def as_multi(value):
            # shared normalisation used for the multi-value options
            return self._deal_multi_inputs(value, "str", None, None)

        side_tracks = as_multi(side_gffs)
        tex_tracks = as_multi(tex_libs)
        frag_tracks = as_multi(frag_libs)
        # annotation and sequence
        self.main_gff = main_gff
        self.side_gffs = side_tracks
        self.fasta = fasta
        # coverage data
        self.frag_wigs = frag_wig_folder
        self.tex_wigs = tex_wig_folder
        self.tlibs = tex_tracks
        self.flibs = frag_tracks
        # display and output
        self.height = height
        self.present = present
        self.output_folder = output_folder
        return self
Exemple #59
0
class MEME(object):
    '''detection of promoter'''

    def __init__(self, args_pro):
        '''create the helper objects and the paths used by the other methods

        Reads args_pro.tsss, args_pro.gffs (may be None),
        args_pro.output_folder and args_pro.fastas. The temporary
        folder is "tmp" inside the current working directory.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        self.gff_path = (None if args_pro.gffs is None
                         else os.path.join(args_pro.gffs, "tmp"))
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")

        def in_tmp(name):
            return os.path.join(self.tmp_folder, name)

        # "all_no_orph" and "all" are bare file names; the other entries
        # are full paths inside the temporary folder.
        self.fastas = {"pri": in_tmp("primary.fa"),
                       "sec": in_tmp("secondary.fa"),
                       "inter": in_tmp("internal.fa"),
                       "anti": in_tmp("antisense.fa"),
                       "orph": in_tmp("orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": in_tmp("tmp.fa"),
                       "tmp_all": in_tmp("tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename,
                           str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                            out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                      meme_folder, folder),
                      os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                            out_path, folder, "GLAM2")
            log.write(" ".join([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-w",
                  str(width), "-b", str(width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-w",
                  str(width), "-b", str(width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME with range of width'''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                            out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                      meme_folder, folder),
                      os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                            out_path, folder, "GLAM2")
            log.write(" ".join([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-a",
                  str(min_width), "-b", str(max_width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder), "-a",
                  str(min_width), "-b", str(max_width), "-r",
                  str(args_pro.num_motif), "-n", str(args_pro.end_run),
                  "n", os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''merge the per-class fasta files and move everything to input_path

        Builds two merged files in the temporary folder: one from the
        primary, secondary, internal and antisense files, and one that
        additionally contains the orphan file. Both are passed through
        del_repeat_fasta. Finally the five per-class files and the two
        merged files are moved to input_path as
        "<prefix>_allgenome_<class>.fa".
        '''
        merged = {}
        for key in ("all", "all_no_orph"):
            merged[key] = os.path.join(self.tmp_folder, self.fastas[key])
        # drop merged files that are still present in the tmp folder
        for key in ("all", "all_no_orph"):
            if self.fastas[key] in os.listdir(self.tmp_folder):
                os.remove(merged[key])
        tmp_fa = self.fastas["tmp_fa"]
        tmp_all = self.fastas["tmp_all"]
        # tmp_fa: every class except orphan
        shutil.copyfile(self.fastas["pri"], tmp_fa)
        for key in ("sec", "inter", "anti"):
            self.helper.merge_file(self.fastas[key], tmp_fa)
        # tmp_all: tmp_fa plus orphan
        shutil.copyfile(tmp_fa, tmp_all)
        self.helper.merge_file(self.fastas["orph"], tmp_all)
        del_repeat_fasta(tmp_fa, merged["all_no_orph"])
        del_repeat_fasta(tmp_all, merged["all"])
        for scratch in (tmp_fa, tmp_all):
            os.remove(scratch)
        out_prefix = os.path.join(input_path, prefix)
        moves = ((self.fastas["pri"], "allgenome_primary.fa"),
                 (self.fastas["sec"], "allgenome_secondary.fa"),
                 (self.fastas["inter"], "allgenome_internal.fa"),
                 (self.fastas["anti"], "allgenome_antisense.fa"),
                 (self.fastas["orph"], "allgenome_orphan.fa"),
                 (merged["all"], "allgenome_all_types.fa"),
                 (merged["all_no_orph"], "allgenome_without_orphan.fa"))
        for source, suffix in moves:
            shutil.move(source, "_".join([out_prefix, suffix]))

    def _split_fasta_by_strain(self, input_path):
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allgenome")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                           input_path, "".join([
                                               filename[0], strain,
                                               filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain <= 1:
                    os.remove(os.path.join(input_path,
                              "".join([filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                if (names[-1] in input_fastas) or (
                        ("_".join(names[-2:]) == "all_types") and (
                         "all_types" in input_fastas)) or (
                        ("_".join(names[-2:]) == "without_orphan") and (
                         "without_orphan" in input_fastas)):
                    for width in args_pro.widths:
                        print("Computing promoters of {0} - {1}".format(
                              fasta, width))
                        log.write("Computing promoters of {0} - length {1}.\n".format(
                                  fasta, width))
                        if "-" in width:
                            self._run_small_motif(input_path, out_path, filename,
                                                  fasta, width, args_pro, log)
                        else:
                            self._run_normal_motif(input_path, out_path, filename,
                                                   fasta, width, args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are generated "
                      "and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS files and all fasta files to generate the
        global input for detecting the global promoter

        Appends "allfasta" to prefixs (the list is modified in place),
        sets args_pro.source to True, runs upstream() on the merged
        files and moves the results with _move_and_merge_fasta.
        '''
        if args_pro.source:
            tss_listing = self.tss_path
        else:
            tss_listing = os.path.join(args_pro.output_folder, "TSS_classes")
        # NOTE(review): the names may be listed from "TSS_classes", but
        # the files themselves are always read from self.tss_path -
        # confirm that this is intended.
        for tss in os.listdir(tss_listing):
            if tss.endswith("_TSS.gff"):
                self.helper.merge_file(
                    os.path.join(self.tss_path, tss), self.all_tss)
        for fasta in os.listdir(args_pro.fastas):
            if fasta.endswith((".fa", ".fna", ".fasta")):
                self.helper.merge_file(
                    os.path.join(args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        for parent in (args_pro.output_folder, self.out_fasta):
            self.helper.check_make_folder(os.path.join(parent, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None,
                 None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        if "tmp" in os.listdir(os.getcwd()):
            shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (
                    program.lower() == "meme"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "MEME")):
                    csv_file = os.path.join(output_folder, strain,
                                            "MEME", folder, "meme.csv")
                    gen_promoter_table(os.path.join(output_folder, strain,
                                       "MEME", folder, "meme.txt"),
                                       csv_file, tss_file, "meme")
                    log.write("\t" + csv_file + "\n")
            if (program.lower() == "both") or (
                    program.lower() == "glam2"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "GLAM2")):
                    csv_file = os.path.join(output_folder, strain,
                                            "GLAM2", folder, "glam2.csv")
                    gen_promoter_table(os.path.join(output_folder, strain,
                                        "GLAM2", folder, "glam2.txt"),
                                        csv_file, tss_file, "glam2")
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            if (args_pro.gffs is None):
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_classes"))            
            print("Classifying TSSs and extracting sequence of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                     "_".join([prefix, "TSS.gff"])), args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) == 1:
                input_fastas.append("all_types")
            elif int(tss) == 2:
                input_fastas.append("primary")
            elif int(tss) == 3:
                input_fastas.append("secondary")
            elif int(tss) == 4:
                input_fastas.append("internal")
            elif int(tss) == 5:
                input_fastas.append("antisense")
            elif int(tss) == 6:
                input_fastas.append("orphan")
            elif int(tss) == 7:
                input_fastas.append("without_orphan")
            else:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
        return input_fastas

    def run_meme(self, args_pro, log):
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                              "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(args_pro.output_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta,
                                                       prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(self.out_fasta, "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
Exemple #60
0
class sORFDetection(object):
    '''detection of sORF'''

    def __init__(self, args_sorf):
        '''Create the helper objects and derive the working paths.

        args_sorf: argument container; this constructor reads its
        tsss, srnas, trans, fastas and out_folder attributes.

        tss_path, srna_path and tran_path are set to None when the
        matching input folder is not assigned. For trans this keeps the
        constructor from failing with a TypeError, so that
        _check_necessary_files() can report the missing transcript
        folder itself.
        '''
        def tmp_of(folder):
            # Unassigned inputs are kept as None instead of being joined.
            if folder is None:
                return None
            return os.path.join(folder, "tmp")

        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = tmp_of(args_sorf.tsss)
        self.srna_path = tmp_of(args_sorf.srnas)
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = tmp_of(args_sorf.trans)
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        # Names of the sub-folders holding all / best sORF candidates.
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
               (args_sorf.tex_wigs is None) and (args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: TSS files are required for UTR derived"
                      " sORF detection!")
                log.write("TSS files are required for UTR derived"
                          " sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon 
        and ribosome binding site'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(os.path.join(self.fasta_path, prefix + ".fa"),
                           srna_file, os.path.join(args_sorf.out_folder,
                           "_".join([prefix, "inter.gff"])), tss_file,
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "forward.wig"])),
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "reverse.wig"])),
                           os.path.join(self.gff_output, self.all_cand,
                           "_".join([prefix, "sORF"])), args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                         os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best =  os.path.join(self.table_output, self.best,
                                         "_".join([prefix, "sORF.csv"]))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.gff"])), gff_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.gff"])), gff_best)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.csv"])), csv_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.csv"])), csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(prefix))
                get_intergenic(os.path.join(args_sorf.gffs, gff),
                               os.path.join(self.tran_path,
                               "_".join([prefix, "transcript.gff"])),
                               os.path.join(args_sorf.out_folder,
                               "_".join([prefix, "inter.gff"])),
                               args_sorf.utr_detect, args_sorf.hypo,
                               args_sorf.extend_5, args_sorf.extend_3)
                log.write("\t" + os.path.join(args_sorf.out_folder,
                          "_".join([prefix, "inter.gff"])) + 
                          " is generated to temporary store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        log.write("Running re_table.py for generating coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(args_sorf.out_folder, "tables",
                                          type_, "_".join([
                                          prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat_file = os.path.join(args_sorf.out_folder, "statistics",
                            "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf), stat_file,
                     args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)