Exemple #1
0
def intergenic_seq(seq_file, tran_file, gff_file, out_file):
    """Write the sequences of merged intergenic regions to ``out_file``.

    Regions are computed by ``get_inter`` (once for transcripts, once for
    genes), combined by ``merge_inter`` and refined by
    ``detect_confliction``. Every refined region whose start is smaller
    than its end is written as two lines: a ``>``-prefixed header of
    ``|``-joined fields (``inter_<n>``, start, end, strain, parent_p,
    parent_m, strand) followed by the sequence returned by
    ``Helper().extract_gene``.

    Args:
        seq_file: Passed unchanged to ``read_file``.
        tran_file: Passed unchanged to ``read_file``.
        gff_file: Passed unchanged to ``read_file``.
        out_file: Path of the output file; it is created or truncated.
    """
    # The context manager closes the output even if a helper raises; the
    # handle used to be left open for the lifetime of the process.
    with open(out_file, "w") as out:
        # read_file() returns a third value as well, but it was always
        # replaced by the merge_inter() result before being read.
        seq, tas, _, genes = read_file(seq_file, tran_file, gff_file)
        inter_tas = get_inter(tas, seq, "tran")
        inter_genes = get_inter(genes, seq, "gene")
        merges = merge_inter(inter_tas, inter_genes)
        helper = Helper()
        num = 0
        for tmp_merge in merges:
            for merge in detect_confliction(tmp_merge, genes, seq):
                if merge["start"] >= merge["end"]:
                    # Empty or inverted regions are not reported.
                    continue
                # Anything that is not "+" is handled as the "-" strand.
                strand = "+" if merge["strand"] == "+" else "-"
                inter_seq = helper.extract_gene(
                    seq[merge["strain"]], merge["start"], merge["end"],
                    strand)
                header = "|".join([
                    "inter_" + str(num),
                    str(merge["start"]),
                    str(merge["end"]), merge["strain"], merge["parent_p"],
                    merge["parent_m"], strand
                ])
                out.write(">" + header + "\n")
                out.write(inter_seq + "\n")
                num += 1
Exemple #2
0
def assign_tss(tss, tran):
    """Cross-reference a TSS and a transcript through their attributes.

    The transcript identifier (its ``ID`` attribute, or a
    ``feature:start-end_strand`` label when there is no ``ID``) is stored
    in, or comma-appended to, ``tss.attributes["Parent"]``. The TSS name
    (its ``Name`` attribute, or ``TSS:start_strand``) is stored in, or
    comma-appended to, ``tran.attributes["associated_tss"]``.

    Both objects are modified in place; nothing is returned.
    """
    tss_attrs = tss.attributes
    tran_attrs = tran.attributes

    # Identifier of the transcript.
    if "ID" not in tran_attrs:
        tran_strand = Helper().get_strand_name(tran.strand)
        parent_id = "".join((tran.feature, ":", str(tran.start), "-",
                             str(tran.end), "_", tran_strand))
    else:
        parent_id = tran_attrs["ID"]

    if "Parent" in tss_attrs:
        tss_attrs["Parent"] = ",".join((tss_attrs["Parent"], parent_id))
    else:
        tss_attrs["Parent"] = parent_id

    # Display name of the TSS.
    if "Name" not in tss_attrs:
        tss_strand = Helper().get_strand_name(tss.strand)
        label = "".join(("TSS:", str(tss.start), "_", tss_strand))
    else:
        label = tss_attrs["Name"]

    if "associated_tss" in tran_attrs:
        tran_attrs["associated_tss"] = ",".join(
            (tran_attrs["associated_tss"], label))
    else:
        tran_attrs["associated_tss"] = label
Exemple #3
0
def get_fasta(seq, merge, num, strand, args_term, out, out_i):
    """Write one region, split into windows when it is long, as FASTA.

    A region longer than ``args_term.window`` is cut into windows whose
    starts are ``args_term.shift`` apart; the window that would leave less
    than one shift behind it is stretched to the region end and is the
    last one. A shorter region is written as a single record.

    Each record produces a detailed ``|``-joined header line in ``out_i``
    and a ``>inter_<num>`` header plus the sequence in ``out``.

    Returns:
        ``num`` increased by the number of records written.
    """
    region_start = merge["start"]
    region_end = merge["end"]

    def windows():
        # Lazily yields the (start, end) pairs to extract, in order.
        if (region_end - region_start) <= args_term.window:
            yield region_start, region_end
            return
        for left in range(region_start, region_end + 1, args_term.shift):
            if (region_end - (left + args_term.window)) < args_term.shift:
                # The remainder is too short for another window.
                yield left, region_end
                return
            yield left, left + args_term.window

    for left, right in windows():
        inter_seq = Helper().extract_gene(seq[merge["strain"]], left, right,
                                          strand)
        record_name = "inter_" + str(num)
        fields = [
            record_name,
            str(left),
            str(right), merge["strain"], merge["parent_p"],
            merge["parent_m"], merge["p_pos"], merge["m_pos"], strand
        ]
        out_i.write(">" + "|".join(fields) + "\n")
        out.write(">inter_" + str(num) + "\n")
        out.write(inter_seq + "\n")
        num += 1
    return num
Exemple #4
0
def print_file(final_tsss, program, out_gff):
    """Write the final entries as a GFF3 file with fresh ID/Name attributes.

    For every entry the helper attribute ``print`` is dropped, ``ID``
    becomes ``<seq_id>_<program lower-cased><index>`` and ``Name`` becomes
    ``TSS:<start>_<strand>`` when ``program`` is ``"TSS"`` and
    ``processing:<start>_<strand>`` otherwise. ``attribute_string`` is
    rebuilt from the attribute dict and the entry is written as one
    tab-separated line with ``ANNOgesic`` as source.

    Args:
        final_tsss: Iterable of entries; they are modified in place.
        program: Program name; only ``"TSS"`` is treated specially.
        out_gff: Path of the GFF file to create or overwrite.
    """
    name_prefix = "TSS:" if program == "TSS" else "processing:"
    helper = Helper()
    # The context manager guarantees the file is flushed and closed; the
    # handle was previously never closed.
    with open(out_gff, "w") as out:
        out.write("##gff-version 3\n")
        for num_final, tss in enumerate(final_tsss):
            tss.attributes.pop("print", None)
            tss.attributes["ID"] = "_".join(
                [tss.seq_id, program.lower() + str(num_final)])
            strand = helper.get_strand_name(tss.strand)
            tss.attributes["Name"] = name_prefix + "_".join(
                [str(tss.start), strand])
            # Every remaining attribute value has to be a string here.
            tss.attribute_string = ";".join(
                ["=".join(items) for items in tss.attributes.items()])
            out.write("\t".join([
                str(field) for field in [
                    tss.seq_id, "ANNOgesic", tss.feature, tss.start,
                    tss.end, tss.score, tss.strand, tss.phase,
                    tss.attribute_string
                ]
            ]) + "\n")
def get_upstream(seq, tss, out, name, nt_before):
    """Write the ``nt_before`` positions upstream of a TSS as a record.

    On the ``+`` strand the window ends at ``tss.start``; on any other
    strand it begins at ``tss.start``. The window is clamped to
    ``[1, len(seq)]`` so a TSS close to either end of the sequence cannot
    request positions outside of it.

    Args:
        seq: Sequence handed to ``Helper().extract_gene``.
        tss: Object providing ``start`` and ``strand``.
        out: Writable text handle.
        name: Header line written before the sequence.
        nt_before: Requested window length.
    """
    # NOTE(review): assumes extract_gene uses 1-based inclusive
    # coordinates, hence the lower bound of 1 - confirm against Helper.
    if tss.strand == "+":
        start = max(1, tss.start - nt_before + 1)
        end = tss.start
    else:
        start = tss.start
        end = min(len(seq), tss.start + nt_before - 1)
    fasta = Helper().extract_gene(seq, start, end, tss.strand)
    out.write("{0}\n{1}\n".format(name, fasta))
def compare_cds_check_orphan(tsss, cdss):
    """Re-classify orphan TSSs against CDSs on the same sequence and strand.

    Only TSSs whose ``type`` attribute is ``"Orphan"`` when their turn
    comes are examined. For each CDS sharing ``seq_id`` and ``strand``:

    * if ``is_primary`` holds, ``type`` / ``UTR_length`` are set to (or,
      once the TSS is no longer an orphan, ``&``-extended with)
      ``Primary`` / ``Primary_<distance>`` and ``get_attributes`` is
      called;
    * if ``is_internal`` or ``is_antisense`` holds and the gene label of
      the CDS is not yet in ``associated_gene``,
      ``get_attributes_int_anti`` is called with the matching class name.

    The TSS objects are updated in place; nothing is returned.
    """

    def gene_label(cds):
        # locus_tag when available, otherwise "feature:start-end_strand".
        if "locus_tag" in cds.attributes.keys():
            return cds.attributes["locus_tag"]
        strand_name = Helper().get_strand_name(cds.strand)
        return "".join([
            cds.feature, ":",
            str(cds.start), "-",
            str(cds.end), "_", strand_name
        ])

    for tss in tsss:
        if tss.attributes["type"] != "Orphan":
            continue
        for cds in cdss:
            if (tss.seq_id != cds.seq_id) or (tss.strand != cds.strand):
                continue

            if is_primary(cds.start, cds.end, tss.start, tss.strand):
                if tss.strand == "+":
                    utr = "Primary_" + str(cds.start - tss.start)
                else:
                    utr = "Primary_" + str(tss.start - cds.end)
                if tss.attributes["type"] == "Orphan":
                    # First primary hit replaces the orphan annotation.
                    tss.attributes["type"] = "Primary"
                    tss.attributes["UTR_length"] = utr
                else:
                    # Later hits are appended to the existing annotation.
                    tss.attributes["type"] = "&".join(
                        [tss.attributes["type"], "Primary"])
                    tss.attributes["UTR_length"] = "&".join(
                        [tss.attributes["UTR_length"], utr])
                get_attributes(tss, cds)

            if is_internal(cds.start, cds.end, tss.start, tss.strand):
                if gene_label(cds) not in tss.attributes["associated_gene"]:
                    get_attributes_int_anti(tss, cds, "Internal")

            if is_antisense(cds.start, cds.end, tss.start, tss.strand):
                if gene_label(cds) not in tss.attributes["associated_gene"]:
                    get_attributes_int_anti(tss, cds, "Antisense")
Exemple #7
0
def get_upstream(seq, tss, out, name, nt_before):
    """Write the upstream window of a TSS, clamped to the sequence bounds.

    On the ``+`` strand the window covers the ``nt_before`` positions
    ending at ``tss.start`` and never starts below 1. On any other strand
    it covers the ``nt_before`` positions beginning at ``tss.start`` and
    never ends beyond ``len(seq)``. ``name`` and the extracted sequence
    are written to ``out`` on two lines.
    """
    if tss.strand == "+":
        left = max(1, tss.start - nt_before + 1)
        right = tss.start
    else:
        left = tss.start
        right = min(len(seq), tss.start + nt_before - 1)
    fasta = Helper().extract_gene(seq, left, right, tss.strand)
    out.write("{0}\n{1}\n".format(name, fasta))
Exemple #8
0
def srna_sorf_comparison(sRNA_file, sORF_file, sRNA_out, sORF_out):
    '''Comparison of sRNA and sORF. It can be a filter of sRNA detection.

    Both GFF files are parsed, cleaned with ``del_attributes`` and sorted
    by (seq_id, start, end, strand). Every sRNA/sORF pair on the same
    sequence and strand whose coordinates overlap is cross-referenced:
    ``<ID>:<start>-<end>_<strand>`` of the sORF is appended to the
    ``sORF`` attribute list of the sRNA and vice versa. The annotated
    entries are then written with ``print_file``.

    Args:
        sRNA_file: Path of the input sRNA GFF file.
        sORF_file: Path of the input sORF GFF file.
        sRNA_out: Path of the annotated sRNA GFF file to write.
        sORF_out: Path of the annotated sORF GFF file to write.
    '''

    def read_sorted(gff_file, stale_key):
        # Parse one GFF file inside a context manager so the handle is
        # closed, clean every entry and return the entries sorted.
        entries = []
        with open(gff_file) as gff_fh:
            for entry in Gff3Parser().entries(gff_fh):
                entry.attributes = del_attributes(stale_key, entry)
                entries.append(entry)
        return sorted(entries,
                      key=lambda k: (k.seq_id, k.start, k.end, k.strand))

    def describe(entry, strand):
        # "<ID>:<start>-<end>_<strand>" label used for cross references.
        return "".join([entry.attributes["ID"], ":",
                        str(entry.start), "-",
                        str(entry.end),
                        "_", strand])

    with open(sRNA_out, "w") as out_r, open(sORF_out, "w") as out_o:
        out_r.write("##gff-version 3\n")
        out_o.write("##gff-version 3\n")
        srnas = read_sorted(sRNA_file, "sORF")
        sorfs = read_sorted(sORF_file, "sRNA")
        for srna in srnas:
            for sorf in sorfs:
                if (srna.seq_id != sorf.seq_id) or (
                        srna.strand != sorf.strand):
                    continue
                srna_covers_sorf = (srna.start <= sorf.start) and (
                    srna.end >= sorf.end)
                sorf_covers_srna = (srna.start >= sorf.start) and (
                    srna.end <= sorf.end)
                srna_ends_inside = (srna.start <= sorf.start) and (
                    srna.end >= sorf.start) and (srna.end <= sorf.end)
                srna_starts_inside = (srna.start >= sorf.start) and (
                    srna.start <= sorf.end) and (srna.end >= sorf.end)
                if not (srna_covers_sorf or sorf_covers_srna or
                        srna_ends_inside or srna_starts_inside):
                    continue
                # The strand name is resolved for every overlapping pair.
                # It used to be assigned only when the attribute list was
                # created, leaving it stale or unbound otherwise. Both
                # entries share the strand, so one lookup serves both.
                strand = Helper().get_strand_name(srna.strand)
                srna.attributes.setdefault("sORF", []).append(
                    describe(sorf, strand))
                sorf.attributes.setdefault("sRNA", []).append(
                    describe(srna, strand))
        print_file(sorfs, out_o, "sRNA")
        print_file(srnas, out_r, "sORF")
def read_libs(input_libs, wig_folder):
    """Rebuild the merged forward/reverse wig files from the "tex" libraries.

    Existing ``merge_forward.wig`` / ``merge_reverse.wig`` files in the
    ``tmp`` folder of the working directory are removed first. Each
    library string is split on ``:``; when its second field is ``tex``,
    the wig file named by the first field is merged (through
    ``Helper().merge_file``) into the forward or reverse file according
    to the fifth field (``+`` or ``-``).

    Returns:
        An empty dict; nothing is ever stored in it.
    """
    libs = {}
    tmp_dir = os.path.join(os.getcwd(), "tmp")
    stale_files = (
        ("merge_forward.wig", "tmp/merge_forward.wig"),
        ("merge_reverse.wig", "tmp/merge_reverse.wig"),
    )
    for wig_name, wig_path in stale_files:
        if wig_name in os.listdir(tmp_dir):
            os.remove(wig_path)

    merged_by_strand = {"+": "merge_forward.wig", "-": "merge_reverse.wig"}
    for lib in input_libs:
        datas = lib.split(":")
        if datas[1] != "tex":
            continue
        strand = datas[4]
        if strand in merged_by_strand:
            Helper().merge_file(
                os.path.join(wig_folder, datas[0]),
                os.path.join("tmp", merged_by_strand[strand]))
    return libs
Exemple #10
0
def assign_parent(gff, tran, feature):
    """Link a feature to its transcript and record it on the transcript.

    ``tran.attributes["ID"]`` is stored in, or comma-appended to,
    ``gff.attributes["Parent"]``. A label for ``gff`` is stored in, or
    comma-appended to, ``tran.attributes["associated_<feature>"]``. The
    label is the first available of ``locus_tag``, ``protein_id`` and
    ``Name``; without any of them it is ``feature:start-end_strand``.

    Both objects are modified in place; nothing is returned.
    """
    gff_attrs = gff.attributes
    tran_attrs = tran.attributes

    tran_id = tran_attrs["ID"]
    if "Parent" in gff_attrs:
        gff_attrs["Parent"] = ",".join([gff_attrs["Parent"], tran_id])
    else:
        gff_attrs["Parent"] = tran_id

    # Pick the label by priority; fall back to a positional description.
    for candidate in ("locus_tag", "protein_id", "Name"):
        if candidate in gff_attrs:
            label = gff_attrs[candidate]
            break
    else:
        strand = Helper().get_strand_name(gff.strand)
        label = "".join([
            gff.feature, ":",
            str(gff.start), "-",
            str(gff.end), "_", strand
        ])

    key = "_".join(["associated", feature])
    if key in tran_attrs:
        tran_attrs[key] = ",".join([tran_attrs[key], label])
    else:
        tran_attrs[key] = label
Exemple #11
0
 def __init__(self, args_snp):
     """Create the helper objects and derive all paths from ``args_snp``.

     The result sub-folder is ``compare_related_and_reference_genomes``
     when ``args_snp.types`` is ``"related_genome"`` and
     ``mutations_of_reference_genomes`` otherwise. The figure folder is
     handed to ``check_make_folder`` right away.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()

     file_type = ("compare_related_and_reference_genomes"
                  if args_snp.types == "related_genome"
                  else "mutations_of_reference_genomes")
     out_folder = args_snp.out_folder
     result_root = os.path.join(out_folder, file_type)

     self.seq_path = os.path.join(result_root, "seqs")
     self.stat_path = os.path.join(result_root, "statistics")
     self.fig_path = os.path.join(self.stat_path, "figs")
     self.helper.check_make_folder(self.fig_path)

     # Result folders plus the scratch folders used while calling SNPs.
     self.outputs = dict(
         table=os.path.join(result_root, "SNP_tables"),
         raw=os.path.join(result_root, "SNP_raw_outputs"),
         tmp=os.path.join(out_folder, "tmp_bcf"),
         depth=os.path.join(out_folder, "tmp_depth"),
     )
     self.bams = dict(
         whole=os.path.join(out_folder, "whole_reads.bam"),
         sort=os.path.join(out_folder, "whole_reads_sorted.bam"),
         bams=[],
     )
     self.header = os.path.join(out_folder, "header")
     self.baqs = dict(with_="with_BAQ")
     self.baqs = {
         "with": self.baqs["with_"],
         "without": "without_BAQ",
         "extend": "extend_BAQ",
     }
Exemple #12
0
 def __init__(self, args_ribo):
     """Create the helper objects and resolve the folders for this run.

     The ``tmp`` sub-folders of the gff, transcript and fasta inputs are
     always recorded; the TSS one only when ``args_ribo.tsss`` is given
     (``None`` otherwise). Depending on ``args_ribo.program``
     (``"riboswitch"``, ``"thermometer"`` or ``"both"``) the matching
     ``ribos_*`` and/or ``thermo_*`` attributes are filled from
     ``_create_out_folders``.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.gff_parser = Gff3Parser()

     self.gff_path = os.path.join(args_ribo.gffs, "tmp")
     self.tss_path = (None if args_ribo.tsss is None
                      else os.path.join(args_ribo.tsss, "tmp"))
     self.tran_path = os.path.join(args_ribo.trans, "tmp")
     self.fasta_path = os.path.join(args_ribo.fastas, "tmp")

     if args_ribo.program in ("both", "riboswitch"):
         ribos_folders = self._create_out_folders(
             args_ribo.ribos_out_folder, "riboswitch", args_ribo.database)
         (self.ribos_stat_folder, self.ribos_gff_outfolder,
          self.ribos_table_folder, self.ribos_scan_folder,
          self.ribos_tmp_files, self.ribos_rfam,
          self.ribos_suffixs) = ribos_folders

     if args_ribo.program in ("both", "thermometer"):
         thermo_folders = self._create_out_folders(
             args_ribo.thermo_out_folder, "RNA_thermometer",
             args_ribo.database)
         (self.thermo_stat_folder, self.thermo_gff_outfolder,
          self.thermo_table_folder, self.thermo_scan_folder,
          self.thermo_tmp_files, self.thermo_rfam,
          self.thermo_suffixs) = thermo_folders
Exemple #13
0
def deal_cds_forward(cdss_f, target_folder, fasta, genes, tar_start, tar_end):
    '''for forward strand

    For every CDS a region is extracted that starts ``tar_start``
    positions before the CDS start (not below 1) and ends ``tar_end``
    positions into the CDS, limited by the CDS end and by ``len(fasta)``.
    The region is written with ``print_fasta`` to
    ``<target_folder>/<seq_id>_target.fa``, labelled with the parent gene
    of the CDS when one is found in ``genes`` and with the CDS itself
    otherwise.

    The output file is (re)opened in write mode each time ``seq_id``
    changes, so the input is expected to be grouped by ``seq_id``.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_f:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until garbage collection.
                if out is not None:
                    out.close()
                out = open(
                    os.path.join(target_folder,
                                 "_".join([cds.seq_id, "target.fa"])), "w")
                pre_id = cds.seq_id
            if cds.start > tar_start:
                start = cds.start - tar_start
            else:
                start = 1
            if ((cds.start + tar_end) < len(fasta)) and (
                    (cds.end - cds.start) >= tar_end):
                end = cds.start + tar_end - 1
            elif cds.start + tar_end >= len(fasta):
                end = len(fasta)
            else:
                # Only reachable when the CDS is shorter than tar_end.
                end = cds.end
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target = cds
            if "Parent" in cds.attributes:
                parents = cds.attributes["Parent"].split(",")
                for gene in genes:
                    if gene.attributes["ID"] in parents:
                        target = gene
                        break
            print_fasta(target, seq, out)
    finally:
        # Also runs when extraction or printing raises.
        if out is not None:
            out.close()
Exemple #14
0
def deal_cds_forward(cdss_f, target_folder, fasta, genes, tar_start, tar_end):
    '''for forward strand

    For every CDS a region is extracted that starts ``tar_start``
    positions before the CDS start (not below 1) and ends ``tar_end``
    positions into the CDS, limited by the CDS end and by ``len(fasta)``.
    The region is written with ``print_fasta`` to
    ``<target_folder>/<seq_id>_target.fa`` together with the result of
    ``check_parent_gene``.

    The output file is (re)opened in write mode each time ``seq_id``
    changes, so the input is expected to be grouped by ``seq_id``.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_f:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until garbage collection.
                if out is not None:
                    out.close()
                out = open(
                    os.path.join(target_folder,
                                 "_".join([cds.seq_id, "target.fa"])), "w")
                pre_id = cds.seq_id
            if cds.start > tar_start:
                start = cds.start - tar_start
            else:
                start = 1
            if ((cds.start + tar_end) < len(fasta)) and (
                    (cds.end - cds.start) >= tar_end):
                end = cds.start + tar_end - 1
            elif cds.start + tar_end >= len(fasta):
                end = len(fasta)
            else:
                # Only reachable when the CDS is shorter than tar_end.
                end = cds.end
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target_gene = check_parent_gene(cds, genes)
            print_fasta(cds, seq, out, target_gene)
    finally:
        # Also runs when extraction or printing raises.
        if out is not None:
            out.close()
Exemple #15
0
def deal_cds_reverse(cdss_r, target_folder, fasta, genes, tar_start, tar_end):
    '''for the reverse strand

    For every CDS a region is extracted that ends ``tar_start`` positions
    after the CDS end (not beyond ``len(fasta)``) and starts ``tar_end``
    positions before the CDS end, limited by the CDS start and by
    position 1. The region is appended with ``print_fasta`` to
    ``<target_folder>/<seq_id>_target.fa`` together with the result of
    ``check_parent_gene``.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_r:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until garbage collection.
                if out is not None:
                    out.close()
                out = open(
                    os.path.join(target_folder,
                                 "_".join([cds.seq_id, "target.fa"])), "a")
                pre_id = cds.seq_id
            if len(fasta) - cds.end > tar_start:
                end = cds.end + tar_start
            else:
                end = len(fasta)
            cds_span = cds.end - cds.start
            if ((cds.end - tar_end) > 1) and (cds_span >= tar_end):
                # NOTE(review): the forward strand uses "+ tar_end - 1";
                # "- tar_end - 1" here may be off by two - confirm.
                start = cds.end - tar_end - 1
            elif cds.end - tar_end < 1:
                start = 1
            elif cds_span < tar_end:
                start = cds.start
            else:
                # cds.end - tar_end == 1 with a long enough CDS matched no
                # branch before, leaving ``start`` unbound or stale from
                # the previous CDS; the window begins at position 1.
                start = 1
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target_gene = check_parent_gene(cds, genes)
            print_fasta(cds, seq, out, target_gene)
    finally:
        # Also runs when extraction or printing raises.
        if out is not None:
            out.close()
Exemple #16
0
 def __init__(self, args_snp):
     """Create the helper objects and derive all paths from ``args_snp``.

     The result sub-folder is ``compare_reference`` when
     ``args_snp.types`` is ``"reference"`` and ``validate_target``
     otherwise. When ``whole_reads.bam`` is present in the output folder,
     ``remove_all_content`` is called on that folder with the
     ``"whole_read"`` pattern.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()

     file_type = ("compare_reference" if args_snp.types == "reference"
                  else "validate_target")
     out_folder = args_snp.out_folder
     result_root = os.path.join(out_folder, file_type)

     self.seq_path = os.path.join(result_root, "seqs")
     self.stat_path = os.path.join(result_root, "statistics")
     self.fasta_path = os.path.join(args_snp.fastas, "tmp")
     self.outputs = dict(
         table=os.path.join(result_root, "SNP_table"),
         raw=os.path.join(result_root, "SNP_raw_outputs"),
         tmp=os.path.join(out_folder, "tmp_bcf"),
         depth=os.path.join(out_folder, "tmp_depth"),
     )

     # Left-overs of an earlier run are cleared before new BAMs are made.
     if "whole_reads.bam" in os.listdir(out_folder):
         self.helper.remove_all_content(out_folder, "whole_read", "file")

     self.bams = dict(
         whole=os.path.join(out_folder, "whole_reads.bam"),
         sort=os.path.join(out_folder, "whole_reads_sorted.bam"),
         bams=[],
     )
     self.header = os.path.join(out_folder, "header")
     self.baqs = dict([("with", "with_BAQ"), ("without", "without_BAQ"),
                       ("extend", "extend_BAQ")])
Exemple #17
0
def deal_cds_reverse(cdss_r, target_folder, fasta, genes, tar_start, tar_end):
    '''for the reverse strand

    For every CDS a region is extracted that ends ``tar_start`` positions
    after the CDS end (not beyond ``len(fasta)``) and starts ``tar_end``
    positions before the CDS end, limited by the CDS start and by
    position 1. The region is appended with ``print_fasta`` to
    ``<target_folder>/<seq_id>_target.fa``, labelled with the parent gene
    of the CDS when one is found in ``genes`` and with the CDS itself
    otherwise.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_r:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until garbage collection.
                if out is not None:
                    out.close()
                out = open(
                    os.path.join(target_folder,
                                 "_".join([cds.seq_id, "target.fa"])), "a")
                pre_id = cds.seq_id
            if len(fasta) - cds.end > tar_start:
                end = cds.end + tar_start
            else:
                end = len(fasta)
            cds_span = cds.end - cds.start
            if ((cds.end - tar_end) > 1) and (cds_span >= tar_end):
                # NOTE(review): the forward strand uses "+ tar_end - 1";
                # "- tar_end - 1" here may be off by two - confirm.
                start = cds.end - tar_end - 1
            elif cds.end - tar_end < 1:
                start = 1
            elif cds_span < tar_end:
                start = cds.start
            else:
                # cds.end - tar_end == 1 with a long enough CDS matched no
                # branch before, leaving ``start`` unbound or stale from
                # the previous CDS; the window begins at position 1.
                start = 1
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target = cds
            if "Parent" in cds.attributes:
                # A comma-separated Parent list is accepted as well, as in
                # the forward-strand counterpart; a single parent still
                # matches exactly as before.
                parents = cds.attributes["Parent"].split(",")
                for gene in genes:
                    if gene.attributes["ID"] in parents:
                        target = gene
                        break
            print_fasta(target, seq, out)
    finally:
        # Also runs when extraction or printing raises.
        if out is not None:
            out.close()
Exemple #18
0
def compare_sorf_srna(sorfs, srnas, srna_gff):
    """Record, for every sORF, the sRNAs that overlap it.

    Each sORF dict receives an ``"srna"`` list holding
    ``<ID>:<start>-<end>_<strand>`` for every sRNA on the same strain and
    strand whose coordinates overlap the sORF. The list is ``["NA"]``
    when nothing overlaps or when ``srna_gff`` is ``None``.

    The sORF dicts are modified in place; nothing is returned.
    """
    if srna_gff is None:
        for sorf in sorfs:
            sorf["srna"] = ["NA"]
        return

    for sorf in sorfs:
        hits = []
        sorf["srna"] = hits
        for srna in srnas:
            if (sorf["strain"] != srna.seq_id) or (
                    sorf["strand"] != srna.strand):
                continue
            srna_covers_sorf = (srna.start <= sorf["start"]) and (
                srna.end >= sorf["end"])
            sorf_covers_srna = (srna.start >= sorf["start"]) and (
                srna.end <= sorf["end"])
            srna_ends_inside = (srna.start <= sorf["start"]) and (
                srna.end >= sorf["start"]) and (srna.end <= sorf["end"])
            srna_starts_inside = (srna.start >= sorf["start"]) and (
                srna.start <= sorf["end"]) and (srna.end >= sorf["end"])
            if (srna_covers_sorf or sorf_covers_srna or
                    srna_ends_inside or srna_starts_inside):
                strand = Helper().get_strand_name(srna.strand)
                hits.append(srna.attributes["ID"] + ":" +
                            str(srna.start) + "-" +
                            str(srna.end) + "_" + strand)
        if len(hits) == 0:
            sorf["srna"] = ["NA"]
Exemple #19
0
 def __init__(self, args_circ):
     """Create the helper objects and the paths and file names for a run.

     When ``args_circ.align`` is set and no fasta folder was given, an
     error message is printed and the program exits. In every other case
     ``fasta_path`` is the ``tmp`` sub-folder of ``args_circ.fastas``.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.converter = Converter()

     out_dir = args_circ.output_folder
     self.alignment_path = os.path.join(out_dir, "segemehl_align")
     self.splice_path = os.path.join(out_dir, "segemehl_splice")
     self.candidate_path = os.path.join(out_dir, "circRNA_tables")
     self.gff_folder = os.path.join(out_dir, "gffs")
     self.gff_path = os.path.join(args_circ.gffs, "tmp")

     # File names and stems used for the splice and trans results.
     self.splices = dict(
         all_file="splicesites_all.bed",
         file="splicesites.bed",
         all="splicesites_all",
         splice="splicesites",
     )
     self.trans = dict(
         all_file="transrealigned_all.bed",
         file="transrealigned.bed",
         all="transrealigned_all",
         trans="transrealigned",
     )
     self.bams = dict(whole="whole_reads.bam", sort="whole_reads_sort")

     # Aligning is impossible without genome fasta files.
     if args_circ.align and args_circ.fastas is None:
         print("Error: There is no genome fasta file!!!")
         sys.exit()
     self.fasta_path = os.path.join(args_circ.fastas, "tmp")
Exemple #20
0
def get_feature(cds):
    """Return the label that identifies ``cds``.

    The ``locus_tag`` attribute is preferred, then ``protein_id``.
    Without either the label is ``<prefix>:<start>-<end>_<strand>``,
    where the prefix is the ``ID`` attribute when present and the feature
    type otherwise.
    """
    attrs = cds.attributes
    for key in ("locus_tag", "protein_id"):
        if key in attrs:
            return attrs[key]

    strand = Helper().get_strand_name(cds.strand)
    prefix = attrs["ID"] if "ID" in attrs else cds.feature
    return "".join([prefix, ":",
                    str(cds.start), "-", str(cds.end),
                    "_", strand])
Exemple #21
0
 def __init__(self):
     """Create the helper objects and set the temporary file names."""
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     # Names of the scratch files.
     self.tmp_fa, self.tmp_gff = "tmp.fa", "tmp.gff"
     self.tmp_wig_forward, self.tmp_wig_reverse = (
         "tmp_forward.wig", "tmp_reverse.wig")
Exemple #22
0
def assign_sorf(sorf, starts, ends, fasta):
    """Store the candidate coordinates of a sORF and extract its sequence.

    ``starts`` and ``ends`` are kept as given. ``start`` becomes the
    smallest start and ``end`` the largest end after integer conversion,
    and ``seq`` is what ``Helper().extract_gene`` returns for that span
    on the sORF's strain and strand.

    ``sorf`` is modified in place; nothing is returned.
    """
    sorf["starts"] = starts
    sorf["ends"] = ends
    first = min(int(pos) for pos in starts)
    last = max(int(pos) for pos in ends)
    sorf["start"] = first
    sorf["end"] = last
    genome = fasta[sorf["strain"]]
    sorf["seq"] = Helper().extract_gene(genome, sorf["start"], sorf["end"],
                                        sorf["strand"])
Exemple #23
0
 def _print_tssfile(self, nums, tss_features, tss, tss_pro, strain, method,
                    out, tss_libs):
     """Write one TSS or processing site as a GFF line to ``out``.

     ``tss_pro`` is used with its first character upper-cased as the
     feature type and as the prefix of the ``Name`` attribute. The
     multi-valued attributes (types, UTR lengths, locus tags, libraries)
     are joined with ``&``. Start and end are both ``tss.super_pos``.
     """
     tss_pro = tss_pro[0].upper() + tss_pro[1:]
     tss_merge_type = "&".join(tss_features["tss_types"])
     utr_length = "&".join(tss_features["utr_lengths"])
     merge_locus_tag = "&".join(tss_features["locus_tags"])
     libs = "&".join(tss_libs)
     strand = Helper().get_strand_name(tss.super_strand)

     position = str(tss.super_pos)
     name = "".join([tss_pro, ":", position, "_", strand])
     attribute_pairs = [
         ["Name", name],
         ["ID", tss_pro.lower() + str(nums["tss_uni"])],
         ["type", tss_merge_type],
         ["UTR_length", str(utr_length)],
         ["associated_gene", merge_locus_tag],
         ["libs", libs],
         ["Method", "TSSpredator"],
     ]
     attribute_string = ";".join("=".join(pair) for pair in attribute_pairs)

     columns = [strain, method, tss_pro, position, position, ".",
                tss.super_strand, ".", attribute_string]
     out.write("\t".join(columns) + "\n")
Exemple #24
0
def read_gff(tss_predict_file, tss_manual_file, gff_file, lengths):
    """Load predicted TSSs, manual TSSs and genome annotation from GFF.

    Args:
        tss_predict_file: GFF file of predicted TSSs; every entry is kept
            and gets ``print=False``.
        tss_manual_file: GFF file of manual TSSs; an entry is kept when
            its ``seq_id`` is a key of ``lengths`` or when ``lengths``
            has the key ``"all"``. Kept entries get ``print=False`` and
            ``libs`` / ``method`` set to ``"manual"``.
        gff_file: Annotation GFF file.
        lengths: Mapping keyed by sequence id (or ``"all"``).

    Returns:
        ``(tsss, cdss, genes)``: ``tsss`` is a dict with the sorted lists
        ``"tsss_p"`` and ``"tsss_m"`` plus an empty ``"merge"`` list,
        ``cdss`` holds the entries accepted by
        ``Helper().feature_without_notgene`` and ``genes`` the entries
        whose feature is ``"gene"``. All lists are sorted by
        (seq_id, start, end, strand).
    """

    def by_position(entry):
        # Shared sort key for every list returned from here.
        return (entry.seq_id, entry.start, entry.end, entry.strand)

    tsss = {"tsss_p": [], "tsss_m": [], "merge": []}
    cdss = []
    genes = []
    gff_parser = Gff3Parser()
    helper = Helper()
    # All three files are still opened up front, so a missing input fails
    # before anything is parsed, but now they are closed on errors too.
    with open(tss_predict_file, "r") as tssp_fh, \
            open(tss_manual_file, "r") as tssm_fh, \
            open(gff_file, "r") as g_f:
        for entry in gff_parser.entries(tssp_fh):
            entry.attributes["print"] = False
            tsss["tsss_p"].append(entry)
        for entry in gff_parser.entries(tssm_fh):
            if (entry.seq_id in lengths) or ("all" in lengths):
                entry.attributes["print"] = False
                entry.attributes["libs"] = "manual"
                entry.attributes["method"] = "manual"
                tsss["tsss_m"].append(entry)
        for entry in gff_parser.entries(g_f):
            if helper.feature_without_notgene(entry):
                cdss.append(entry)
            if entry.feature == "gene":
                genes.append(entry)
    tsss["tsss_p"] = sorted(tsss["tsss_p"], key=by_position)
    tsss["tsss_m"] = sorted(tsss["tsss_m"], key=by_position)
    cdss = sorted(cdss, key=by_position)
    genes = sorted(genes, key=by_position)
    return tsss, cdss, genes
Exemple #25
0
 def __init__(self, args_ribo):
     """Create the helper objects and resolve all folders and suffixes.

     Input paths are the ``tmp`` sub-folders of the gff, TSS, transcript
     and fasta folders. Output paths live below ``args_ribo.out_folder``
     and the Rfam model is ``Rfam_riboswitch.cm`` inside
     ``args_ribo.database``.
     """
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.gff_parser = Gff3Parser()

     # Inputs.
     self.gff_path = os.path.join(args_ribo.gffs, "tmp")
     self.tss_path = os.path.join(args_ribo.tsss, "tmp")
     self.tran_path = os.path.join(args_ribo.trans, "tmp")
     self.fasta_path = os.path.join(args_ribo.fastas, "tmp")

     # Outputs.
     out_folder = args_ribo.out_folder
     self.stat_folder = os.path.join(out_folder, "statistics")
     self.gff_outfolder = os.path.join(out_folder, "gffs")
     self.table_folder = os.path.join(out_folder, "tables")
     self.scan_folder = os.path.join(out_folder, "scan_Rfam")
     self.ribos_rfam = os.path.join(args_ribo.database,
                                    "Rfam_riboswitch.cm")
     self.tmp_files = dict(
         fasta=os.path.join(out_folder, "tmp_fasta"),
         scan=os.path.join(out_folder, "tmp_scan"),
         table=os.path.join(out_folder, "tmp_table"),
     )
     self.suffixs = dict(
         csv="riboswitch.csv",
         txt="riboswitch_prescan.txt",
         re_txt="riboswitch_scan.txt",
         re_csv="riboswitch_scan.csv",
     )
Exemple #26
0
def remove_primary(tss, tss_entry):
    """Rebuild a TSS attribute record with every "Primary" class removed.

    ``tss_entry[1]`` is read as a dict whose "type", "UTR_length" and
    "associated_gene" values are "&"-separated lists kept in parallel:
    item i of each list describes the same classification.

    Args:
        tss: TSS entry; only ``start`` and ``strand`` are read here.
        tss_entry: Sequence whose second item is the dict described above.

    Returns:
        Tuple ``(tss_string, tss_dict)``. ``tss_dict`` holds the keys
        "Name", "type", "UTR_length" and "associated_gene"; ``tss_string``
        is the same data rendered as ";"-separated ``key=value`` pairs.
    """
    previous = tss_entry[1]
    types = previous["type"].split("&")
    utrs = previous["UTR_length"].split("&")
    genes = previous["associated_gene"].split("&")
    # Positions of the classifications that survive the filter; the same
    # positions are then picked from the two parallel lists.
    kept = [pos for pos, type_ in enumerate(types) if type_ != "Primary"]
    final_types = [types[pos] for pos in kept]
    final_utrs = [utrs[pos] for pos in kept]
    final_genes = [genes[pos] for pos in kept]
    strand = Helper().get_strand_name(tss.strand)
    tss_dict = {
        "Name": "_".join(["TSS:" + str(tss.start), strand]),
        "type": "&".join(final_types),
        "UTR_length": "&".join(final_utrs),
        "associated_gene": "&".join(final_genes),
    }
    # The attribute string lists the keys in a fixed order that differs
    # from the dict's own key order.
    tss_string = ";".join(
        "=".join([key, tss_dict[key]])
        for key in ("UTR_length", "associated_gene", "type", "Name")
    )
    return (tss_string, tss_dict)
Exemple #27
0
def get_gene_info(cds):
    """Return an identifier for a CDS entry.

    Args:
        cds: Entry exposing ``attributes`` (a mapping) and, when no
            locus tag is present, ``feature``, ``start``, ``end`` and
            ``strand``.

    Returns:
        The "locus_tag" attribute when the entry has one; otherwise a
        string of the form ``<feature>:<start>-<end>_<strand name>``.
    """
    if "locus_tag" in cds.attributes:
        return cds.attributes["locus_tag"]
    # No locus tag: fall back to a name built from the coordinates.
    strand = Helper().get_strand_name(cds.strand)
    position = "-".join((str(cds.start), str(cds.end)))
    return "".join((cds.feature, ":", position, "_", strand))
Exemple #28
0
def merge_libs(input_libs, wig_folder, program):
    """Merge the wiggle files of the matching libraries into two files.

    Any "merge_forward.wig" / "merge_reverse.wig" already present in the
    current working directory is deleted first. Each library description
    is then split on ":"; field 0 is taken as a file name inside
    ``wig_folder``, field 1 is compared with the wanted library type and
    field 4 ("+" or "-") selects the forward or reverse merge file.

    Args:
        input_libs: Iterable of ":"-separated library description strings.
        wig_folder: Directory that contains the wiggle files.
        program: "TSS" selects libraries of type "tex", "processing"
            selects "notex".
    """
    for stale in ("merge_forward.wig", "merge_reverse.wig"):
        if stale in os.listdir(os.getcwd()):
            os.remove(stale)
    # NOTE(review): any other ``program`` value leaves ``type_`` unbound,
    # so the first library raises UnboundLocalError - confirm with callers
    # whether that should become an explicit error.
    if program == "TSS":
        type_ = "tex"
    elif program == "processing":
        type_ = "notex"
    for lib in input_libs:
        datas = lib.split(":")
        if datas[1] != type_:
            continue
        strand = datas[4]
        if strand == "+":
            merged_name = "merge_forward.wig"
        elif strand == "-":
            merged_name = "merge_reverse.wig"
        else:
            continue
        # presumably merge_file adds the source content to the target file;
        # verify against Helper.merge_file
        Helper().merge_file(os.path.join(wig_folder, datas[0]),
                            os.path.join(os.getcwd(), merged_name))
 def __init__(self, tar_folder, ref_folder):
     """Create the helper objects and record the temporary folders.

     Args:
         tar_folder: Directory whose "tmp" sub-folder is stored under the
             key "tmp_tar".
         ref_folder: Directory whose "tmp" sub-folder is stored under the
             key "tmp_ref".
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     tmp_tar = os.path.join(tar_folder, "tmp")
     tmp_ref = os.path.join(ref_folder, "tmp")
     self.folders = {"tmp_tar": tmp_tar, "tmp_ref": tmp_ref}
Exemple #30
0
def import_to_tss(tss_type, cds_pos, tss, locus_tag, tss_entry):
    """Add one classification to a TSS and return its attribute record.

    The record stores parallel "&"-separated lists under "type",
    "UTR_length" and "associated_gene". A UTR item has the form
    ``<type>_<length>``, where the length is the absolute distance between
    ``cds_pos`` and ``tss.start`` or the literal "NA".

    Args:
        tss_type: Class to assign, e.g. "Primary".
        cds_pos: Position used for the UTR length, or the string "NA".
        tss: TSS entry; only ``start`` and ``strand`` are read here.
        locus_tag: Gene identifier stored for this classification.
        tss_entry: Empty sequence for a new record, otherwise a sequence
            whose second item is the existing attribute dict.

    Returns:
        Tuple ``(tss_string, tss_dict)``. ``tss_dict`` holds the keys
        "Name", "type", "UTR_length" and "associated_gene"; ``tss_string``
        is the same data rendered as ";"-separated ``key=value`` pairs.
    """
    if cds_pos == "NA":
        distance = "NA"
    else:
        distance = str(int(math.fabs(cds_pos - tss.start)))
    utr = "_".join([tss_type, distance])
    if len(tss_entry) == 0:
        # First classification of this TSS: store the values as given.
        type_field = tss_type
        utr_field = utr
        gene_field = locus_tag
    else:
        previous = tss_entry[1]
        types = previous["type"].split("&")
        utrs = previous["UTR_length"].split("&")
        tags = previous["associated_gene"].split("&")
        if tss_type == "Primary" and ("Primary" in previous["type"]):
            # A Primary class already exists: no new item is appended.
            # An existing Primary item is overwritten only when the new
            # UTR is shorter than the recorded one.
            for index, known_type in enumerate(types):
                if "Primary" not in known_type:
                    continue
                recorded = utrs[index].split("_")
                if math.fabs(cds_pos - tss.start) < int(recorded[1]):
                    utrs[index] = utr
                    tags[index] = locus_tag
        else:
            types.append(tss_type)
            utrs.append(utr)
            tags.append(locus_tag)
        type_field = "&".join(types)
        utr_field = "&".join(utrs)
        gene_field = "&".join(tags)
    strand = Helper().get_strand_name(tss.strand)
    tss_dict = {
        "Name": "_".join(["TSS:" + str(tss.start), strand]),
        "type": type_field,
        "UTR_length": utr_field,
        "associated_gene": gene_field,
    }
    # The attribute string lists the keys in a fixed order that differs
    # from the dict's own key order.
    tss_string = ";".join(
        "=".join([key, tss_dict[key]])
        for key in ("UTR_length", "associated_gene", "type", "Name")
    )
    return (tss_string, tss_dict)