def _stat_and_correct(self, stats, folder):
    '''Do statistics and print the final gff file.

    Walks every CRT CRISPR gff in *folder*, rewrites each file with
    normalized ID/Parent attributes, and accumulates counts in *stats*:
    stats[prefix][seq_id] = {"cri": <number of CRISPRs>,
    "re": {<repeat-unit count>: <frequency>}}, plus an "all" summary
    per file prefix.
    '''
    for gff in os.listdir(folder):
        # File names are expected to look like "<prefix>_CRISPR.gff".
        prefix = gff.replace("_CRISPR.gff", "")
        stats[prefix] = {"all": {"cri": 0, "re": {}}}
        gh = open(os.path.join(folder, gff), "r")
        oh = open("tmp_cri.gff", "w")
        oh.write("##gff-version 3\n")
        cr_num = 0   # running CRISPR index used for ID generation
        re_num = 0   # running repeat-unit index used for ID generation
        first = True  # True until the first CRISPR entry has been seen
        for entry in Gff3Parser().entries(gh):
            if entry.seq_id not in stats[prefix].keys():
                stats[prefix][entry.seq_id] = {"cri": 0, "re": {}}
            if entry.feature == "CRISPR":
                id_ = "CRISPR_" + str(cr_num)
                attribute = ";".join(
                    ["ID=" + entry.seq_id + "_" + id_, "method=CRT"])
                cr_num += 1
                if first:
                    first = False
                else:
                    # Flush the repeat-unit count of the PREVIOUS CRISPR
                    # into the frequency tables before starting a new one.
                    # NOTE(review): the per-seq_id table is indexed with
                    # the CURRENT entry's seq_id — confirm entries of one
                    # replicon are contiguous in the file, otherwise the
                    # count could land on the wrong sequence.
                    if repeat not in stats[prefix][
                            entry.seq_id]["re"].keys():
                        stats[prefix][entry.seq_id]["re"][repeat] = 1
                    else:
                        stats[prefix][entry.seq_id]["re"][repeat] += 1
                    if repeat not in stats[prefix]["all"]["re"].keys():
                        stats[prefix]["all"]["re"][repeat] = 1
                    else:
                        stats[prefix]["all"]["re"][repeat] += 1
                repeat = 0  # reset repeat-unit counter for this CRISPR
                stats[prefix][entry.seq_id]["cri"] += 1
                stats[prefix]["all"]["cri"] += 1
            elif entry.feature == "repeat_unit":
                # NOTE(review): assumes a CRISPR entry always precedes its
                # repeat_units — otherwise id_/repeat are unbound here.
                attribute = ";".join([
                    "ID=" + entry.seq_id + "_Repeat_" + str(re_num),
                    "method=CRT", "Parent=" + id_])
                re_num += 1
                repeat += 1
            # NOTE(review): "attribute" is only (re)assigned for CRISPR /
            # repeat_unit features; any other feature would reuse a stale
            # attribute string (or raise NameError if it comes first).
            oh.write(
                "\t".join([entry.info_without_attributes, attribute]) + "\n")
        if not first:
            # Flush the repeat-unit count of the last CRISPR in the file.
            if repeat not in stats[prefix][entry.seq_id]["re"].keys():
                stats[prefix][entry.seq_id]["re"][repeat] = 1
            else:
                stats[prefix][entry.seq_id]["re"][repeat] += 1
            if repeat not in stats[prefix]["all"]["re"].keys():
                stats[prefix]["all"]["re"][repeat] = 1
            else:
                stats[prefix]["all"]["re"][repeat] += 1
        gh.close()
        oh.close()
        # Replace the original file with the corrected version.
        os.remove(os.path.join(folder, gff))
        shutil.move("tmp_cri.gff", os.path.join(folder, gff))
def read_gff(input_file):
    """Parse a GFF3 file into a position-sorted list of entries.

    Each entry gets a bookkeeping attribute ``print`` = False so later
    stages can mark which entries were already written out.

    input_file: path to a GFF3 file.
    Returns: entries sorted by (seq_id, start, end, strand).
    """
    datas = []
    # "with" guarantees the handle is closed (the original leaked it).
    with open(input_file, "r") as f_h:
        for entry in Gff3Parser().entries(f_h):
            entry.attributes["print"] = False
            datas.append(entry)
    datas = sorted(datas,
                   key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return datas
def read_data(inter_gff, tss_file, srna_gff, fasta, utr_detect):
    """Load intergenic candidates, TSSs, sRNAs and genome sequences.

    Returns (inters, tsss, srnas, seq); tsss/srnas are None when the
    corresponding input file is not given.
    """
    by_pos = lambda k: (k.seq_id, k.start, k.end, k.strand)
    seq = {}
    inters = []
    fh = open(inter_gff)
    for entry in Gff3Parser().entries(fh):
        utr_hit = (entry.source == "UTR_derived") and (utr_detect)
        other_hit = (entry.source == "intergenic") or (
            entry.source == "antisense")
        if utr_hit or other_hit:
            inters.append(entry)
    inters = sorted(inters, key=by_pos)
    fh.close()
    if tss_file is not None:
        fh = open(tss_file)
        tsss = sorted([entry for entry in Gff3Parser().entries(fh)],
                      key=by_pos)
        fh.close()
    else:
        tsss = None
    if srna_gff is not None:
        fh = open(srna_gff)
        srnas = []
        for entry in Gff3Parser().entries(fh):
            # Drop every sORF-related attribute from the sRNA entry.
            kept = {key: value for key, value in entry.attributes.items()
                    if "sORF" not in key}
            entry.attributes = copy.deepcopy(kept)
            srnas.append(entry)
        srnas = sorted(srnas, key=by_pos)
        fh.close()
    else:
        srnas = None
    with open(fasta, "r") as s_f:
        for line in s_f:
            line = line.strip()
            if line.startswith(">"):
                strain = line[1:]
                seq[strain] = ""
            else:
                seq[strain] = seq[strain] + line
    return inters, tsss, srnas, seq
def combine(frag_file, tex_file, tolerance, output_file):
    """Merge fragmented-library and TEX+/- transcripts into one gff.

    Overlapping pairs (within *tolerance*) are tagged
    "fragmented&tex_notex"; the merged, de-duplicated set is written to
    *output_file*.
    """
    pos_key = lambda k: (k.seq_id, k.start, k.end, k.strand)

    def _load(path):
        # Read one gff, flag every entry as not-yet-printed, sort it.
        handle = open(path, "r")
        loaded = []
        for entry in Gff3Parser().entries(handle):
            entry.attributes["print"] = False
            loaded.append(entry)
        handle.close()
        return sorted(loaded, key=pos_key)

    out = open(output_file, "w")
    out.write("##gff-version 3\n")
    sort_frags = _load(frag_file)
    sort_norms = _load(tex_file)
    finals = []
    for frag in sort_frags:
        overlap = False
        for norm in sort_norms:
            overlap = compare(frag, norm, overlap, tolerance)
        if overlap:
            store(frag, "fragmented&tex_notex", finals)
        else:
            store(frag, "fragmented", finals)
    for norm in sort_norms:
        if norm.attributes["print"] is False:
            store(norm, "tex_notex", finals)
    sort_finals = sorted(finals, key=pos_key)
    num = 0
    for tar in sort_finals:
        if tar.attributes["print"] is True:
            continue
        overlap = False
        for ref in sort_finals:
            overlap = compare(tar, ref, overlap, tolerance)
        name = '%0*d' % (5, num)
        print_file(tar, out, name, num)
        num += 1
    out.close()
def read_tag_file(gff_file, ta_file, c_feature):
    """Read transcripts and annotation features for tag comparison.

    Returns (gffs, tas, stats, region): stats holds per-genome counters
    for the comparison classes plus the number of *c_feature* entries;
    region is the region/source/remark entry, if any.
    """
    counters = {"bsae": 0, "bsbe": 0, "asae": 0, "asbe": 0,
                "other": 0, "gene": 0}
    stats = {"All": dict(counters)}
    region = None
    gffs = []
    tas = []
    last_seq = ""
    ta_f = open(ta_file, "r")
    for entry in Gff3Parser().entries(ta_f):
        if entry.seq_id != last_seq:
            last_seq = entry.seq_id
            stats[entry.seq_id] = dict(counters)
        # Strip stale comparison attributes from previous runs.
        entry.attributes = del_attributes(entry, [
            "_".join(["associated", c_feature]),
            "_".join(["compare", c_feature])])
        tas.append(entry)
    ta_f.close()
    g_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(g_f):
        if (entry.feature == c_feature):
            # Keep only gene parents on the counted feature.
            gene_parents = []
            if "Parent" in entry.attributes.keys():
                for parent in entry.attributes["Parent"].split(","):
                    if "gene" in parent:
                        gene_parents.append(parent)
            if len(gene_parents) == 0:
                entry.attributes = del_attributes(entry, ["Parent"])
            else:
                entry.attributes["Parent"] = ",".join(gene_parents)
            if entry.seq_id in stats.keys():
                stats[entry.seq_id]["gene"] += 1
            stats["All"]["gene"] += 1
        lowered = entry.feature.lower()
        if (lowered != "region") and (lowered != "source") and (
                lowered != "remark"):
            gffs.append(entry)
        else:
            region = entry
    g_f.close()
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return gffs, tas, stats, region
def read_tss(tss_file):
    """Load TSS entries from *tss_file* (or none when it is None).

    Returns (tsss, num_tss); num_tss is always None and kept only for
    the caller's unpacking.
    """
    tsss = []
    if tss_file is not None:
        handle = open(tss_file, "r")
        tsss = [entry for entry in Gff3Parser().entries(handle)]
        handle.close()
    num_tss = None
    return tsss, num_tss
def _read_gff(self, txt):
    """Load gene/CDS/tRNA/rRNA entries from the gff paired with *txt*."""
    wanted = ("gene", "CDS", "tRNA", "rRNA")
    path = os.path.join(self.gff_path, txt.replace(".txt", ".gff"))
    handle = open(path, "r")
    gffs = [entry for entry in Gff3Parser().entries(handle)
            if entry.feature in wanted]
    handle.close()
    return gffs
def fix_ratt(self, gff_file, strain, out_file):
    """Normalize a RATT-transferred gff: regenerate ID/Name/Parent
    attributes for gene, r/tRNA and CDS entries and write *out_file*.

    gff_file: the raw RATT gff for *strain*.
    strain: genome name passed through to self._read_gff.
    out_file: path of the corrected gff.
    """
    out = open(out_file, "w")
    out.write("##gff-version 3\n")
    nums = {"cds": 0, "rna": 0, "gene": 0}
    genes = []
    datas = []
    check_parent = False
    # _read_gff fills "genes" and "datas" in place.
    self._read_gff(gff_file, genes, datas, strain)
    check_parent = False
    for data in datas:
        if data.feature == "gene":
            # Substitute the raw gene entry with the pre-processed one;
            # genes are assumed to appear in the same order as in datas.
            data = genes[nums["gene"]]
            nums["gene"] += 1
        elif (data.feature == "rRNA") or \
                (data.feature == "tRNA"):
            name = data.attributes["locus_tag"]
            data.attribute_string = ";".join([
                "ID=rna" + str(nums["rna"]), "Name=" + name,
                data.attribute_string])
            nums["rna"] += 1
        elif data.feature == "CDS":
            # NOTE(review): "name" stays unbound (or stale) when a CDS
            # lacks "protein_id" — confirm RATT output always carries it.
            if "protein_id" in data.attributes.keys():
                name = data.attributes["protein_id"]
            # Link the CDS to the first gene that fully contains it or
            # shares its locus_tag.
            for gene in genes:
                if ((gene.start <= data.start) and (
                        gene.end >= data.end)) or (
                        gene.attributes["locus_tag"] ==
                        data.attributes["locus_tag"]):
                    data.attribute_string = ";".join([
                        "ID=cds" + str(nums["cds"]), "Name=" + name,
                        "Parent=" + gene.attributes["ID"],
                        data.attribute_string])
                    check_parent = True
                    break
            if check_parent:
                # Parent already attached above; just reset the flag.
                check_parent = False
                pass
            else:
                # No parent gene found: emit ID/Name only.
                data.attribute_string = ";".join([
                    "ID=cds" + str(nums["cds"]), "Name=" + name,
                    data.attribute_string])
            nums["cds"] += 1
        if "group" in data.attributes.keys():
            # RATT "group" links features transferred together; align the
            # strand with the first other member of the same group.
            ref_f = open(gff_file, "r")
            for ref in Gff3Parser().entries(ref_f):
                if "group" in ref.attributes.keys():
                    if (data.attributes["group"] ==
                            ref.attributes["group"]):
                        if (data.strand != ref.strand):
                            data.strand = ref.strand
                        break
            ref_f.close()
        out.write("\t".join([data.info_without_attributes,
                             data.attribute_string]) + "\n")
    out.close()
def read_gff(gff_file, tran_file, hypo):
    """Load annotation features and transcripts, both position-sorted.

    When *hypo* is true, entries whose product is a hypothetical
    protein are excluded.
    """
    order = lambda k: (k.seq_id, k.start, k.end, k.strand)
    gffs = []
    gh = open(gff_file)
    for entry in Gff3Parser().entries(gh):
        if (Helper().feature_without_notgene(entry)) and (
                entry.feature != "sORF"):
            if ("product" in entry.attributes.keys()) and (hypo):
                if "hypothetical protein" not in entry.attributes["product"]:
                    gffs.append(entry)
            else:
                gffs.append(entry)
    th = open(tran_file)
    trans = [entry for entry in Gff3Parser().entries(th)]
    gffs = sorted(gffs, key=order)
    trans = sorted(trans, key=order)
    gh.close()
    th.close()
    return gffs, trans
def read_gff(gff_file, features):
    """Load entries whose feature is in *features*, sorted by position.

    If *gff_file* does not exist, falls back to the name with the last
    two dotted suffixes replaced by ".gff" (merged-annotation naming).
    """
    gffs = []
    if not os.path.isfile(gff_file):
        filename = gff_file.split(".")
        gff_file = ".".join(filename[0:-2]) + ".gff"
    # Close the handle deterministically (the original leaked it).
    with open(gff_file, "r") as g_f:
        for entry in Gff3Parser().entries(g_f):
            if entry.feature in features:
                gffs.append(entry)
    gffs = sorted(gffs, key=lambda k: (k.seq_id, k.start))
    return gffs
def read_gff(filename):
    """Load all entries plus the subset of genes, both position-sorted.

    Returns (gffs, genes).
    """
    gffs = []
    genes = []
    # Close the handle deterministically (the original leaked it).
    with open(filename) as fh:
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "gene":
                genes.append(entry)
            gffs.append(entry)
    gffs = sorted(gffs, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    if len(genes) != 0:
        genes = sorted(genes,
                       key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return gffs, genes
def read_gff(seq_file, gff_file, tran_file):
    """Load genes, genome sequences and transcripts.

    Returns (genes, genome, trans): genes/trans are position-sorted and
    genome maps each fasta header to its sequence.
    """
    genome = {}
    genes = []
    trans = []
    # Context managers close the handles (the originals leaked them).
    with open(gff_file) as g_h:
        for entry in Gff3Parser().entries(g_h):
            if (entry.feature == "gene"):
                genes.append(entry)
    with open(tran_file) as t_h:
        for entry in Gff3Parser().entries(t_h):
            trans.append(entry)
    with open(seq_file, "r") as q_h:
        for line in q_h:
            line = line.strip()
            if line.startswith(">"):
                strain = line[1:]
                genome[strain] = ""
            else:
                genome[strain] = genome[strain] + line
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    trans = sorted(trans, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return genes, genome, trans
def read_predict_manual_gff(gff_file, args_ops):
    """Collect entries starting within the genome length limit.

    Returns (count, entries); each kept entry gets attributes["print"]
    set to False for later bookkeeping.
    """
    hits = []
    limit = int(args_ops.gene_length)
    handle = open(gff_file, "r")
    for record in Gff3Parser().entries(handle):
        if (record.start <= limit):
            record.attributes["print"] = False
            hits.append(record)
    handle.close()
    return len(hits), hits
def read_gff(filename, index):
    """Load entries, seeding attribute *index* with "NA", sorted."""
    records = []
    handle = open(filename, "r")
    parser = Gff3Parser()
    for record in parser.entries(handle):
        record.attributes[index] = "NA"
        records.append(record)
    handle.close()
    return sorted(records,
                  key=lambda r: (r.seq_id, r.start, r.end, r.strand))
def check_overlap(table_file, gff_file):
    """Recompute CDS-overlap columns of an sRNA table in place.

    For every sRNA row the forward/reverse overlapping CDS names and
    overlap lengths (with percentage) are rewritten; the temporary table
    then replaces *table_file*.
    """
    out = open(table_file + "tmp", "w")
    gffs = []
    gff_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(gff_f):
        if Helper().feature_without_notgene(entry):
            gffs.append(entry)
    gff_f.close()  # was leaked in the original
    fh = open(table_file, "r")
    out.write("\t".join([
        "Rank", "Genome", "Name", "Start", "End", "Strand",
        "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates",
        "Lib_type", "Best_avg_coverage", "Track/Coverage",
        "Normalized_secondary_energy_change(by_length)", "sRNA_types",
        "Conflict_sORF", "nr_hit_number", "sRNA_hit_number",
        "nr_hit_top3|ID|e-value|score", "sRNA_hit|e-value|score",
        "Overlap_CDS_forward", "Overlap_nts_forward", "Overlap_CDS_reverse",
        "Overlap_nts_reverse", "End_with_terminator",
        "Associated_promoter", "sRNA_length"
    ]) + "\n")
    for row in csv.reader(fh, delimiter='\t'):
        if row[3] != "Start":  # skip the old header row
            overlaps = {"forward": [], "reverse": [],
                        "CDS_f": [], "CDS_r": []}
            start = int(row[3])
            end = int(row[4])
            for gff in gffs:
                # Any of the four containment/overlap configurations.
                if ((gff.end < end) and (gff.end > start) and (
                        gff.start <= start)) or (
                        (gff.start > start) and (gff.start < end) and (
                        gff.end >= end)) or (
                        (gff.end >= end) and (gff.start <= start)) or (
                        (gff.end <= end) and (gff.start >= start)):
                    overlap = min(gff.end, end) - max(gff.start, start) + 1
                    percent = "{0:.0f}%".format(
                        (float(overlap) / float(end - start + 1)) * 100)
                    if gff.strand == "+":
                        overlaps["forward"].append(
                            str(overlap) + "(" + str(percent) + ")")
                        overlaps["CDS_f"].append(import_cds(gff))
                    else:
                        overlaps["reverse"].append(
                            str(overlap) + "(" + str(percent) + ")")
                        overlaps["CDS_r"].append(import_cds(gff))
            if len(overlaps["forward"]) == 0:
                overlaps["forward"] = ["NA"]
                overlaps["CDS_f"] = ["NA"]
            if len(overlaps["reverse"]) == 0:
                overlaps["reverse"] = ["NA"]
                overlaps["CDS_r"] = ["NA"]
            out.write("\t".join(row[0:19] + [
                ";".join(overlaps["CDS_f"]), ";".join(overlaps["forward"]),
                ";".join(overlaps["CDS_r"]), ";".join(overlaps["reverse"])
            ] + row[21:]) + "\n")
    fh.close()
    # Close before moving, otherwise buffered rows could be lost
    # (the original moved the file while "out" was still open).
    out.close()
    shutil.move(table_file + "tmp", table_file)
def read_gff(gff_file):
    """Load CDS/tRNA/rRNA entries sorted by position."""
    keep = ("CDS", "tRNA", "rRNA")
    handle = open(gff_file)
    cdss = [entry for entry in Gff3Parser().entries(handle)
            if entry.feature in keep]
    handle.close()
    return sorted(cdss,
                  key=lambda k: (k.seq_id, k.start, k.end, k.strand))
def read_gffs(gff_files, feature):
    """Load gff entries for circos-style comparison.

    When *feature* is "transcript", *gff_files* is a single path and the
    result is {"transcript": sorted entries}. Otherwise *gff_files* is a
    list of glob patterns and the result maps a running integer to the
    sorted CDS/exon/repeat_unit/tRNA/rRNA/ncRNA entries of each matched
    file.
    """
    gffs = {}
    if feature == "transcript":
        gffs["transcript"] = []
        gff_f = open(gff_files, "r")
        for entry in Gff3Parser().entries(gff_f):
            gffs["transcript"].append(entry)
        gff_f.close()
        gffs["transcript"] = sorted(
            gffs["transcript"],
            key=lambda x: (x.seq_id, x.start, x.end, x.strand))
    else:
        num = 0
        for files in gff_files:
            for gff_file in glob(files):
                gffs[num] = []
                gff_f = open(gff_file, "r")
                for entry in Gff3Parser().entries(gff_f):
                    parent = None
                    if (entry.feature == "CDS") or (
                            entry.feature == "exon") or (
                            entry.feature == "repeat_unit") or (
                            entry.feature == "tRNA") or (
                            entry.feature == "rRNA") or (
                            entry.feature == "ncRNA"):
                        # Preserve the original Parent across cleanup.
                        if "Parent" in entry.attributes.keys():
                            parent = entry.attributes["Parent"]
                        # NOTE(review): the return value is discarded here
                        # while other call sites assign it back — confirm
                        # del_attributes mutates the entry in place.
                        # "Parent" is also listed twice (harmless dup?).
                        del_attributes(entry, [
                            "associated_tran", "parent_tran",
                            "Parent", "Parent"])
                        if parent is not None:
                            entry.attributes["Parent"] = parent
                        entry.attributes["print"] = False
                        gffs[num].append(entry)
                gff_f.close()
                gffs[num] = sorted(
                    gffs[num],
                    key=lambda x: (x.seq_id, x.start, x.end, x.strand))
                num += 1
    return gffs
def filter_frag(srna_table, srna_gff):
    """Keep only 5'UTR/interCDS sRNAs supported by fragmented libraries.

    3'UTR-derived and non-UTR sRNAs are kept unconditionally; the gff
    and table files are rewritten in place.
    """
    out = open("tmp_srna.gff", "w")
    out_ta = open("tmp_srna.csv", "w")
    out.write("##gff-version 3\n")
    g_f = open(srna_gff, "r")
    gffs = [entry for entry in Gff3Parser().entries(g_f)]
    fh = open(srna_table, "r")
    tables = [row for row in csv.reader(fh, delimiter='\t')]

    def _same_candidate(gff, row):
        # A gff entry and a table row describe the same sRNA candidate.
        return ((gff.seq_id == row[0]) and (gff.start == int(row[2])) and
                (gff.end == int(row[3])) and (gff.strand == row[4]))

    new_gffs = []
    for gff in gffs:
        if ("UTR_type" in gff.attributes.keys()):
            if ("5utr" in gff.attributes["UTR_type"]) or (
                    "interCDS" in gff.attributes["UTR_type"]):
                for table in tables:
                    if _same_candidate(gff, table) and (
                            "frag" in table[5]):
                        new_gffs.append(gff)
            elif "3utr" in gff.attributes["UTR_type"]:
                new_gffs.append(gff)
        else:
            new_gffs.append(gff)
    new_tables = []
    for table in tables:
        for gff in new_gffs:
            if _same_candidate(gff, table):
                new_tables.append(table)
                out_ta.write("\t".join(table) + "\n")
    for gff in new_gffs:
        for table in new_tables:
            if _same_candidate(gff, table):
                out.write(gff.info + "\n")
    g_f.close()
    fh.close()
    out.close()
    out_ta.close()
    os.remove(srna_gff)
    os.remove(srna_table)
    shutil.move("tmp_srna.gff", srna_gff)
    shutil.move("tmp_srna.csv", srna_table)
def read_gff(gff_file):
    """Load coding-related features and genes, both position-sorted.

    Returns (cdss, genes): cdss holds every entry accepted by
    Helper().feature_without_notgene; genes holds the "gene" entries.
    """
    cdss = []
    genes = []
    # Context manager closes the handle (the original leaked it).
    with open(gff_file, "r") as g_f:
        for entry in Gff3Parser().entries(g_f):
            if (Helper().feature_without_notgene(entry)):
                cdss.append(entry)
            if entry.feature == "gene":
                genes.append(entry)
    cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return cdss, genes
def read_gff(gff_file, type_):
    """Load non-gene features, excluding entries of *type_* itself.

    For "riboswitch" runs every non-riboswitch feature is kept; for
    "thermometer" runs every non-RNA_thermometer feature is kept.
    """
    # Feature name to exclude for each run type.
    excluded = {"riboswitch": "riboswitch",
                "thermometer": "RNA_thermometer"}
    handle = open(gff_file)
    cdss = []
    for entry in Gff3Parser().entries(handle):
        if (Helper().feature_without_notgene(entry)):
            if type_ in excluded and entry.feature != excluded[type_]:
                cdss.append(entry)
    handle.close()
    return sorted(cdss,
                  key=lambda k: (k.seq_id, k.start, k.end, k.strand))
def fill_gap(gff_file, ta_file, type_, output):
    """Compare transcripts with annotated genes and write merged output.

    type_ "overlap" handles transcripts overlapping genes; "uni"
    handles transcripts without gene overlap.
    """
    tas = []
    genes = []
    print_list = []
    ta_f = open(ta_file, "r")
    gff_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(ta_f):
        tas.append(entry)
    ta_f.close()
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for entry in Gff3Parser().entries(gff_f):
        if (entry.feature == "gene") or (entry.feature == "CDS") or (
                entry.feature == "rRNA") or (entry.feature == "tRNA"):
            genes.append(entry)
    gff_f.close()
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    out = open(output, "w")
    out.write("##gff-version 3\n")
    if type_ == "overlap":
        overlap(tas, genes, print_list, out)
    elif type_ == "uni":
        uni(tas, genes, out)
    # Close so buffered gff lines are flushed (the original leaked "out").
    out.close()
def fill_gap(gff_file, ta_file, type_, output, modify):
    '''Compare transcripts with the genome annotation to modify the
    transcripts.

    type_ "overlap" handles transcripts overlapping genes (honouring the
    *modify* options); "uni" handles transcripts without gene overlap.
    '''
    tas = []
    genes = []
    ta_f = open(ta_file, "r")
    gff_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(ta_f):
        tas.append(entry)
    ta_f.close()
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for entry in Gff3Parser().entries(gff_f):
        if (entry.feature == "gene") or (entry.feature == "CDS") or (
                entry.feature == "rRNA") or (entry.feature == "tRNA"):
            genes.append(entry)
    gff_f.close()
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    out = open(output, "w")
    out.write("##gff-version 3\n")
    if type_ == "overlap":
        overlap(tas, genes, out, modify)
    elif type_ == "uni":
        uni(tas, genes, out)
    # Close so buffered gff lines are flushed (the original leaked "out").
    out.close()
def output_coverage(table_file, gff_file, cutoff_cover, stat_file,
                    out_folder):
    """Filter sRNA candidates by coverage and write table/gff/statistics.

    Rows with best average coverage >= cutoff_cover are re-ranked and
    written to tmp files; the frequency of candidates above each
    coverage threshold goes to *stat_file*.
    """
    out = open(os.path.join(out_folder, "tmp_srna_table"), "w")
    out_g = open(os.path.join(out_folder, "tmp_srna_gff"), "w")
    out.write("\t".join([
        "Rank", "Genome", "Name", "Start", "End", "Strand",
        "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates",
        "Lib_type", "Best_avg_coverage", "Best_highest_coverage",
        "Best_lower_coverage", "Track/Coverage",
        "Normalized_secondary_energy_change(by_length)",
        "UTR_derived/Intergenic", "Confliction_of_sORF", "nr_hit_number",
        "sRNA_hit_number", "nr_hit_top3|ID|e-value", "sRNA_hit|e-value",
        "Overlap_CDS", "Overlap_percent", "End_with_terminator"
    ]) + "\n")
    out_g.write("##gff-version 3\n")
    stat_out = open(stat_file, "w")
    # Coverage thresholds: 5, 10..90, 100..900, 1000..4500.
    nums = {5: 0}
    for i in range(10, 100, 10):
        nums[i] = 0
    for i in range(100, 1000, 100):
        nums[i] = 0
    for i in range(1000, 5000, 500):
        nums[i] = 0
    gffs = []
    gh = open(gff_file, "r")
    for entry in Gff3Parser().entries(gh):
        gffs.append(entry)
    gh.close()  # was leaked in the original
    fh = open(table_file, "r")
    rank = 1
    new_gffs = []
    for row in csv.reader(fh, delimiter='\t'):
        if row[0] != "rank":  # skip the input header row
            for cutoff in nums.keys():
                if float(row[10]) >= cutoff:
                    nums[cutoff] += 1
            if float(row[10]) >= cutoff_cover:
                row[0] = str(rank)
                out.write("\t".join(row) + "\n")
                rank += 1
                # Carry the matching gff entry over to the filtered gff.
                for gff in gffs:
                    if (row[1] == gff.seq_id) and (row[3] == str(
                            gff.start)) and (row[4] == str(
                            gff.end)) and (row[5] == gff.strand):
                        new_gffs.append(gff)
    fh.close()  # was leaked in the original
    sort_gffs = sorted(new_gffs,
                       key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for gff in sort_gffs:
        out_g.write(gff.info + "\n")
    coverlist = sorted(nums, key=lambda key: nums[key])
    stat_out.write("coverage\tfrequency\n")
    for cover in coverlist:
        stat_out.write("\t".join([str(cover), str(nums[cover])]) + "\n")
    # Close all writers so buffered output reaches disk (the original
    # leaked every handle).
    out.close()
    out_g.close()
    stat_out.close()
def gen_promoter_table(input_file, output_file, tss_file, type_):
    '''Generate the table of promoters based on MEME (or GLAM2) output.

    Each motif site is matched back to its TSS and written as a
    Genome/TSS_position/TSS_strand/Motif row.
    '''
    tsss = []
    gff_f = open(tss_file, "r")
    for entry in Gff3Parser().entries(gff_f):
        tsss.append(entry)
    gff_f.close()  # was leaked in the original
    out = open(output_file, "w")
    out.write("\t".join(["Genome", "TSS_position",
                         "TSS_strand", "Motif"]) + "\n")
    detect = False
    num = 1
    with open(input_file) as fh:
        for line in fh:
            line = line.strip()
            if type_ == "meme":
                if line.startswith("MOTIF"):
                    # e.g. "MOTIF  <consensus> MEME-1" -> "<consensus>_1"
                    motif = line.split("MEME")[0].strip()
                    datas = motif.split(" ")
                    motif = datas[0] + "_" + datas[-1]
                    detect = False
                elif (line.startswith("Sequence name")) and (
                        line.endswith("Site")):
                    detect = True
                elif (len(line) == 0):
                    detect = False
                elif (detect) and (not line.startswith("---")):
                    # Site tags look like "<start>_<strand>_<seq_id...>".
                    tag = line.split(" ")[0]
                    datas = tag.split("_")
                    for tss in tsss:
                        if ("_".join(datas[2:]) in tss.seq_id) and (
                                datas[0] == str(tss.start)) and (
                                datas[1] == tss.strand):
                            out.write("\t".join(
                                [tss.seq_id, datas[0],
                                 datas[1], motif]) + "\n")
            elif type_ == "glam2":
                if line.startswith("*"):
                    detect = True
                    motif = "MOTIF_" + str(num)
                    num += 1
                elif len(line) == 0:
                    detect = False
                elif detect:
                    datas = line.split(" ")[0].split("_")
                    for tss in tsss:
                        if ("_".join(datas[2:]) in tss.seq_id) and (
                                datas[0] == str(tss.start)) and (
                                datas[1] == tss.strand):
                            out.write("\t".join(
                                [tss.seq_id, datas[0],
                                 datas[1], motif]) + "\n")
    # Flush/close the table (the original never closed "out").
    out.close()
def read_gff(gff_file):
    """Load CDS/rRNA/tRNA entries and genes, both position-sorted.

    Returns (cdss, genes).
    """
    cdss = []
    genes = []
    # Context manager closes the handle (the original leaked it).
    with open(gff_file, "r") as g_f:
        for entry in Gff3Parser().entries(g_f):
            if (entry.feature == "CDS") or \
                    (entry.feature == "rRNA") or \
                    (entry.feature == "tRNA"):
                cdss.append(entry)
            if entry.feature == "gene":
                genes.append(entry)
    cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return cdss, genes
def __init__(self, args_term):
    """Wire up helpers and the output-folder layout for terminator
    prediction (TranstermHP + coverage-based detection)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_parser = Gff3Parser()
    # "tmp" subfolders hold the per-genome split input files.
    self.gff_path = os.path.join(args_term.gffs, "tmp")
    self.fasta_path = os.path.join(args_term.fastas, "tmp")
    self.tran_path = os.path.join(args_term.trans, "tmp")
    self.outfolder = {
        "term": os.path.join(args_term.out_folder, "gffs"),
        "csv": os.path.join(args_term.out_folder, "tables")
    }
    # Four candidate classes: all, expressed, best, non-expressed —
    # mirrored for gff output (terms) and table output (csvs).
    self.terms = {
        "all": os.path.join(self.outfolder["term"], "all_candidates"),
        "express": os.path.join(self.outfolder["term"],
                                "expressed_candidates"),
        "best": os.path.join(self.outfolder["term"], "best_candidates"),
        "non": os.path.join(self.outfolder["term"],
                            "non_expressed_candidates")
    }
    self.csvs = {
        "all": os.path.join(self.outfolder["csv"], "all_candidates"),
        "express": os.path.join(self.outfolder["csv"],
                                "expressed_candidates"),
        "best": os.path.join(self.outfolder["csv"], "best_candidates"),
        "non": os.path.join(self.outfolder["csv"],
                            "non_expressed_candidates")
    }
    self.combine_path = os.path.join(self.gff_path, "combine")
    # Scratch paths/names for TranstermHP runs and the merge steps.
    self.tmps = {
        "transterm": os.path.join(os.getcwd(), "tmp_transterm"),
        "hp": "transtermhp",
        "hp_gff": "transtermhp.gff",
        "hp_path": "tmp_transterm/tmp",
        "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
        "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
        "gff": "tmp.gff",
        "folder": os.path.join(os.getcwd(), "tmp")
    }
    self.suffixs = {
        "gff": "term.gff",
        "csv": "term.csv",
        "allgff": "term_all.gff"
    }
    if args_term.srnas:
        self.srna_path = os.path.join(args_term.srnas, "tmp")
    else:
        self.srna_path = None
    self._make_gff_folder()
def print_coverage(trans, out, out_gff, wigs_f, wigs_r, table_best,
                   gff_file):
    """Write the transcript table (*out*) and gff (*out_gff*) rows with
    per-track coverage information.

    When table_best is true only the best track is printed; otherwise
    every track's average coverage is listed.
    """
    genes = []
    if gff_file is not None:
        gff_f = open(gff_file, "r")
        for entry in Gff3Parser().entries(gff_f):
            if (entry.feature == "gene"):
                genes.append(entry)
        gff_f.close()  # was leaked in the original
    for tran in trans:
        infos = {}
        tran.attributes["detect_lib"] = tran.attributes[
            "detect_lib"].replace("tex_notex", "TEX+/-")
        out.write("\t".join([
            tran.seq_id, tran.attributes["Name"], str(tran.start),
            str(tran.end), tran.strand, tran.attributes["detect_lib"]]))
        compare_ta_genes(tran, genes, out)
        print_associate("associated_tss", tran, out)
        print_associate("associated_term", tran, out)
        # detect_coverage fills "infos" with {track: {"avg": ...}}.
        if tran.strand == "+":
            detect_coverage(wigs_f, tran, infos)
        else:
            detect_coverage(wigs_r, tran, infos)
        out.write("\t")
        best = -1
        best_track = ""
        best_cover = {}
        for track, cover in infos.items():
            if not table_best:
                if best != -1:
                    out.write(";")
                out.write("{0}(avg={1})".format(track, str(cover["avg"])))
            if cover["avg"] > best:
                best = cover["avg"]
                best_track = track
                best_cover = cover
        if table_best:
            out.write("{0}(avg={1})".format(best_track,
                                            str(best_cover["avg"])))
        out.write("\n")
        # NOTE(review): if "infos" can be empty, best_cover has no "avg"
        # key and the line below raises KeyError — confirm detect_coverage
        # always yields at least one track.
        new_attrs = {}
        for key, value in tran.attributes.items():
            if ("high_coverage" not in key) and (
                    "low_coverage" not in key):
                new_attrs[key] = value
        new_attrs["best_avg_coverage"] = str(best_cover["avg"])
        attribute_string = ";".join(
            ["=".join(items) for items in new_attrs.items()])
        out_gff.write(
            "\t".join([tran.info_without_attributes,
                       attribute_string]) + "\n")
def read_gff(gff_file, tss_file):
    """Load generic entries and TSSs, both position-sorted."""
    order = lambda k: (k.seq_id, k.start, k.end, k.strand)
    parser = Gff3Parser()
    fh = open(gff_file)
    gffs = sorted([entry for entry in parser.entries(fh)], key=order)
    fh.close()
    tss_f = open(tss_file, "r")
    tsss = sorted([entry for entry in parser.entries(tss_f)], key=order)
    tss_f.close()
    return gffs, tsss
def read_file(seq_file, tran_file, gff_file):
    """Load genome sequences, transcripts, genes and a merged list.

    Returns (seq, tas, merges, genes): seq maps fasta headers to
    sequences; merges holds transcripts and genes together in input
    order; tas/genes are position-sorted.
    """
    seq = {}
    tas = []
    genes = []
    merges = []
    with open(seq_file, "r") as f_h:
        for line in f_h:
            line = line.strip()
            if line.startswith(">"):
                strain = line[1:]
                seq[strain] = ""
            else:
                seq[strain] = seq[strain] + line
    # Context managers close the gff handles (the originals leaked them).
    with open(tran_file, "r") as ta_fh:
        for entry in Gff3Parser().entries(ta_fh):
            tas.append(entry)
            merges.append(entry)
    with open(gff_file) as g_h:
        for entry in Gff3Parser().entries(g_h):
            if (entry.feature == "gene"):
                genes.append(entry)
                merges.append(entry)
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    genes = sorted(genes, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    return seq, tas, merges, genes
def longer_ta(ta_file, length, out_file):
    '''Merge overlapping transcripts to form complete transcripts.

    Transcripts on the same replicon/strand are extended over every
    transcript they overlap; merged duplicates are collapsed and only
    transcripts of at least *length* nt are printed to *out_file*.
    '''
    tas = []
    # Context manager closes the handle (the original leaked it).
    with open(ta_file) as ta_h:
        for entry in Gff3Parser().entries(ta_h):
            tas.append(entry)
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    # Extend each transcript over every transcript it overlaps; entries
    # are mutated in place so extensions propagate across iterations.
    for ta_1 in tas:
        for ta_2 in tas:
            if (ta_1.seq_id == ta_2.seq_id) and (
                    ta_1.strand == ta_2.strand):
                if (ta_1.start <= ta_2.start) and (
                        ta_1.end >= ta_2.start) and (
                        ta_1.end <= ta_2.end):
                    ta_1.end = ta_2.end
                elif (ta_1.start >= ta_2.start) and (
                        ta_1.start <= ta_2.end) and (
                        ta_1.end >= ta_2.end):
                    ta_1.start = ta_2.start
                elif (ta_1.start <= ta_2.start) and (
                        ta_1.end >= ta_2.end):
                    pass
                elif (ta_1.start >= ta_2.start) and (
                        ta_1.end <= ta_2.end):
                    ta_1.start = ta_2.start
                    ta_1.end = ta_2.end
    first = True
    out = open(out_file, "w")
    out.write("##gff-version 3\n")
    num = 0
    pre_ta = None
    tas = sorted(tas, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
    for ta in tas:
        if (ta.end - ta.start) >= length:
            if first:
                first = False
                print_file(ta, num, out)
                num += 1
            else:
                # Skip exact duplicates produced by the merge step.
                if (ta.seq_id == pre_ta.seq_id) and (
                        ta.strand == pre_ta.strand) and (
                        ta.start == pre_ta.start) and (
                        ta.end == pre_ta.end):
                    pass
                else:
                    print_file(ta, num, out)
                    num += 1
        pre_ta = ta
    out.close()