class ArgsContainer(object):
    """Collect and normalize command-line arguments for ANNOgesic subcommands.

    Each ``container_*`` method copies one subcommand's parsed arguments onto
    ``self`` (renaming them to the internal attribute names the pipeline
    expects) and returns ``self`` so the caller can use the populated
    container directly.  The private ``_check_*`` / ``_parser_*`` helpers
    validate and post-process the raw values.
    """

    def __init__(self):
        # Project helpers used for wig/gff parsing and folder management.
        self.multiparser = Multiparser()
        self.helper = Helper()

    def _check_replicates(self, replicates_tex, replicates_frag):
        """Build the replicate-number dict; ``-1`` marks an unused library type.

        Exits the program if neither replicate number was assigned.
        """
        if (replicates_tex is not None) and (replicates_frag is not None):
            replicates = {"tex": int(replicates_tex),
                          "frag": int(replicates_frag)}
        elif replicates_tex is not None:
            replicates = {"tex": int(replicates_tex), "frag": -1}
        elif replicates_frag is not None:
            replicates = {"tex": -1, "frag": int(replicates_frag)}
        else:
            print("Error:No replicates number assign!!!")
            sys.exit()
        return replicates

    def _check_libs(self, tex_notex_libs, frag_libs):
        """Merge TEX+/- and fragmented library lists into one list.

        Exits the program if no library list was supplied at all.
        """
        if (tex_notex_libs is None) and (frag_libs is None):
            print("Error: please input proper libraries!!")
            # Fix: without exiting here "libs" stays unbound and the
            # return statement below raises UnboundLocalError.  Exit like
            # the other validators (_check_replicates) do.
            sys.exit()
        if (tex_notex_libs is not None) and (frag_libs is not None):
            libs = tex_notex_libs + frag_libs
        elif tex_notex_libs is not None:
            libs = tex_notex_libs
        elif frag_libs is not None:
            libs = frag_libs
        return libs

    def _parser_combine_wigs(self, subcommand):
        """Parse the GFF/wig folders and combine wig files per genome.

        Sets ``tex_path`` / ``frag_path`` / ``merge_wigs`` / ``wig_path`` on
        ``self`` and exits if no wig folder was assigned at all.
        """
        self.tex_path = None
        self.frag_path = None
        self.multiparser.parser_gff(self.gffs, None)
        if subcommand == "terminator":
            # Terminator works on the already-split gff files in "tmp".
            gff_path = os.path.join(self.gffs, "tmp")
            self.multiparser.parser_gff(gff_path, None)
        else:
            gff_path = self.gffs
        if self.tex_wigs is not None:
            self.tex_path = os.path.join(self.tex_wigs, "tmp")
            self.multiparser.parser_wig(self.tex_wigs)
            self.multiparser.combine_wig(gff_path, self.tex_path,
                                         None, self.libs)
            self.merge_wigs = self.tex_wigs
            self.wig_path = self.tex_path
        if self.frag_wigs is not None:
            self.frag_path = os.path.join(self.frag_wigs, "tmp")
            self.multiparser.parser_wig(self.frag_wigs)
            self.multiparser.combine_wig(gff_path, self.frag_path,
                                         None, self.libs)
            self.merge_wigs = self.frag_wigs
            self.wig_path = self.frag_path
        if (self.tex_path is not None) and (self.frag_path is not None):
            # Both library types present: merge them into one folder.
            self = self._merge_wig()
        if (self.tex_path is None) and (self.frag_path is None):
            print("Error: There is no proper wig files assigned!!")
            sys.exit()
        return self

    def _merge_wig(self):
        """Copy TEX and fragmented wig files into one merged folder."""
        self.merge_wigs = os.path.join(self.out_folder, "merge_wigs")
        if (self.tex_wigs is not None) and (self.frag_wigs is not None):
            self.helper.check_make_folder(self.merge_wigs)
            self.wig_path = os.path.join(self.merge_wigs, "tmp")
            self.helper.check_make_folder(self.wig_path)
            for wig in os.listdir(self.tex_wigs):
                if os.path.isfile(os.path.join(self.tex_wigs, wig)):
                    shutil.copy(os.path.join(self.tex_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.frag_wigs):
                if os.path.isfile(os.path.join(self.frag_wigs, wig)):
                    shutil.copy(os.path.join(self.frag_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.tex_path):
                if os.path.isfile(os.path.join(self.tex_path, wig)):
                    shutil.copy(os.path.join(self.tex_path, wig),
                                self.wig_path)
            for wig in os.listdir(self.frag_path):
                if os.path.isfile(os.path.join(self.frag_path, wig)):
                    # Fragmented tracks are appended to the copied TEX track
                    # of the same name rather than overwriting it.
                    self.helper.merge_file(
                        os.path.join(self.frag_path, wig),
                        os.path.join(self.wig_path, wig))
        elif self.tex_wigs is not None:
            self.merge_wigs = self.tex_wigs
        elif self.frag_wigs is not None:
            self.merge_wigs = self.frag_wigs
        return self

    def _deal_multi_inputs(self, inputs, file_type, num, command):
        """Split a comma-separated CLI value into a typed list.

        ``file_type`` selects conversion ("float"/"int", anything else keeps
        the raw substrings).  If ``num`` is given and the count differs, an
        error naming ``command`` is printed.
        NOTE(review): unlike the other validators this only warns on a count
        mismatch and still returns the parsed list — preserved as-is since
        callers may rely on the best-effort behavior.
        Returns ``None`` unchanged when ``inputs`` is ``None``.
        """
        if inputs is not None:
            datas = inputs.split(",")
            if num is not None:
                if (len(datas) != num):
                    print("Error: the amount of {0} is not correct!!".format(
                        command))
            new_inputs = []
            for data in datas:
                if file_type == "float":
                    new_inputs.append(float(data.strip()))
                elif file_type == "int":
                    new_inputs.append(int(data.strip()))
                else:
                    new_inputs.append(data)
            return new_inputs
        else:
            return inputs

    def container_ratt(self, ratt_path, element, transfer_type,
                       ref_embl_gbk, target_fasta, ref_fasta, ratt_folder,
                       convert_to_gff_rnt_ptt, tar_annotation_folder,
                       compare_pair):
        """Store the arguments of the annotation transfer (RATT) subcommand."""
        self.ratt_path = ratt_path
        self.element = element
        self.transfer_type = transfer_type
        self.ref_embls = ref_embl_gbk
        self.tar_fastas = target_fasta
        self.ref_fastas = ref_fasta
        self.output_path = ratt_folder
        self.convert = convert_to_gff_rnt_ptt
        self.gff_outfolder = tar_annotation_folder
        self.pairs = self._deal_multi_inputs(compare_pair, "str", None, None)
        return self

    def container_tsspredator(self, TSSpredator_path, compute_program,
                              fasta_folder, annotation_folder, wig_folder,
                              lib, output_prefix, height, height_reduction,
                              factor, factor_reduction, base_height,
                              enrichment_factor, processing_factor,
                              replicate_match, out_folder, statistics,
                              validate_gene, merge_manual,
                              compare_transcript_assembly, fuzzy, utr_length,
                              cluster, length, re_check_orphan,
                              overlap_feature, reference_gff_folder,
                              remove_low_expression):
        """Store the arguments of the TSS/processing-site prediction subcommand."""
        self.tsspredator_path = TSSpredator_path
        self.program = compute_program
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.wig_folder = wig_folder
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.output_prefixs = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.height = height
        self.height_reduction = height_reduction
        self.factor = factor
        self.factor_reduction = factor_reduction
        self.base_height = base_height
        self.enrichment_factor = enrichment_factor
        self.processing_factor = processing_factor
        self.repmatch = replicate_match
        self.out_folder = out_folder
        self.stat = statistics
        self.validate = validate_gene
        self.manual = merge_manual
        self.ta_files = compare_transcript_assembly
        self.fuzzy = fuzzy
        self.utr_length = utr_length
        self.cluster = cluster
        self.nt_length = length
        self.check_orphan = re_check_orphan
        self.overlap_feature = overlap_feature
        self.references = reference_gff_folder
        self.remove_low_expression = remove_low_expression
        return self

    def container_optimize(self, TSSpredator_path, fasta_file,
                           annotation_file, wig_folder, manual, out_folder,
                           strain_name, max_height, max_height_reduction,
                           max_factor, max_factor_reduction, max_base_height,
                           max_enrichment_factor, max_processing_factor,
                           utr_length, lib, output_prefix, cluster, length,
                           core, program, replicate_match, steps):
        """Store the arguments of the TSSpredator parameter optimization."""
        self.tsspredator_path = TSSpredator_path
        self.fastas = fasta_file
        self.gffs = annotation_file
        self.wigs = wig_folder
        self.manual = manual
        self.output_folder = out_folder
        self.project_strain = strain_name
        self.height = max_height
        self.height_reduction = max_height_reduction
        self.factor = max_factor
        self.factor_reduction = max_factor_reduction
        self.base_height = max_base_height
        self.enrichment = max_enrichment_factor
        self.processing = max_processing_factor
        self.utr = utr_length
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.replicate_name = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.cluster = cluster
        self.length = length
        self.cores = core
        self.program = program
        self.replicate = replicate_match
        self.steps = steps
        return self

    def container_terminator(
            self, TransTermHP_path, expterm_path, RNAfold_path, out_folder,
            fasta_folder, annotation_folder, transcript_folder, srna,
            statistics, tex_wig_folder, frag_wig_folder, decrease,
            highest_coverage, fuzzy_detect_coverage, fuzzy_within_transcript,
            fuzzy_downstream_transcript, fuzzy_within_gene,
            fuzzy_downstream_gene, transtermhp_folder, tex_notex_libs,
            frag_libs, tex_notex, replicates_tex, replicates_frag,
            table_best, min_loop_length, max_loop_length, min_stem_length,
            max_stem_length, min_AT_tail_length, miss_rate, range_u):
        """Store the arguments of the terminator detection subcommand."""
        self.TransTermHP_path = TransTermHP_path
        self.expterm_path = expterm_path
        self.RNAfold_path = RNAfold_path
        self.out_folder = out_folder
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.srnas = srna
        self.stat = statistics
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.decrease = decrease
        self.cutoff_coverage = highest_coverage
        self.fuzzy = fuzzy_detect_coverage
        self.fuzzy_up_ta = fuzzy_within_transcript
        self.fuzzy_down_ta = fuzzy_downstream_transcript
        self.fuzzy_up_gene = fuzzy_within_gene
        self.fuzzy_down_gene = fuzzy_downstream_gene
        self.hp_folder = transtermhp_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.table_best = table_best
        self.min_loop = min_loop_length
        self.max_loop = max_loop_length
        self.min_stem = min_stem_length
        self.max_stem = max_stem_length
        self.at_tail = min_AT_tail_length
        self.miss_rate = miss_rate
        self.range_u = range_u
        self = self._parser_combine_wigs("terminator")
        return self

    def container_transcript(
            self, frag_wig_path, tex_wig_path, tex_notex, length,
            annotation_folder, height, width, tolerance, tolerance_coverage,
            replicates_tex, replicates_frag,
            transcript_assembly_output_folder, compare_TSS,
            compare_genome_annotation, TSS_fuzzy, tex_treated_libs,
            fragmented_libs, compare_feature_genome, table_best,
            terminator_folder, fuzzy_term):
        """Store the arguments of the transcript assembly subcommand."""
        self.frag_wigs = frag_wig_path
        self.tex_wigs = tex_wig_path
        self.tex = tex_notex
        self.length = length
        self.gffs = annotation_folder
        self.height = height
        self.width = width
        self.tolerance = tolerance
        self.low_cutoff = tolerance_coverage
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.out_folder = transcript_assembly_output_folder
        self.compare_tss = compare_TSS
        self.compare_cds = compare_genome_annotation
        self.fuzzy = TSS_fuzzy
        self.tlibs = self._deal_multi_inputs(tex_treated_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(fragmented_libs, "str",
                                             None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.c_feature = self._deal_multi_inputs(compare_feature_genome,
                                                 "str", None, None)
        self.table_best = table_best
        self.terms = terminator_folder
        self.fuzzy_term = fuzzy_term
        self = self._parser_combine_wigs("transcript")
        return self

    def container_utr(self, tss_folder, annotation_folder,
                      transcript_assembly_folder, terminator_folder,
                      terminator_fuzzy, utr_folder, tss_source, base_5utr,
                      length, base_3utr):
        """Store the arguments of the UTR detection subcommand."""
        self.tsss = tss_folder
        self.gffs = annotation_folder
        self.trans = transcript_assembly_folder
        self.terms = terminator_folder
        self.fuzzy = terminator_fuzzy
        self.out_folder = utr_folder
        self.source = tss_source
        self.base_5utr = base_5utr
        self.base_3utr = base_3utr
        self.length = length
        return self

    def container_srna(
            self, Vienna_folder, Vienna_utils, blast_plus_folder,
            ps2pdf14_path, srna_folder, UTR_derived_sRNA, annotation_folder,
            TSS_folder, transcript_assembly_folder, TSS_intergenic_fuzzy,
            TSS_5UTR_fuzzy, TSS_3UTR_fuzzy, TSS_interCDS_fuzzy, import_info,
            tex_wig_folder, frag_wig_folder, processing_site_folder,
            fasta_folder, mountain_plot, nr_format, srna_format,
            sRNA_database_path, nr_database_path, cutoff_energy,
            run_intergenic_TEX_coverage, run_intergenic_noTEX_coverage,
            run_intergenic_fragmented_coverage, run_antisense_TEX_coverage,
            run_antisense_noTEX_coverage, run_antisense_fragmented_coverage,
            intergenic_tolerance, run_utr_TEX_coverage,
            run_utr_noTEX_coverage, run_utr_fragmented_coverage, max_length,
            min_length, tex_notex_libs, frag_libs, replicates_tex,
            replicates_frag, tex_notex, blast_e_nr, blast_e_srna,
            detect_sRNA_in_CDS, table_best, decrease_intergenic,
            decrease_utr, fuzzy_intergenic, fuzzy_utr, cutoff_nr_hit, sORF,
            best_with_all_sRNAhit, best_without_sORF_candidate,
            overlap_percent_CDS, terminator_folder,
            terminator_fuzzy_in_CDS, terminator_fuzzy_out_CDS,
            best_with_terminator, ignore_hypothetical_protein, TSS_source,
            min_utr_coverage, promoter_table, best_with_promoter,
            ranking_promoter, promoter_name):
        """Store the arguments of the sRNA detection subcommand."""
        self.vienna_path = Vienna_folder
        self.vienna_util = Vienna_utils
        self.blast_path = blast_plus_folder
        self.ps2pdf14_path = ps2pdf14_path
        self.out_folder = srna_folder
        self.utr_srna = UTR_derived_sRNA
        self.gffs = annotation_folder
        self.tss_folder = TSS_folder
        self.trans = transcript_assembly_folder
        self.fuzzy_inter_tss = TSS_intergenic_fuzzy
        self.fuzzy_5utr_tss = TSS_5UTR_fuzzy
        self.fuzzy_3utr_tss = TSS_3UTR_fuzzy
        self.fuzzy_intercds_tss = TSS_interCDS_fuzzy
        # One fuzzy value per sRNA source region, keyed by region name.
        self.fuzzy_tsss = {"5utr": self.fuzzy_5utr_tss,
                           "3utr": self.fuzzy_3utr_tss,
                           "interCDS": self.fuzzy_intercds_tss,
                           "inter": self.fuzzy_inter_tss}
        self.import_info = self._deal_multi_inputs(import_info, "str",
                                                   None, None)
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.pro_folder = processing_site_folder
        self.fastas = fasta_folder
        self.mountain = mountain_plot
        self.nr_format = nr_format
        self.srna_format = srna_format
        self.srna_database = sRNA_database_path
        self.nr_database = nr_database_path
        self.energy = cutoff_energy
        self.coverage_tex = self._deal_multi_inputs(
            run_intergenic_TEX_coverage, "float", 5,
            "--run_intergenic_TEX_coverage")
        self.coverage_notex = self._deal_multi_inputs(
            run_intergenic_noTEX_coverage, "float", 5,
            "--run_intergenic_noTEX_coverage")
        self.coverage_frag = self._deal_multi_inputs(
            run_intergenic_fragmented_coverage, "float", 5,
            "--run_intergenic_fragmented_coverage")
        self.anti_cover_tex = self._deal_multi_inputs(
            run_antisense_TEX_coverage, "float", 5,
            "--run_antisense_TEX_coverage")
        self.anti_cover_notex = self._deal_multi_inputs(
            run_antisense_noTEX_coverage, "float", 5,
            "--run_antisense_noTEX_coverage")
        self.anti_cover_frag = self._deal_multi_inputs(
            run_antisense_fragmented_coverage, "float", 5,
            "--run_antisense_fragmented_coverage")
        self.tolerance = intergenic_tolerance
        self.utr_tex_cover = self._deal_multi_inputs(
            run_utr_TEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        # Fix: the error message previously named the wrong flag
        # ("--run_utr_TEX_coverage") for the noTEX coverage list.
        self.utr_notex_cover = self._deal_multi_inputs(
            run_utr_noTEX_coverage, "str", 3, "--run_utr_noTEX_coverage")
        self.utr_frag_cover = self._deal_multi_inputs(
            run_utr_fragmented_coverage, "str", 3,
            "--run_utr_fragmented_coverage")
        self.max_len = max_length
        self.min_len = min_length
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.tex_notex = tex_notex
        self.e_nr = blast_e_nr
        self.e_srna = blast_e_srna
        self.in_cds = detect_sRNA_in_CDS
        self.table_best = table_best
        self.decrease_inter = decrease_intergenic
        self.decrease_utr = decrease_utr
        self.fuzzy_inter = fuzzy_intergenic
        self.fuzzy_utr = fuzzy_utr
        self.nr_hits_num = cutoff_nr_hit
        self.sorf_file = sORF
        self.all_hit = best_with_all_sRNAhit
        self.best_sorf = best_without_sORF_candidate
        self.cutoff_overlap = overlap_percent_CDS
        self.terms = terminator_folder
        self.fuzzy_b = terminator_fuzzy_in_CDS
        self.fuzzy_a = terminator_fuzzy_out_CDS
        self.best_term = best_with_terminator
        self.hypo = ignore_hypothetical_protein
        self.tss_source = TSS_source
        self.min_utr = min_utr_coverage
        self.promoter_table = promoter_table
        self.best_promoter = best_with_promoter
        if ranking_promoter < 1:
            print("Error: --ranking_time_promoter must larger than 1...")
            sys.exit()
        self.rank_promoter = ranking_promoter
        self.promoter_name = self._deal_multi_inputs(promoter_name, "str",
                                                     None, None)
        self = self._parser_combine_wigs("srna")
        return self

    def container_intersrna(self, file_type, files, args_srna, prefix,
                            gff_file, tran_file, tss_file, pro_file, fuzzy):
        """Fill per-genome settings for intergenic sRNA detection.

        Selects fragmented or TEX wig files/libraries depending on
        ``file_type`` and returns the updated ``args_srna``.
        """
        args_srna.file_type = file_type
        args_srna.gff_file = gff_file
        args_srna.tran_file = tran_file
        args_srna.tss_file = tss_file
        args_srna.pro_file = pro_file
        args_srna.fuzzy = fuzzy
        args_srna.prefix = prefix
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.cutoffs = args_srna.coverage_frag
            args_srna.tss_source = True
            args_srna.cut_notex = None
            args_srna.anti_notex_cutoff = None
        else:
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.cutoffs = args_srna.coverage_tex
            args_srna.tss_source = args_srna.tss_source
            args_srna.cut_notex = args_srna.coverage_notex
            args_srna.anti_notex_cutoff = args_srna.anti_cover_notex
        return args_srna

    def container_utrsrna(self, gff, tran, tss, files, pro, fasta,
                          file_type, prefix, args_srna):
        """Fill per-genome settings for UTR-derived sRNA detection."""
        args_srna.file_type = file_type
        args_srna.gff_file = gff
        args_srna.ta_file = tran
        args_srna.tss_file = tss
        args_srna.pro_file = pro
        args_srna.prefix = prefix
        args_srna.seq_file = fasta
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.utr_coverages = args_srna.utr_frag_cover
            args_srna.notex = None
        else:
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.utr_coverages = args_srna.utr_tex_cover
            args_srna.notex = args_srna.utr_notex_cover
        # The three coverage values map positionally to the UTR regions.
        args_srna.coverages = {"5utr": args_srna.utr_coverages[0],
                               "3utr": args_srna.utr_coverages[1],
                               "interCDS": args_srna.utr_coverages[2]}
        if args_srna.notex is not None:
            args_srna.cover_notex = {"5utr": args_srna.notex[0],
                                     "3utr": args_srna.notex[1],
                                     "interCDS": args_srna.notex[2]}
        else:
            args_srna.cover_notex = None
        return args_srna

    def extend_inter_container(self, args_srna, tsss, pros, wigs_f, wigs_r,
                               nums, output, out_table, texs, detects,
                               cutoff_coverage, notex):
        """Attach intergenic-sRNA working data to ``args_srna``."""
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wigs_f = wigs_f
        args_srna.wigs_r = wigs_r
        args_srna.nums = nums
        args_srna.output = output
        args_srna.out_table = out_table
        args_srna.texs = texs
        args_srna.detects = detects
        args_srna.cutoff_coverage = cutoff_coverage
        args_srna.notex = notex
        return args_srna

    def extend_utr_container(self, args_srna, cdss, tsss, pros, wig_fs,
                             wig_rs, out, out_t, texs):
        """Attach UTR-sRNA working data to ``args_srna``."""
        args_srna.cdss = cdss
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wig_fs = wig_fs
        args_srna.wig_rs = wig_rs
        args_srna.out = out
        args_srna.out_t = out_t
        args_srna.texs = texs
        args_srna.utrs = []
        args_srna.srnas = []
        return args_srna

    def container_sorf(self, sorf_folder, UTR_derived_sORF,
                       transcript_folder, annotation_folder, TSS_folder,
                       utr_length, min_length, max_length, tex_wig_folder,
                       frag_wig_folder, cutoff_intergenic_coverage,
                       cutoff_antisense_coverage, cutoff_5utr_coverage,
                       cutoff_3utr_coverage, cutoff_interCDS_coverage,
                       fasta_folder, tex_notex_libs, frag_libs, tex_notex,
                       replicates_tex, replicates_frag, table_best,
                       sRNA_folder, start_codon, stop_codon,
                       cutoff_background, fuzzy_rbs, rbs_not_after_TSS,
                       print_all_combination, best_no_sRNA, best_no_TSS,
                       ignore_hypothetical_protein, min_rbs_distance,
                       max_rbs_distance):
        """Store the arguments of the sORF detection subcommand."""
        self.out_folder = sorf_folder
        self.utr_detect = UTR_derived_sORF
        self.trans = transcript_folder
        self.gffs = annotation_folder
        self.tsss = TSS_folder
        self.utr_length = utr_length
        self.min_len = min_length
        self.max_len = max_length
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.cutoff_inter = cutoff_intergenic_coverage
        self.cutoff_anti = cutoff_antisense_coverage
        self.cutoff_5utr = cutoff_5utr_coverage
        self.cutoff_3utr = cutoff_3utr_coverage
        self.cutoff_intercds = cutoff_interCDS_coverage
        self.fastas = fasta_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.table_best = table_best
        self.srnas = sRNA_folder
        self.start_codon = self._deal_multi_inputs(start_codon, "str",
                                                   None, None)
        self.stop_codon = self._deal_multi_inputs(stop_codon, "str",
                                                  None, None)
        self.background = cutoff_background
        self.fuzzy_rbs = fuzzy_rbs
        self.noafter_tss = rbs_not_after_TSS
        self.print_all = print_all_combination
        self.no_srna = best_no_sRNA
        self.no_tss = best_no_TSS
        self.hypo = ignore_hypothetical_protein
        self.min_rbs = min_rbs_distance
        self.max_rbs = max_rbs_distance
        self = self._parser_combine_wigs("sorf")
        return self

    def container_srna_target(
            self, Vienna_folder, annotation_path, fasta_path, sRNA_path,
            query_sRNA, program, interaction_length, window_size_target,
            span_target, window_size_srna, span_srna,
            unstructured_region_RNAplex_target,
            unstructured_region_RNAplex_srna, unstructured_region_RNAup,
            energy_threshold, duplex_distance, top, starget_output_folder,
            process_rnaplex, process_rnaup, continue_rnaup,
            potential_target_start, potential_target_end, target_feature):
        """Store the arguments of the sRNA target prediction subcommand."""
        self.vienna_path = Vienna_folder
        self.gffs = annotation_path
        self.fastas = fasta_path
        self.srnas = sRNA_path
        self.query = self._deal_multi_inputs(query_sRNA, "str", None, None)
        self.program = program
        self.inter_length = interaction_length
        self.win_size_t = window_size_target
        self.span_t = span_target
        self.win_size_s = window_size_srna
        self.span_s = span_srna
        self.unstr_region_rnaplex_t = unstructured_region_RNAplex_target
        self.unstr_region_rnaplex_s = unstructured_region_RNAplex_srna
        self.unstr_region_rnaup = unstructured_region_RNAup
        self.energy = energy_threshold
        self.duplex_dist = duplex_distance
        self.top = top
        self.out_folder = starget_output_folder
        self.core_plex = process_rnaplex
        self.core_up = process_rnaup
        self.continue_rnaup = continue_rnaup
        self.tar_start = potential_target_start
        self.tar_end = potential_target_end
        self.features = self._deal_multi_inputs(target_feature, "str",
                                                None, None)
        return self

    def container_goterm(self, annotation_path, goterm_output_folder,
                         UniProt_id, go_obo, goslim_obo, transcript_path):
        """Store the arguments of the GO term mapping subcommand."""
        self.gffs = annotation_path
        self.out_folder = goterm_output_folder
        self.uniprot = UniProt_id
        self.go = go_obo
        self.goslim = goslim_obo
        self.trans = transcript_path
        return self

    def container_sublocal(self, Psortb_path, gff_path, fasta_path,
                           bacteria_type, difference_multi, merge_to_gff,
                           sublocal_output_folder, transcript_path):
        """Store the arguments of the subcellular localization subcommand."""
        self.psortb_path = Psortb_path
        self.gffs = gff_path
        self.fastas = fasta_path
        self.gram = bacteria_type
        self.fuzzy = difference_multi
        self.merge = merge_to_gff
        self.out_folder = sublocal_output_folder
        self.trans = transcript_path
        return self

    def container_ppi(self, gff_path, proteinID_strains,
                      without_strain_pubmed, species_STRING, score,
                      ppi_output_folder, node_size, query):
        """Store the arguments of the protein-protein interaction subcommand."""
        self.ptts = gff_path
        self.strains = self._deal_multi_inputs(proteinID_strains, "str",
                                               None, None)
        self.no_specific = without_strain_pubmed
        self.species = species_STRING
        self.score = score
        self.out_folder = ppi_output_folder
        self.size = node_size
        self.querys = self._deal_multi_inputs(query, "str", None, None)
        return self

    def container_promoter(self, MEME_path, promoter_output_folder,
                           tex_libs, TSS_folder, fasta_folder, num_motif,
                           nt_before_TSS, motif_width, TSS_source,
                           tex_wig_path, annotation_folder, combine_all,
                           e_value):
        """Store the arguments of the promoter (MEME) detection subcommand."""
        self.meme_path = MEME_path
        self.output_folder = promoter_output_folder
        self.input_libs = self._deal_multi_inputs(tex_libs, "str",
                                                  None, None)
        self.tsss = TSS_folder
        self.fastas = fasta_folder
        self.num_motif = num_motif
        self.nt_before = nt_before_TSS
        self.widths = self._deal_multi_inputs(motif_width, "str", None, None)
        self.source = TSS_source
        self.wigs = tex_wig_path
        self.gffs = annotation_folder
        self.combine = combine_all
        self.e_value = e_value
        return self

    def container_operon(self, TSS_folder, annotation_folder,
                         transcript_folder, UTR5_folder, UTR3_folder,
                         term_folder, TSS_fuzzy, term_fuzzy, min_length,
                         statistics, operon_output_folder, combine_gff,
                         operon_statistics_folder):
        """Store the arguments of the operon detection subcommand."""
        self.tsss = TSS_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.utr5s = UTR5_folder
        self.utr3s = UTR3_folder
        self.terms = term_folder
        self.tss_fuzzy = TSS_fuzzy
        self.term_fuzzy = term_fuzzy
        self.length = min_length
        self.statistics = statistics
        self.output_folder = operon_output_folder
        self.combine = combine_gff
        self.stat_folder = operon_statistics_folder
        return self

    def container_snp(self, samtools_path, bcftools_path, bam_type,
                      program, fasta_path, tex_bam_path, frag_bam_path,
                      quality, read_depth, snp_output_folder,
                      indel_fraction, chrom):
        """Store the arguments of the SNP calling subcommand."""
        self.samtools_path = samtools_path
        self.bcftools_path = bcftools_path
        self.types = bam_type
        self.program = self._deal_multi_inputs(program, "str", None, None)
        self.fastas = fasta_path
        self.normal_bams = tex_bam_path
        self.frag_bams = frag_bam_path
        self.quality = quality
        self.depth = read_depth
        self.out_folder = snp_output_folder
        self.fraction = indel_fraction
        # Map the ploidy keyword to the numeric value bcftools expects.
        if chrom == "haploid":
            chrom = "1"
        elif chrom == "diploid":
            chrom = "2"
        self.chrom = chrom
        return self

    def container_circrna(self, align, process, fasta_path,
                          annotation_path, tex_bam_path, fragmented_bam_path,
                          read_folder, circrna_stat_folder, support_reads,
                          segemehl_folder, samtools_path, start_ratio,
                          end_ratio, ignore_hypothetical_protein,
                          out_folder):
        """Store the arguments of the circular RNA detection subcommand."""
        self.align = align
        self.cores = process
        self.fastas = fasta_path
        self.gffs = annotation_path
        self.normal_bams = tex_bam_path
        self.frag_bams = fragmented_bam_path
        self.read_folder = read_folder
        self.stat_folder = circrna_stat_folder
        self.support = support_reads
        self.segemehl_path = segemehl_folder
        self.samtools_path = samtools_path
        self.start_ratio = start_ratio
        self.end_ratio = end_ratio
        self.hypo = ignore_hypothetical_protein
        self.output_folder = out_folder
        return self

    def container_ribos(self, infernal_path, riboswitch_ID, gff_path,
                        fasta_path, tss_path, transcript_path, Rfam,
                        ribos_output_folder, e_value, output_all,
                        database_folder, fuzzy, start_codon, min_dist_rbs,
                        max_dist_rbs, fuzzy_rbs, UTR_length):
        """Store the arguments of the riboswitch detection subcommand."""
        self.infernal_path = infernal_path
        self.ribos_id = riboswitch_ID
        self.gffs = gff_path
        self.fastas = fasta_path
        self.tsss = tss_path
        self.trans = transcript_path
        self.rfam = Rfam
        self.out_folder = ribos_output_folder
        self.e_value = e_value
        self.output_all = output_all
        self.database = database_folder
        self.fuzzy = fuzzy
        self.start_codons = self._deal_multi_inputs(start_codon, "str",
                                                    None, None)
        self.start_rbs = min_dist_rbs
        self.end_rbs = max_dist_rbs
        self.fuzzy_rbs = fuzzy_rbs
        self.utr = UTR_length
        return self

    def container_screen(self, main_gff, side_gffs, fasta, frag_wig_folder,
                         tex_wig_folder, height, tex_libs, frag_libs,
                         present, output_folder):
        """Store the arguments of the screenshot generation subcommand."""
        self.main_gff = main_gff
        self.side_gffs = self._deal_multi_inputs(side_gffs, "str",
                                                 None, None)
        self.fasta = fasta
        self.frag_wigs = frag_wig_folder
        self.tex_wigs = tex_wig_folder
        self.height = height
        self.tlibs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.present = present
        self.output_folder = output_folder
        return self
class TSSpredator(object):
    """Drive TSSpredator to detect TSSs / processing sites.

    Generates TSSpredator .ini config files from the parsed wig/gff/fasta
    folders, runs the Java tool per genome, converts its MasterTable output
    to gff, and post-processes (manual merge, validation, transcript
    comparison, statistics).

    NOTE(review): a second ``class TSSpredator`` is defined later in this
    module and shadows this one, so this earlier copy is dead code at import
    time — consider removing one of the two definitions.

    Fixes applied in this revision:
      * ``_set_gen_config``: the ``detect`` flag was never reset between
        iterations, so once any genome had a matching wig file, every later
        genome whose fasta/gff names matched was configured even without a
        wig (the sibling ``_compare_ta`` resets its flag, showing the reset
        was intended).
      * typo fixes in user-facing messages ("Runniun", "statistaics",
        "temperary").
    """

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of working files/folders created in the current directory.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        # "tmp" subfolders are produced by Multiparser.parser_* calls.
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Map one colon-split library spec to a dict (condition as int)."""
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3], "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        """Write one config line per condition/replicate.

        Replicates listed in rep_set but missing from a condition get an
        empty assignment so TSSpredator sees a complete matrix.
        """
        for num_id in range(1, lib_num + 1):
            cond_list = [lib for lib in lib_list
                         if num_id == lib["condition"]]
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): relies on cond_sort_list being non-empty here
            # ("cond" from the loop above); empty conditions would raise
            # NameError — TODO confirm callers guarantee this.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        """Invoke the TSSpredator jar, capturing stdout/stderr to files."""
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        """Parse library specs and write the annotation/5' lib config lines.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id); num_id is
        the last loop index (kept for interface compatibility).
        """
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        # typo fix: was "Runniun {0} now..."
        print("Running {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # Replace the generic wig name with the strain-specific file
            # produced by the wig parser (name_STRAIN_<strain>.wig).
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # f* = TEX-treated, n* = untreated; p/m = plus/minus strand.
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            # TSS mode: TEX-treated libraries are the 5'-enriched signal.
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            # Processing-site mode: the untreated libraries are used.
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        # repmatch is a comma-separated string like "lib_2,all_1".
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            # Most common replicate count becomes the global default ...
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            # ... and deviating libraries get per-library overrides.
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # "normal" libraries are the opposite set of the 5' libraries
        # written by _import_lib above.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        """Convert each genome's MasterTable.tsv into a gff file."""
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual
        detected TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            # Find the annotation gff matching this genome prefix.
            # NOTE(review): assumes a match exists; "gff" keeps the value
            # from the break — TODO confirm.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSS with genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            # The validated annotation replaces the original gff.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSS with transcript'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        """Produce per-genome classification/library stats and venn plots."""
        # typo fix: was "statistaics"
        print("Running statistics.....")
        for tss in tsss:
            compare_file = os.path.join(
                self.gff_outfolder, "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss,
                             "_".join(["stat", feature, "class", tss])
                             + ".csv"),
                os.path.join(self.stat_outfolder, tss,
                             "_".join(["stat", feature, "libs", tss])
                             + ".csv"))
            # Plots are written to the CWD and then moved into place.
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        """Generate one config file per genome that has fasta+gff+wig.

        Returns the list of genome prefixes that were configured.
        """
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        # BUG FIX: reset the flag so a genome without any
                        # matching wig file is not configured just because
                        # an earlier genome had one.
                        detect = False
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate matching wig files into tmp/merge_{forward,reverse}.wig."""
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(
                self.tmps["tmp"],
                "_".join([prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete the temporary folders/files created during the run."""
        # typo fix: was "temperary"
        print("Remove temporary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            # "both" means keep overlapping TSS and processing sites as-is.
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix,
                    "_".join(["stat", prefix,
                              "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # Replace the gff with the filtered version.
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
                out.close()

    def run_tsspredator(self, args_tss):
        """Entry point: full TSS/processing-site prediction pipeline."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        # Downstream steps use "processing" as the short program name.
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class TSSpredator(object): def __init__(self, args_tss): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.master = os.path.join(args_tss.out_folder, "MasterTables") self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta": "tmp_tss", "tmp": "tmp"} if args_tss.ta_files is not None: self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp") else: self.tmps["ta"] = None self.gff_path = os.path.join(args_tss.gffs, "tmp") if args_tss.manual is not None: self.manual_path = os.path.join(args_tss.manual, "tmp") self.wig_path = os.path.join(args_tss.wig_folder, "tmp") self.fasta_path = os.path.join(args_tss.fastas, "tmp") self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics") self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs") def _assign_dict(self, lib_datas): return {"wig": lib_datas[0], "tex": lib_datas[1], "condition": int(lib_datas[2]), "replicate": lib_datas[3], "strand": lib_datas[4]} def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set): for num_id in range(1, lib_num+1): cond_list = [] for lib in lib_list: if num_id == lib["condition"]: cond_list.append(lib) cond_sort_list = sorted(cond_list, key=lambda k: k['replicate']) reps = [] for cond in cond_sort_list: out.write("{0}_{1}{2} = {3}\n".format( prefix, cond["condition"], cond["replicate"], os.path.join(wig_folder, cond["wig"]))) reps.append(cond["replicate"]) for rep in sorted(rep_set): if rep not in reps: out.write("{0}_{1}{2} = \n".format( prefix, cond["condition"], rep)) def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log): print("Running TSSpredator for " + prefix) log.write("Make sure the version of TSSpredator is at least 1.06.\n") out = open(os.path.join(out_path, "log.txt"), "w") err = open(os.path.join(out_path, "err.txt"), "w") log.write(" ".join(["java", "-jar", tsspredator_path, config_file]) + "\n") call(["java", "-jar", tsspredator_path, config_file], stdout=out, stderr=err) 
out.close() err.close() log.write("Done!\n") log.write("The following files are generated in {0}:\n".format(out_path)) for file_ in os.listdir(out_path): log.write("\t" + file_ + "\n") def _import_lib(self, libs, wig_folder, project_strain_name, out, gff, program, fasta): lib_dict = {"fp": [], "fm": [], "nm": [], "np": []} lib_num = 0 rep_set = set() list_num_id = [] for lib in libs: lib_datas = lib.split(":") if not lib_datas[0].endswith(".wig"): print("Error: Wiggle files are not end with .wig!") sys.exit() for wig in os.listdir(wig_folder): filename = wig.split("_STRAIN_") if (filename[0] == lib_datas[0][:-4]) and ( filename[1][:-4] == project_strain_name): lib_datas[0] = wig if int(lib_datas[2]) > lib_num: lib_num = int(lib_datas[2]) if lib_datas[3] not in rep_set: rep_set.add(lib_datas[3]) if (lib_datas[1] == "tex") and (lib_datas[4] == "+"): lib_dict["fp"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"): lib_dict["fm"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"): lib_dict["np"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"): lib_dict["nm"].append(self._assign_dict(lib_datas)) for num_id in range(1, lib_num+1): out.write("annotation_{0} = {1}\n".format(num_id, gff)) if program.lower() == "tss": self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "fivePrimePlus", rep_set) elif program.lower() == "ps": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "fivePrimePlus", rep_set) else: print("Error: Wrong program name! 
Please assing tss " "or processing_site.") sys.exit() for num_id in range(1, lib_num+1): out.write("genome_{0} = {1}\n".format(num_id, fasta)) for num_id in range(1, lib_num+1): list_num_id.append(str(num_id)) return lib_num, num_id, rep_set, lib_dict, list_num_id def _print_repmatch(self, args_tss, out): '''check replicate match''' detect_all = False for rep in args_tss.repmatch: if "all" in rep: detect_all = True match = rep.split("_")[-1] out.write("minNumRepMatches = {0}\n".format(match)) break if not detect_all: nums = {} matchs = {} for match in args_tss.repmatch: lib = match.split("_")[0] rep = match.split("_")[-1] matchs[lib] = rep if rep not in nums.keys(): nums[rep] = 1 else: nums[rep] += 1 for rep, num in nums.items(): if num == max(nums.values()): out.write("minNumRepMatches = {0}\n".format(rep)) max_rep = rep break for lib, rep in matchs.items(): if rep != max_rep: out.write("minNumRepMatches_{0} = {1}\n".format( lib, rep)) def _extract_best_para(self, args_tss, prefix, log): detect = False for best_file in os.listdir(args_tss.auto_load): if best_file == "_".join(["best", prefix + ".csv"]): bh = open(os.path.join(args_tss.auto_load, best_file),"r" ) lines = bh.readlines() bh.close() if len(lines[len(lines)-1].split("\t")) < 8: print("Error: some information in {0} is missing. " "It may be due to that \"optimize_tss_ps\" did " "not finish successfully.".format(best_file)) log.write("Error: some information in {0} is missing. " "It may be due to that \"optimize_tss_ps\" did " "not finish successfully.\n".format(best_file)) sys.exit() else: para_info = lines[len(lines)-1].split("\t")[1].split("_") detect_all = all(elem in para_info for elem in ["he", "rh", "fa", "rf", "bh", "ef", "pf"]) if (not detect_all) or (len(para_info) != 14): print("Error: {0} is complete. Some parameters are " "missing!".format(best_file)) log.write("Error: {0} is complete. 
Some parameters " "are missing!\n".format(best_file)) sys.exit() else: detect = True height = para_info[para_info.index("he") + 1] height_reduction = para_info[ para_info.index("rh") + 1] factor = para_info[para_info.index("fa") + 1] factor_reduction = para_info[ para_info.index("rf") + 1] base_height = para_info[ para_info.index("bh") + 1] enrichment_factor = para_info[ para_info.index("ef") + 1] processing_factor = para_info[ para_info.index("pf") + 1] if detect: return height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor else: print("Error: No best_{0}.csv can be found in {1}! ".format( prefix, args_tss.auto_load)) log.write("Error: No best_{0}.csv can be found in {1}\n".format( prefix, args_tss.auto_load)) sys.exit() def _get_input_para(self, args_tss, prefix, log): if args_tss.genome_order is None: height = args_tss.height[0] height_reduction = args_tss.height_reduction[0] factor = args_tss.factor[0] factor_reduction = args_tss.factor_reduction[0] base_height = args_tss.base_height[0] enrichment_factor = args_tss.enrichment_factor[0] processing_factor = args_tss.processing_factor[0] else: if prefix not in args_tss.genome_order: print("Error: the parameters for {0} were not assigned!".format( prefix)) log.write("Error: the parameters for {0} were not assigned!\n".format( prefix)) sys.exit() else: index = args_tss.genome_order.index(prefix) height = args_tss.height[index] height_reduction = args_tss.height_reduction[index] factor = args_tss.factor[index] factor_reduction = args_tss.factor_reduction[index] base_height = args_tss.base_height[index] enrichment_factor = args_tss.enrichment_factor[index] processing_factor = args_tss.processing_factor[index] return height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor def _gen_config(self, project_strain_name, args_tss, gff, wig_folder, fasta, config_file, log): '''generation of config files''' log.write("Generating 
config files for TSSpredator.\n") if args_tss.auto_load is not None: height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor = \ self._extract_best_para(args_tss, project_strain_name, log) else: height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor = \ self._get_input_para(args_tss, project_strain_name, log) master_folder = "MasterTable_" + project_strain_name out_path = os.path.join(self.master, master_folder) self.helper.check_make_folder(out_path) out = open(config_file, "w") out.write("TSSinClusterSelectionMethod = HIGHEST\n") out.write("allowedCompareShift = 1\n") out.write("allowedRepCompareShift = 1\n") lib_num, num_id, rep_set, lib_dict, list_num_id = \ self._import_lib(args_tss.libs, wig_folder, project_strain_name, out, gff, args_tss.program, fasta) out.write("idList = ") out.write(",".join(list_num_id) + "\n") out.write("maxASutrLength = 100\n") out.write("maxGapLengthInGene = 500\n") out.write("maxNormalTo5primeFactor = {0}\n".format( processing_factor)) out.write("maxTSSinClusterDistance = {0}\n".format( args_tss.cluster + 1)) out.write("maxUTRlength = {0}\n".format(args_tss.utr_length)) out.write("min5primeToNormalFactor = {0}\n".format( enrichment_factor)) out.write("minCliffFactor = {0}\n".format(factor)) out.write("minCliffFactorDiscount = {0}\n".format( factor_reduction)) out.write("minCliffHeight = {0}\n".format(height)) out.write("minCliffHeightDiscount = {0}\n".format( height_reduction)) out.write("minNormalHeight = {0}\n".format(base_height)) self._print_repmatch(args_tss, out) out.write("minPlateauLength = 0\n") out.write("mode = cond\n") out.write("normPercentile = 0.9\n") if args_tss.program.lower() == "tss": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "normalPlus", rep_set) else: self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, 
"normalMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "normalPlus", rep_set) out.write("numReplicates = {0}\n".format(len(rep_set))) out.write("numberOfDatasets = {0}\n".format(lib_num)) out.write("outputDirectory = {0}\n".format(out_path)) for prefix_id in range(len(args_tss.output_prefixs)): out.write("outputPrefix_{0} = {1}\n".format( prefix_id + 1, args_tss.output_prefixs[prefix_id])) out.write("projectName = {0}\n".format(project_strain_name)) out.write("superGraphCompatibility = igb\n") out.write("texNormPercentile = 0.5\n") out.write("writeGraphs = 0\n") out.write("writeNocornacFiles = 0\n") log.write("\t" + config_file + " is generated.\n") out.close() def _convert_gff(self, prefixs, args_tss, log): for prefix in prefixs: out_file = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program]) + ".gff") gff_f = open(out_file, "w") out_path = os.path.join(self.master, "_".join([ "MasterTable", prefix])) if "MasterTable.tsv" not in os.listdir(out_path): print("Error: There is not MasterTable file in {0} ".format( out_path)) print("Please check configuration file.") log.write("not MasterTable file is found in {0}\n".format( out_path)) else: if args_tss.program.lower() == "processing": feature = "processing_site" elif args_tss.program.lower() == "tss": feature = "TSS" self.converter.convert_mastertable2gff( os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic", feature, prefix, out_file) log.write("\t" + out_file + "is generated.\n") gff_f.close() def _merge_manual(self, tsss, args_tss): '''if manual detected TSS is provided, it can merge manual detected TSS and TSSpredator predicted TSS''' self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tss"])) for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break filename = "_".join([tss, args_tss.program]) + ".gff" predict = os.path.join(self.gff_outfolder, filename) manual = os.path.join(self.manual_path, tss + ".gff") 
fasta = os.path.join(self.fasta_path, tss + ".fa")
        # (continuation of _merge_manual: merge TSSpredator output with the
        #  manually-curated TSS gff, then collect stats per genome)
        stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
        if os.path.exists(manual):
            print("Merging and classiflying manually-detected "
                  "TSSs for {0}".format(tss))
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
        if os.path.exists(stat_file):
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            # find the annotation gff whose basename matches this genome;
            # NOTE(review): relies on the break leaving `gff` bound — assumes
            # a matching file always exists, verify against callers
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # validate_gff rewrites the annotation; move it back in place
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            # locate the transcript gff for this genome
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                # sorted results replace both the TSS file and transcript file
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
                log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        # Per-genome TSS/PS statistics + venn plots; generated png files are
        # collected from the current working directory.
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        # Collect genome prefixes that have a fasta, a gff and at least one
        # wig file.
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            run = False  # NOTE(review): unused local — looks vestigial
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        # assumes every wig name contains "_STRAIN_" —
                        # TODO confirm; otherwise filename[1] raises IndexError
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    # NOTE(review): `detect` is never reset to False, so once
                    # one genome matches, every later prefix is appended even
                    # without a wig — verify whether this is intended
                    if detect:
                        prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        # Concatenate all forward/reverse wig tracks of `prefix` into
        # tmp/merge_forward.wig and tmp/merge_reverse.wig.
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        # Clean up temporary artifacts created during the run.
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        # NOTE(review): this deletes the caller-supplied wig/manual folders
        # outright, not just their tmp subfolders — confirm this is intended
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # replace the gff with the filtered version
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): `out` is unbound (NameError) when no gff matched —
        # only the last opened handle is closed here; verify
        out.close()

    def run_tsspredator(self, args_tss, log):
        # Main driver: generate configs, run TSSpredator per genome, convert
        # master tables to gff, then post-process (orphans, low expression,
        # manual merge, overlap, statistics, validation, comparison).
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder, None,
                                     args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                           None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None,
                                         args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
class TestMultiparser(unittest.TestCase):
    """Exercise Multiparser's combine_* and parser_* helpers on real files.

    Each test builds a tiny fixture tree under ref_folder/tar_folder and
    checks that the expected merged/split files appear on disk.
    """

    def setUp(self):
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        if not os.path.exists(self.ref_folder):
            os.mkdir(self.ref_folder)
        self.tar_folder = "tar_folder"
        if not os.path.exists(self.tar_folder):
            os.mkdir(self.tar_folder)

    def tearDown(self):
        # Remove both fixture trees regardless of which tests ran.
        for folder in (self.ref_folder, self.tar_folder):
            if os.path.exists(folder):
                shutil.rmtree(folder)

    def _write(self, path, content):
        # Fixture helper: create `path` containing `content`.
        with open(path, "w") as handle:
            handle.write(content)

    def test_combine_fasta(self):
        tar_tmp = os.path.join(self.tar_folder, "tmp")
        ref_tmp = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(ref_tmp)
        os.mkdir(tar_tmp)
        self._write(os.path.join(tar_tmp, "aaa.fa"), self.example.sub_fasta1)
        self._write(os.path.join(tar_tmp, "bbb.fa"), self.example.sub_fasta2)
        self._write(os.path.join(ref_tmp, "aaa.gff"), self.example.sub_gff1)
        self._write(os.path.join(ref_tmp, "bbb.gff"), self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tar_tmp, None)
        self.assertTrue(os.path.exists(os.path.join(tar_tmp, "test.fa")))

    def test_combine_wig(self):
        tar_tmp = os.path.join(self.tar_folder, "tmp")
        ref_tmp = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(ref_tmp)
        os.mkdir(tar_tmp)
        self._write(os.path.join(ref_tmp, "aaa.fa"), self.example.sub_fasta1)
        self._write(os.path.join(ref_tmp, "bbb.fa"), self.example.sub_fasta2)
        wig_names = ["test_forward.wig_STRAIN_aaa.wig",
                     "test_forward.wig_STRAIN_bbb.wig",
                     "test_reverse.wig_STRAIN_aaa.wig",
                     "test_reverse.wig_STRAIN_bbb.wig"]
        wig_contents = [self.example.sub_f_wig1, self.example.sub_f_wig2,
                        self.example.sub_r_wig1, self.example.sub_r_wig2]
        for name, content in zip(wig_names, wig_contents):
            self._write(os.path.join(tar_tmp, name), content)
        libs = ["test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
                "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"]
        self.multiparser.combine_wig(self.ref_folder, tar_tmp, "fasta", libs)
        self.assertTrue(
            os.path.exists(os.path.join(tar_tmp, "test_forward.wig")))
        self.assertTrue(
            os.path.exists(os.path.join(tar_tmp, "test_reverse.wig")))

    def test_combine_gff(self):
        tar_tmp = os.path.join(self.tar_folder, "tmp")
        ref_tmp = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(ref_tmp)
        os.mkdir(tar_tmp)
        self._write(os.path.join(ref_tmp, "aaa.fa"), self.example.sub_fasta1)
        self._write(os.path.join(ref_tmp, "bbb.fa"), self.example.sub_fasta2)
        self._write(os.path.join(tar_tmp, "aaa.gff"), self.example.sub_gff1)
        self._write(os.path.join(tar_tmp, "bbb.gff"), self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tar_tmp, "fasta", None)
        self.assertTrue(os.path.exists(os.path.join(tar_tmp, "test.gff")))

    def test_parser_fasta(self):
        self._write(os.path.join(self.ref_folder, "test.fa"),
                    self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        for expected in ("tmp/aaa.fa", "tmp/bbb.fa",
                         "test.fa_folder/aaa.fa", "test.fa_folder/bbb.fa"):
            self.assertTrue(
                os.path.exists(os.path.join(self.ref_folder, expected)))

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        self._write(gff_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        for expected in ("tmp/aaa.gff", "tmp/bbb.gff",
                         "test.gff_folder/aaa.gff", "test.gff_folder/bbb.gff"):
            self.assertTrue(
                os.path.exists(os.path.join(self.ref_folder, expected)))
        # Re-run with a feature suffix: the per-entry files gain "_TSS".
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        self._write(tss_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        for expected in ("tmp/aaa_TSS.gff", "tmp/bbb_TSS.gff",
                         "test_TSS.gff_folder/aaa_TSS.gff",
                         "test_TSS.gff_folder/bbb_TSS.gff"):
            self.assertTrue(
                os.path.exists(os.path.join(self.ref_folder, expected)))

    def test_parser_wig(self):
        self._write(os.path.join(self.ref_folder, "test_forward.wig"),
                    self.example.wig_f_file)
        self._write(os.path.join(self.ref_folder, "test_reverse.wig"),
                    self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        for expected in (
                "tmp/test_forward_STRAIN_aaa.wig",
                "tmp/test_forward_STRAIN_bbb.wig",
                "tmp/test_reverse_STRAIN_aaa.wig",
                "tmp/test_reverse_STRAIN_bbb.wig",
                "test_forward.wig_folder/test_forward_STRAIN_aaa.wig",
                "test_forward.wig_folder/test_forward_STRAIN_bbb.wig",
                "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig",
                "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig"):
            self.assertTrue(
                os.path.exists(os.path.join(self.ref_folder, expected)))
# NOTE(review): this is a second definition of TestMultiparser in the same
# module; at import time it shadows the earlier identical class, so only one
# copy is ever discovered by the test runner. Confirm whether the duplicate
# should be removed.
class TestMultiparser(unittest.TestCase):
    """Filesystem-level tests for Multiparser combine_*/parser_* helpers."""

    def setUp(self):
        # Fresh Multiparser under test plus canned file contents.
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        if (not os.path.exists(self.ref_folder)):
            os.mkdir(self.ref_folder)
        self.tar_folder = "tar_folder"
        if (not os.path.exists(self.tar_folder)):
            os.mkdir(self.tar_folder)

    def tearDown(self):
        # Drop both fixture trees created in setUp.
        if os.path.exists(self.ref_folder):
            shutil.rmtree(self.ref_folder)
        if os.path.exists(self.tar_folder):
            shutil.rmtree(self.tar_folder)

    def test_combine_fasta(self):
        # combine_fasta should merge per-entry fastas back into test.fa.
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_tar, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_tar, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_ref, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_ref, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.fa")))

    def test_combine_wig(self):
        # combine_wig should merge per-strain wig tracks per strand.
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_wig1 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_aaa.wig")
        sub_wig2 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_bbb.wig")
        sub_wig3 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_aaa.wig")
        sub_wig4 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_bbb.wig")
        wig_files = [sub_wig1, sub_wig2, sub_wig3, sub_wig4]
        example_wigs = [self.example.sub_f_wig1, self.example.sub_f_wig2,
                        self.example.sub_r_wig1, self.example.sub_r_wig2]
        for index in range(0, 4):
            with open(wig_files[index], "w") as fh:
                fh.write(example_wigs[index])
        libs = ["test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
                "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta", libs)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar,
                                                    "test_forward.wig")))
        self.assertTrue(os.path.exists(os.path.join(tmp_tar,
                                                    "test_reverse.wig")))

    def test_combine_gff(self):
        # combine_gff should merge per-entry gffs back into test.gff.
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_tar, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_tar, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta", None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.gff")))

    def test_parser_fasta(self):
        # parser_fasta splits a multi-entry fasta into per-entry files.
        fasta_file = os.path.join(self.ref_folder, "test.fa")
        with open(fasta_file, "w") as rh:
            rh.write(self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder,
                                                    "tmp/aaa.fa")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder,
                                                    "tmp/bbb.fa")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "test.fa_folder/aaa.fa")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "test.fa_folder/bbb.fa")))

    def test_parser_gff(self):
        # parser_gff splits by seq-id; with a feature suffix the split
        # files gain that suffix (e.g. aaa_TSS.gff).
        gff_file = os.path.join(self.ref_folder, "test.gff")
        with open(gff_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder,
                                                    "tmp/aaa.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder,
                                                    "tmp/bbb.gff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "test.gff_folder/aaa.gff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "test.gff_folder/bbb.gff")))
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        # NOTE(review): the renamed file is immediately overwritten with the
        # same content below — the rename looks redundant; confirm intent.
        os.rename(gff_file, tss_file)
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        with open(tss_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder,
                                                    "tmp/aaa_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder,
                                                    "tmp/bbb_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "test_TSS.gff_folder/aaa_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "test_TSS.gff_folder/bbb_TSS.gff")))

    def test_parser_wig(self):
        # parser_wig splits each wig track per strain into *_STRAIN_* files.
        wig_f_file = os.path.join(self.ref_folder, "test_forward.wig")
        with open(wig_f_file, "w") as rh:
            rh.write(self.example.wig_f_file)
        wig_r_file = os.path.join(self.ref_folder, "test_reverse.wig")
        with open(wig_r_file, "w") as rh:
            rh.write(self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "tmp/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "tmp/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "tmp/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(os.path.join(
            self.ref_folder, "tmp/test_reverse_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder,
                         "test_forward.wig_folder/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder,
                         "test_forward.wig_folder/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder,
                         "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder,
                         "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig")))
class TSSpredator(object):
    """Wrapper that drives the external TSSpredator tool and post-processing.

    NOTE(review): this class duplicates method bodies that also appear
    earlier in this file; confirm whether one copy should be removed.
    """

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # folder for TSSpredator master tables
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # names of the scratch files/folders used throughout the run
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        # NOTE(review): manual_path is only defined when manual gffs are
        # given; later code guards on args_tss.manual before using it
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        # Map one "wig:tex:cond:rep:strand" library entry to a dict.
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3], "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix,
                   rep_set):
        # Write one config line per (condition, replicate); replicates that
        # have no wig in this condition get an empty assignment.
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): `cond` leaks from the loop above; empty
            # cond_sort_list would leave it unbound — verify inputs
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path,
                      prefix, log):
        # Invoke the TSSpredator jar; stdout/stderr are captured per genome.
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        # Parse library strings into tex/notex, plus/minus buckets and emit
        # the annotation/five-prime wig lines of the config file.
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # swap in the strain-specific wig filename for this genome
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        # "all_N" applies one minNumRepMatches to every library; otherwise
        # the most common per-library value is the global default and
        # deviating libraries get individual entries.
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file, log):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder,
                             project_strain_name, out, gff,
                             args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # "normal" tracks are the opposite treatment of the five-prime ones
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        # Convert each genome's MasterTable.tsv into a gff in gff_outfolder.
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                    out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                    out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected
        TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual,
                    fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss,
                                                      "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
                log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        # Per-genome classification stats and venn diagrams.
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        # Find genomes with fasta+gff+wig and write one config per genome.
        prefixs = []
        detect = False
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            run = False  # NOTE(review): unused local — looks vestigial
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    # NOTE(review): `detect` is never reset between genomes;
                    # verify whether later prefixes without wigs should match
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config,
                            log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        # Concatenate forward/reverse wig tracks of `prefix` into tmp/.
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder,
                                                    wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder,
                                                    wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        # Clean up temporary artifacts created during the run.
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        # NOTE(review): deletes the caller-supplied wig/manual folders —
        # confirm this is intended rather than only their tmp subfolders
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): `out` is unbound when no gff matched, and only the
        # last handle is closed — verify
        out.close()

    def run_tsspredator(self, args_tss, log):
        # Main driver: configs -> TSSpredator -> gff conversion ->
        # post-processing (continues past the end of this chunk).
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path,
                                           "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs,
self.gff_outfolder, None, args_tss.program) datas = [] for gff in os.listdir(self.gff_outfolder): if gff.endswith(".gff"): gff_folder = gff.replace("".join(["_", args_tss.program, ".gff"]), "") self.helper.check_make_folder( os.path.join(self.stat_outfolder, gff_folder)) datas.append(gff_folder) if args_tss.remove_low_expression is not None: log.write("Running filter_low_expression.py to filter out " "low expressed TSS/PS.\n") self._low_expression(args_tss, self.gff_outfolder) if args_tss.manual is not None: self.multiparser.parser_gff(args_tss.manual, None) self.multiparser.combine_gff(args_tss.gffs, self.manual_path, None, None) self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path, None) self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None, args_tss.libs) log.write("Running merge_manual.py to merge the manual TSSs.\n") self._merge_manual(datas, args_tss) log.write("Running filter_TSS_pro.py to deal with the overlap " "position between TSS and PS.\n") self._deal_with_overlap(self.gff_outfolder, args_tss) log.write("Running stat_TSSpredator.py to do statistics.\n") self._stat_tss(datas, args_tss.program, log) if args_tss.validate: self._validate(datas, args_tss, log) if args_tss.ta_files is not None: self._compare_ta(datas, args_tss, log) self._remove_files(args_tss)
class TSSpredator(object):
    # NOTE(review): this appears to be an older duplicate of the class above
    # (same method names, no `log` parameters, slightly different strings and
    # cleanup helpers). Tokens are kept byte-identical; only documentation
    # was added.

    def __init__(self, args_tss):
        # Project-local helpers (Multiparser/Helper/Converter are defined
        # elsewhere in this package — assumed, not visible here).
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # names of the temporary files/folders used throughout the run
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''Turn one colon-split library spec into a dict
        (wig filename, tex/notex flag, condition, replicate, strand).'''
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]), "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        '''Write "<prefix>_<cond><rep> = <wig path>" config lines, grouped
        by condition and sorted by replicate.'''
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        '''Launch the external TSSpredator jar for one genome, capturing
        stdout/stderr into log.txt/err.txt under *out_path*.'''
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        '''Parse library specs, bucket them by tex/notex and strand, and
        write the annotation/fivePrime/genome sections of the config.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id).
        '''
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))  # (sic) runtime output
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # substitute the generic wig name with the strain-specific file
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # TSS mode uses tex libraries as the 5' signal; processing-site mode
        # uses notex libraries instead.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus")
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file):
        '''Write a complete TSSpredator .ini config for one genome.'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # the "normal" (background) tracks are the opposite library type of
        # the 5' tracks written by _import_lib
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus")
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus")
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        '''Convert each genome's MasterTable.tsv to a gff in gff_outfolder.'''
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            # NOTE(review): this handle is opened (truncating out_file) and
            # closed without being written; the converter writes out_file
            # itself. Likely only serves to create/empty the file.
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", args_tss.program, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''Merge manually curated TSSs with the predicted ones and collect
        the comparison statistics per genome.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # find the annotation gff matching this genome prefix;
            # `gff` is reused after the break (assumes a match exists)
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(args_tss.out_folder,
                        "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''Validate predictions against the genome annotation and replace
        the annotation gff with the UTR-annotated version.'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            # NOTE(review): compare_file is unbound if program is neither
            # "tss" nor "processing" — assumed callers guarantee one of them.
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''Compare TSSs with assembled transcripts (older variant of the
        method of the same name above; no logging).'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_Transcriptome_assembly_",
                    tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        '''Per-genome statistics and venn plots (older, non-logging
        variant).'''
        print("Running statistaics.....")  # (sic) runtime output
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        '''Pair fasta/gff/wig by genome prefix and generate one config per
        genome; returns the prefixes handled.'''
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    # NOTE(review): `detect` is never reset (same pattern as
                    # the newer class above).
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''Merge forward/reverse strand wigs into tmp/merge_*.wig by
        library name + prefix match.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''Re-classify orphan TSSs with merged coverage, replacing each
        prediction gff in place.'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''Delete temporary artifacts (uses remove_tmp here, unlike the
        newer class which uses remove_tmp_dir).'''
        print("Remove temperary files and folders...")  # (sic)
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''Filter positions where a TSS and a processing site overlap,
        unless overlap_feature is "both" (keep both).'''
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''Filter low-expressed TSSs/PSs, writing the cutoff used per genome
        and replacing each gff with the filtered version.'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): `out` unbound if no gff matched — would NameError.
        out.close()

    def run_tsspredator(self, args_tss):
        '''Top-level driver (older, non-logging variant): configs, runs,
        conversion, optional post-processing, cleanup.'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # normalize the program name for downstream filename construction
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)