예제 #1
0
class ArgsContainer(object):

    def __init__(self):
        self.multiparser = Multiparser()
        self.helper = Helper()

    def _check_replicates(self, replicates_tex, replicates_frag):
        if (replicates_tex is not None) and (replicates_frag is not None):
            replicates = {"tex": int(replicates_tex),
                          "frag": int(replicates_frag)}
        elif replicates_tex is not None:
            replicates = {"tex": int(replicates_tex), "frag": -1}
        elif replicates_frag is not None:
            replicates = {"tex": -1, "frag": int(replicates_frag)}
        else:
            print("Error:No replicates number assign!!!")
            sys.exit()
        return replicates

    def _check_libs(self, tex_notex_libs, frag_libs):
        if (tex_notex_libs is None) and (frag_libs is None):
            print("Error: please input proper libraries!!")
        if (tex_notex_libs is not None) and (frag_libs is not None):
            libs = tex_notex_libs + frag_libs
        elif (tex_notex_libs is not None):
            libs = tex_notex_libs
        elif (frag_libs is not None):
            libs = frag_libs
        return libs

    def _parser_combine_wigs(self, subcommand):
        self.tex_path = None
        self.frag_path = None
        self.multiparser.parser_gff(self.gffs, None)
        if subcommand == "terminator":
            gff_path = os.path.join(self.gffs, "tmp")
            self.multiparser.parser_gff(gff_path, None)
        else:
            gff_path = self.gffs
        if self.tex_wigs is not None:
            self.tex_path = os.path.join(self.tex_wigs, "tmp")
            self.multiparser.parser_wig(self.tex_wigs)
            self.multiparser.combine_wig(gff_path, self.tex_path,
                                         None, self.libs)
            self.merge_wigs = self.tex_wigs
            self.wig_path = self.tex_path
        if self.frag_wigs is not None:
            self.frag_path = os.path.join(self.frag_wigs, "tmp")
            self.multiparser.parser_wig(self.frag_wigs)
            self.multiparser.combine_wig(gff_path, self.frag_path,
                                         None, self.libs)
            self.merge_wigs = self.frag_wigs
            self.wig_path = self.frag_path
        if (self.tex_path is not None) and (
                self.frag_path is not None):
            self = self._merge_wig()
        if (self.tex_path is None) and (
                self.frag_path is None):
            print("Error: There is no proper wig files assigned!!")
            sys.exit()
        return self

    def _merge_wig(self):
        self.merge_wigs = os.path.join(self.out_folder, "merge_wigs")
        if (self.tex_wigs is not None) and (
                self.frag_wigs is not None):
            self.helper.check_make_folder(self.merge_wigs)
            self.wig_path = os.path.join(self.merge_wigs, "tmp")
            self.helper.check_make_folder(self.wig_path)
            for wig in os.listdir(self.tex_wigs):
                if os.path.isfile(os.path.join(self.tex_wigs, wig)):
                    shutil.copy(os.path.join(self.tex_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.frag_wigs):
                if os.path.isfile(os.path.join(self.frag_wigs, wig)):
                    shutil.copy(os.path.join(self.frag_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.tex_path):
                if os.path.isfile(os.path.join(self.tex_path, wig)):
                    shutil.copy(os.path.join(self.tex_path, wig),
                                self.wig_path)
            for wig in os.listdir(self.frag_path):
                if os.path.isfile(os.path.join(self.frag_path, wig)):
                    self.helper.merge_file(os.path.join(self.frag_path, wig),
                                           os.path.join(self.wig_path, wig))
        elif (self.tex_wigs is not None):
            self.merge_wigs = self.tex_wigs
        elif (self.frag_wigs is not None):
            self.merge_wigs = self.frag_wigs
        return self

    def _deal_multi_inputs(self, inputs, file_type, num, command):
        if inputs is not None:
            datas = inputs.split(",")
            if num is not None:
                if (len(datas) != num):
                    print("Error: the amount of {0} is not correct!!".format(
                        command))
            new_inputs = []
            for data in datas:
                if file_type == "float":
                    new_inputs.append(float(data.strip()))
                elif file_type == "int":
                    new_inputs.append(int(data.strip()))
                else:
                    new_inputs.append(data)
            return new_inputs
        else:
            return inputs

    def container_ratt(self, ratt_path, element, transfer_type,
                       ref_embl_gbk, target_fasta, ref_fasta, ratt_folder,
                       convert_to_gff_rnt_ptt, tar_annotation_folder,
                       compare_pair):
        self.ratt_path = ratt_path
        self.element = element
        self.transfer_type = transfer_type
        self.ref_embls = ref_embl_gbk
        self.tar_fastas = target_fasta
        self.ref_fastas = ref_fasta
        self.output_path = ratt_folder
        self.convert = convert_to_gff_rnt_ptt
        self.gff_outfolder = tar_annotation_folder
        self.pairs = self._deal_multi_inputs(compare_pair, "str", None, None)
        return self

    def container_tsspredator(self, TSSpredator_path, compute_program,
                              fasta_folder, annotation_folder, wig_folder, lib,
                              output_prefix, height, height_reduction, factor,
                              factor_reduction, base_height, enrichment_factor,
                              processing_factor, replicate_match, out_folder,
                              statistics, validate_gene, merge_manual,
                              compare_transcript_assembly, fuzzy, utr_length,
                              cluster, length, re_check_orphan,
                              overlap_feature, reference_gff_folder,
                              remove_low_expression):
        self.tsspredator_path = TSSpredator_path
        self.program = compute_program
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.wig_folder = wig_folder
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.output_prefixs = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.height = height
        self.height_reduction = height_reduction
        self.factor = factor
        self.factor_reduction = factor_reduction
        self.base_height = base_height
        self.enrichment_factor = enrichment_factor
        self.processing_factor = processing_factor
        self.repmatch = replicate_match
        self.out_folder = out_folder
        self.stat = statistics
        self.validate = validate_gene
        self.manual = merge_manual
        self.ta_files = compare_transcript_assembly
        self.fuzzy = fuzzy
        self.utr_length = utr_length
        self.cluster = cluster
        self.nt_length = length
        self.check_orphan = re_check_orphan
        self.overlap_feature = overlap_feature
        self.references = reference_gff_folder
        self.remove_low_expression = remove_low_expression
        return self

    def container_optimize(self, TSSpredator_path, fasta_file, annotation_file,
                           wig_folder, manual, out_folder, strain_name,
                           max_height, max_height_reduction, max_factor,
                           max_factor_reduction, max_base_height,
                           max_enrichment_factor, max_processing_factor,
                           utr_length, lib, output_prefix, cluster, length,
                           core, program, replicate_match, steps):
        self.tsspredator_path = TSSpredator_path
        self.fastas = fasta_file
        self.gffs = annotation_file
        self.wigs = wig_folder
        self.manual = manual
        self.output_folder = out_folder
        self.project_strain = strain_name
        self.height = max_height
        self.height_reduction = max_height_reduction
        self.factor = max_factor
        self.factor_reduction = max_factor_reduction
        self.base_height = max_base_height
        self.enrichment = max_enrichment_factor
        self.processing = max_processing_factor
        self.utr = utr_length
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.replicate_name = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.cluster = cluster
        self.length = length
        self.cores = core
        self.program = program
        self.replicate = replicate_match
        self.steps = steps
        return self

    def container_terminator(
            self, TransTermHP_path, expterm_path, RNAfold_path, out_folder,
            fasta_folder, annotation_folder, transcript_folder, srna,
            statistics, tex_wig_folder, frag_wig_folder, decrease,
            highest_coverage, fuzzy_detect_coverage, fuzzy_within_transcript,
            fuzzy_downstream_transcript, fuzzy_within_gene,
            fuzzy_downstream_gene, transtermhp_folder, tex_notex_libs,
            frag_libs, tex_notex, replicates_tex, replicates_frag, table_best,
            min_loop_length, max_loop_length, min_stem_length, max_stem_length,
            min_AT_tail_length, miss_rate, range_u):
        self.TransTermHP_path = TransTermHP_path
        self.expterm_path = expterm_path
        self.RNAfold_path = RNAfold_path
        self.out_folder = out_folder
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.srnas = srna
        self.stat = statistics
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.decrease = decrease
        self.cutoff_coverage = highest_coverage
        self.fuzzy = fuzzy_detect_coverage
        self.fuzzy_up_ta = fuzzy_within_transcript
        self.fuzzy_down_ta = fuzzy_downstream_transcript
        self.fuzzy_up_gene = fuzzy_within_gene
        self.fuzzy_down_gene = fuzzy_downstream_gene
        self.hp_folder = transtermhp_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.table_best = table_best
        self.min_loop = min_loop_length
        self.max_loop = max_loop_length
        self.min_stem = min_stem_length
        self.max_stem = max_stem_length
        self.at_tail = min_AT_tail_length
        self.miss_rate = miss_rate
        self.range_u = range_u
        self = self._parser_combine_wigs("terminator")
        return self

    def container_transcript(
            self, frag_wig_path, tex_wig_path, tex_notex, length,
            annotation_folder, height, width, tolerance, tolerance_coverage,
            replicates_tex, replicates_frag, transcript_assembly_output_folder,
            compare_TSS, compare_genome_annotation, TSS_fuzzy,
            tex_treated_libs, fragmented_libs, compare_feature_genome,
            table_best, terminator_folder, fuzzy_term):
        self.frag_wigs = frag_wig_path
        self.tex_wigs = tex_wig_path
        self.tex = tex_notex
        self.length = length
        self.gffs = annotation_folder
        self.height = height
        self.width = width
        self.tolerance = tolerance
        self.low_cutoff = tolerance_coverage
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.out_folder = transcript_assembly_output_folder
        self.compare_tss = compare_TSS
        self.compare_cds = compare_genome_annotation
        self.fuzzy = TSS_fuzzy
        self.tlibs = self._deal_multi_inputs(tex_treated_libs, "str", None,
                                             None)
        self.flibs = self._deal_multi_inputs(fragmented_libs, "str", None,
                                             None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.c_feature = self._deal_multi_inputs(compare_feature_genome, "str",
                                                 None, None)
        self.table_best = table_best
        self.terms = terminator_folder
        self.fuzzy_term = fuzzy_term
        self = self._parser_combine_wigs("transcript")
        return self

    def container_utr(self, tss_folder, annotation_folder,
                      transcript_assembly_folder, terminator_folder,
                      terminator_fuzzy, utr_folder, tss_source, base_5utr,
                      length, base_3utr):
        self.tsss = tss_folder
        self.gffs = annotation_folder
        self.trans = transcript_assembly_folder
        self.terms = terminator_folder
        self.fuzzy = terminator_fuzzy
        self.out_folder = utr_folder
        self.source = tss_source
        self.base_5utr = base_5utr
        self.base_3utr = base_3utr
        self.length = length
        return self

    def container_srna(
            self, Vienna_folder, Vienna_utils, blast_plus_folder,
            ps2pdf14_path, srna_folder, UTR_derived_sRNA, annotation_folder,
            TSS_folder, transcript_assembly_folder, TSS_intergenic_fuzzy,
            TSS_5UTR_fuzzy, TSS_3UTR_fuzzy, TSS_interCDS_fuzzy, import_info,
            tex_wig_folder, frag_wig_folder, processing_site_folder,
            fasta_folder, mountain_plot, nr_format, srna_format,
            sRNA_database_path, nr_database_path, cutoff_energy,
            run_intergenic_TEX_coverage, run_intergenic_noTEX_coverage,
            run_intergenic_fragmented_coverage, run_antisense_TEX_coverage,
            run_antisense_noTEX_coverage, run_antisense_fragmented_coverage,
            intergenic_tolerance, run_utr_TEX_coverage, run_utr_noTEX_coverage,
            run_utr_fragmented_coverage, max_length, min_length,
            tex_notex_libs, frag_libs, replicates_tex, replicates_frag,
            tex_notex, blast_e_nr, blast_e_srna, detect_sRNA_in_CDS,
            table_best, decrease_intergenic, decrease_utr, fuzzy_intergenic,
            fuzzy_utr, cutoff_nr_hit, sORF, best_with_all_sRNAhit,
            best_without_sORF_candidate, overlap_percent_CDS,
            terminator_folder, terminator_fuzzy_in_CDS,
            terminator_fuzzy_out_CDS, best_with_terminator,
            ignore_hypothetical_protein, TSS_source, min_utr_coverage,
            promoter_table, best_with_promoter, ranking_promoter,
            promoter_name):
        self.vienna_path = Vienna_folder
        self.vienna_util = Vienna_utils
        self.blast_path = blast_plus_folder
        self.ps2pdf14_path = ps2pdf14_path
        self.out_folder = srna_folder
        self.utr_srna = UTR_derived_sRNA
        self.gffs = annotation_folder
        self.tss_folder = TSS_folder
        self.trans = transcript_assembly_folder
        self.fuzzy_inter_tss = TSS_intergenic_fuzzy
        self.fuzzy_5utr_tss = TSS_5UTR_fuzzy
        self.fuzzy_3utr_tss = TSS_3UTR_fuzzy
        self.fuzzy_intercds_tss = TSS_interCDS_fuzzy
        self.fuzzy_tsss = {"5utr": self.fuzzy_5utr_tss,
                           "3utr": self.fuzzy_3utr_tss,
                           "interCDS": self.fuzzy_intercds_tss,
                           "inter": self.fuzzy_inter_tss}
        self.import_info = self._deal_multi_inputs(import_info, "str",
                                                   None, None)
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.pro_folder = processing_site_folder
        self.fastas = fasta_folder
        self.mountain = mountain_plot
        self.nr_format = nr_format
        self.srna_format = srna_format
        self.srna_database = sRNA_database_path
        self.nr_database = nr_database_path
        self.energy = cutoff_energy
        self.coverage_tex = self._deal_multi_inputs(
                run_intergenic_TEX_coverage, "float", 5,
                "--run_intergenic_TEX_coverage")
        self.coverage_notex = self._deal_multi_inputs(
                run_intergenic_noTEX_coverage, "float", 5,
                "--run_intergenic_noTEX_coverage")
        self.coverage_frag = self._deal_multi_inputs(
                run_intergenic_fragmented_coverage, "float", 5,
                "--run_intergenic_fragmented_coverage")
        self.anti_cover_tex = self._deal_multi_inputs(
                run_antisense_TEX_coverage, "float", 5,
                "--run_antisense_TEX_coverage")
        self.anti_cover_notex = self._deal_multi_inputs(
                run_antisense_noTEX_coverage, "float", 5,
                "--run_antisense_noTEX_coverage")
        self.anti_cover_frag = self._deal_multi_inputs(
                run_antisense_fragmented_coverage, "float", 5,
                "--run_antisense_fragmented_coverage")
        self.tolerance = intergenic_tolerance
        self.utr_tex_cover = self._deal_multi_inputs(
                run_utr_TEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        self.utr_notex_cover = self._deal_multi_inputs(
                run_utr_noTEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        self.utr_frag_cover = self._deal_multi_inputs(
                run_utr_fragmented_coverage, "str", 3,
                "--run_utr_fragmented_coverage")
        self.max_len = max_length
        self.min_len = min_length
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.tex_notex = tex_notex
        self.e_nr = blast_e_nr
        self.e_srna = blast_e_srna
        self.in_cds = detect_sRNA_in_CDS
        self.table_best = table_best
        self.decrease_inter = decrease_intergenic
        self.decrease_utr = decrease_utr
        self.fuzzy_inter = fuzzy_intergenic
        self.fuzzy_utr = fuzzy_utr
        self.nr_hits_num = cutoff_nr_hit
        self.sorf_file = sORF
        self.all_hit = best_with_all_sRNAhit
        self.best_sorf = best_without_sORF_candidate
        self.cutoff_overlap = overlap_percent_CDS
        self.terms = terminator_folder
        self.fuzzy_b = terminator_fuzzy_in_CDS
        self.fuzzy_a = terminator_fuzzy_out_CDS
        self.best_term = best_with_terminator
        self.hypo = ignore_hypothetical_protein
        self.tss_source = TSS_source
        self.min_utr = min_utr_coverage
        self.promoter_table = promoter_table
        self.best_promoter = best_with_promoter
        if ranking_promoter < 1:
            print("Error: --ranking_time_promoter must larger than 1...")
            sys.exit()
        self.rank_promoter = ranking_promoter
        self.promoter_name = self._deal_multi_inputs(promoter_name, "str",
                                                     None, None)
        self = self._parser_combine_wigs("srna")
        return self

    def container_intersrna(self, file_type, files, args_srna, prefix,
                            gff_file, tran_file, tss_file, pro_file, fuzzy):
        args_srna.file_type = file_type
        args_srna.gff_file = gff_file
        args_srna.tran_file = tran_file
        args_srna.tss_file = tss_file
        args_srna.pro_file = pro_file
        args_srna.fuzzy = fuzzy
        args_srna.prefix = prefix
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.cutoffs = args_srna.coverage_frag
            args_srna.tss_source = True
            args_srna.cut_notex = None
            args_srna.anti_notex_cutoff = None
        else:
            args_srna.wig_f_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.cutoffs = args_srna.coverage_tex
            args_srna.tss_source = args_srna.tss_source
            args_srna.cut_notex = args_srna.coverage_notex
            args_srna.anti_notex_cutoff = args_srna.anti_cover_notex
        return args_srna

    def container_utrsrna(self, gff, tran, tss, files, pro, fasta, file_type,
                          prefix, args_srna):
        args_srna.file_type = file_type
        args_srna.gff_file = gff
        args_srna.ta_file = tran
        args_srna.tss_file = tss
        args_srna.pro_file = pro
        args_srna.prefix = prefix
        args_srna.seq_file = fasta
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.utr_coverages = args_srna.utr_frag_cover
            args_srna.notex = None
        else:
            args_srna.wig_f_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                    args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.utr_coverages = args_srna.utr_tex_cover
            args_srna.notex = args_srna.utr_notex_cover
        args_srna.coverages = {"5utr": args_srna.utr_coverages[0],
                               "3utr": args_srna.utr_coverages[1],
                               "interCDS": args_srna.utr_coverages[2]}
        if args_srna.notex is not None:
            args_srna.cover_notex = {"5utr": args_srna.notex[0],
                                     "3utr": args_srna.notex[1],
                                     "interCDS": args_srna.notex[2]}
        else:
            args_srna.cover_notex = None
        return args_srna

    def extend_inter_container(self, args_srna, tsss, pros, wigs_f, wigs_r,
                               nums, output, out_table, texs, detects,
                               cutoff_coverage, notex):
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wigs_f = wigs_f
        args_srna.wigs_r = wigs_r
        args_srna.nums = nums
        args_srna.output = output
        args_srna.out_table = out_table
        args_srna.texs = texs
        args_srna.detects = detects
        args_srna.cutoff_coverage = cutoff_coverage
        args_srna.notex = notex
        return args_srna

    def extend_utr_container(self, args_srna, cdss, tsss, pros, wig_fs, wig_rs,
                             out, out_t, texs):
        args_srna.cdss = cdss
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wig_fs = wig_fs
        args_srna.wig_rs = wig_rs
        args_srna.out = out
        args_srna.out_t = out_t
        args_srna.texs = texs
        args_srna.utrs = []
        args_srna.srnas = []
        return args_srna

    def container_sorf(self, sorf_folder, UTR_derived_sORF, transcript_folder,
                       annotation_folder, TSS_folder, utr_length, min_length,
                       max_length, tex_wig_folder, frag_wig_folder,
                       cutoff_intergenic_coverage, cutoff_antisense_coverage,
                       cutoff_5utr_coverage, cutoff_3utr_coverage,
                       cutoff_interCDS_coverage, fasta_folder, tex_notex_libs,
                       frag_libs, tex_notex, replicates_tex, replicates_frag,
                       table_best, sRNA_folder, start_codon, stop_codon,
                       cutoff_background, fuzzy_rbs, rbs_not_after_TSS,
                       print_all_combination, best_no_sRNA, best_no_TSS,
                       ignore_hypothetical_protein, min_rbs_distance,
                       max_rbs_distance):
        self.out_folder = sorf_folder
        self.utr_detect = UTR_derived_sORF
        self.trans = transcript_folder
        self.gffs = annotation_folder
        self.tsss = TSS_folder
        self.utr_length = utr_length
        self.min_len = min_length
        self.max_len = max_length
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.cutoff_inter = cutoff_intergenic_coverage
        self.cutoff_anti = cutoff_antisense_coverage
        self.cutoff_5utr = cutoff_5utr_coverage
        self.cutoff_3utr = cutoff_3utr_coverage
        self.cutoff_intercds = cutoff_interCDS_coverage
        self.fastas = fasta_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
                replicates_tex, replicates_frag)
        self.table_best = table_best
        self.srnas = sRNA_folder
        self.start_codon = self._deal_multi_inputs(start_codon, "str",
                                                   None, None)
        self.stop_codon = self._deal_multi_inputs(stop_codon, "str",
                                                  None, None)
        self.background = cutoff_background
        self.fuzzy_rbs = fuzzy_rbs
        self.noafter_tss = rbs_not_after_TSS
        self.print_all = print_all_combination
        self.no_srna = best_no_sRNA
        self.no_tss = best_no_TSS
        self.hypo = ignore_hypothetical_protein
        self.min_rbs = min_rbs_distance
        self.max_rbs = max_rbs_distance
        self = self._parser_combine_wigs("sorf")
        return self

    def container_srna_target(self, Vienna_folder, annotation_path, fasta_path,
                              sRNA_path, query_sRNA, program,
                              interaction_length, window_size_target,
                              span_target, window_size_srna, span_srna,
                              unstructured_region_RNAplex_target,
                              unstructured_region_RNAplex_srna,
                              unstructured_region_RNAup, energy_threshold,
                              duplex_distance, top, starget_output_folder,
                              process_rnaplex, process_rnaup, continue_rnaup,
                              potential_target_start, potential_target_end,
                              target_feature):
        self.vienna_path = Vienna_folder
        self.gffs = annotation_path
        self.fastas = fasta_path
        self.srnas = sRNA_path
        self.query = self._deal_multi_inputs(query_sRNA, "str", None, None)
        self.program = program
        self.inter_length = interaction_length
        self.win_size_t = window_size_target
        self.span_t = span_target
        self.win_size_s = window_size_srna
        self.span_s = span_srna
        self.unstr_region_rnaplex_t = unstructured_region_RNAplex_target
        self.unstr_region_rnaplex_s = unstructured_region_RNAplex_srna
        self.unstr_region_rnaup = unstructured_region_RNAup
        self.energy = energy_threshold
        self.duplex_dist = duplex_distance
        self.top = top
        self.out_folder = starget_output_folder
        self.core_plex = process_rnaplex
        self.core_up = process_rnaup
        self.continue_rnaup = continue_rnaup
        self.tar_start = potential_target_start
        self.tar_end = potential_target_end
        self.features = self._deal_multi_inputs(target_feature, "str",
                                                None, None)
        return self

    def container_goterm(self, annotation_path, goterm_output_folder,
                         UniProt_id, go_obo, goslim_obo, transcript_path):
        self.gffs = annotation_path
        self.out_folder = goterm_output_folder
        self.uniprot = UniProt_id
        self.go = go_obo
        self.goslim = goslim_obo
        self.trans = transcript_path
        return self

    def container_sublocal(self, Psortb_path, gff_path, fasta_path,
                           bacteria_type, difference_multi, merge_to_gff,
                           sublocal_output_folder, transcript_path):
        self.psortb_path = Psortb_path
        self.gffs = gff_path
        self.fastas = fasta_path
        self.gram = bacteria_type
        self.fuzzy = difference_multi
        self.merge = merge_to_gff
        self.out_folder = sublocal_output_folder
        self.trans = transcript_path
        return self

    def container_ppi(self, gff_path, proteinID_strains, without_strain_pubmed,
                      species_STRING, score, ppi_output_folder, node_size,
                      query):
        self.ptts = gff_path
        self.strains = self._deal_multi_inputs(proteinID_strains, "str",
                                               None, None)
        self.no_specific = without_strain_pubmed
        self.species = species_STRING
        self.score = score
        self.out_folder = ppi_output_folder
        self.size = node_size
        self.querys = self._deal_multi_inputs(query, "str", None, None)
        return self

    def container_promoter(self, MEME_path, promoter_output_folder, tex_libs,
                           TSS_folder, fasta_folder, num_motif, nt_before_TSS,
                           motif_width, TSS_source, tex_wig_path,
                           annotation_folder, combine_all, e_value):
        self.meme_path = MEME_path
        self.output_folder = promoter_output_folder
        self.input_libs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.tsss = TSS_folder
        self.fastas = fasta_folder
        self.num_motif = num_motif
        self.nt_before = nt_before_TSS
        self.widths = self._deal_multi_inputs(motif_width, "str", None, None)
        self.source = TSS_source
        self.wigs = tex_wig_path
        self.gffs = annotation_folder
        self.combine = combine_all
        self.e_value = e_value
        return self

    def container_operon(self, TSS_folder, annotation_folder,
                         transcript_folder, UTR5_folder, UTR3_folder,
                         term_folder, TSS_fuzzy, term_fuzzy, min_length,
                         statistics, operon_output_folder, combine_gff,
                         operon_statistics_folder):
        self.tsss = TSS_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.utr5s = UTR5_folder
        self.utr3s = UTR3_folder
        self.terms = term_folder
        self.tss_fuzzy = TSS_fuzzy
        self.term_fuzzy = term_fuzzy
        self.length = min_length
        self.statistics = statistics
        self.output_folder = operon_output_folder
        self.combine = combine_gff
        self.stat_folder = operon_statistics_folder
        return self

    def container_snp(self, samtools_path, bcftools_path, bam_type, program,
                      fasta_path, tex_bam_path, frag_bam_path, quality,
                      read_depth, snp_output_folder, indel_fraction, chrom):
        self.samtools_path = samtools_path
        self.bcftools_path = bcftools_path
        self.types = bam_type
        self.program = self._deal_multi_inputs(program, "str", None, None)
        self.fastas = fasta_path
        self.normal_bams = tex_bam_path
        self.frag_bams = frag_bam_path
        self.quality = quality
        self.depth = read_depth
        self.out_folder = snp_output_folder
        self.fraction = indel_fraction
        if chrom == "haploid":
            chrom = "1"
        elif chrom == "diploid":
            chrom = "2"
        self.chrom = chrom
        return self

    def container_circrna(self, align, process, fasta_path, annotation_path,
                          tex_bam_path, fragmented_bam_path, read_folder,
                          circrna_stat_folder, support_reads,
                          segemehl_folder, samtools_path, start_ratio,
                          end_ratio, ignore_hypothetical_protein, out_folder):
        self.align = align
        self.cores = process
        self.fastas = fasta_path
        self.gffs = annotation_path
        self.normal_bams = tex_bam_path
        self.frag_bams = fragmented_bam_path
        self.read_folder = read_folder
        self.stat_folder = circrna_stat_folder
        self.support = support_reads
        self.segemehl_path = segemehl_folder
        self.samtools_path = samtools_path
        self.start_ratio = start_ratio
        self.end_ratio = end_ratio
        self.hypo = ignore_hypothetical_protein
        self.output_folder = out_folder
        return self

    def container_ribos(self, infernal_path, riboswitch_ID, gff_path,
                        fasta_path, tss_path, transcript_path, Rfam,
                        ribos_output_folder, e_value, output_all,
                        database_folder, fuzzy, start_codon, min_dist_rbs,
                        max_dist_rbs, fuzzy_rbs, UTR_length):
        self.infernal_path = infernal_path
        self.ribos_id = riboswitch_ID
        self.gffs = gff_path
        self.fastas = fasta_path
        self.tsss = tss_path
        self.trans = transcript_path
        self.rfam = Rfam
        self.out_folder = ribos_output_folder
        self.e_value = e_value
        self.output_all = output_all
        self.database = database_folder
        self.fuzzy = fuzzy
        self.start_codons = self._deal_multi_inputs(start_codon, "str",
                                                    None, None)
        self.start_rbs = min_dist_rbs
        self.end_rbs = max_dist_rbs
        self.fuzzy_rbs = fuzzy_rbs
        self.utr = UTR_length
        return self

    def container_screen(self, main_gff, side_gffs, fasta, frag_wig_folder,
                         tex_wig_folder, height, tex_libs, frag_libs, present,
                         output_folder):
        self.main_gff = main_gff
        self.side_gffs = self._deal_multi_inputs(side_gffs, "str", None, None)
        self.fasta = fasta
        self.frag_wigs = frag_wig_folder
        self.tex_wigs = tex_wig_folder
        self.height = height
        self.tlibs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.present = present
        self.output_folder = output_folder
        return self
예제 #2
0
class TSSpredator(object):
    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {
            "tss": "tmp_TSS",
            "ta_tss": "tmp_ta_tss",
            "tss_ta": "tmp_tss",
            "tmp": "tmp"
        }
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {
            "wig": lib_datas[0],
            "tex": lib_datas[1],
            "condition": int(lib_datas[2]),
            "replicate": lib_datas[3],
            "strand": lib_datas[4]
        }

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(prefix,
                                                       cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out,
             stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0]
                        == lib_datas[0][:-4]) and (filename[1][:-4]
                                                   == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(args_tss.cluster +
                                                           1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic",
                    feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(predict, stat_file,
                                     os.path.join(self.tmps["tss"], filename),
                                     os.path.join(args_tss.gffs, gff),
                                     args_tss)
            shutil.move(
                stat_file,
                os.path.join(args_tss.out_folder, "statistics", tss,
                             stat_file))
        self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder,
                                     [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSS with genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(self.stat_outfolder, tss,
                                     "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSS with transcript'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None,
                                     "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"],
                            args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(
                    self.stat_outfolder, tss,
                    "_".join(["stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss,
                             "_".join(["stat", feature, "libs", tss]) +
                             ".csv"))
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(prefix, args_tss,
                                         os.path.join(self.gff_path,
                                                      gff), self.wig_path,
                                         os.path.join(self.fasta_path, fasta),
                                         config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(
                self.tmps["tmp"], "_".join([prefix,
                                            args_tss.program + ".gff"]))
            pre_tss = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss), ref,
                                       args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss), ref,
                                       args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower()
                    == "tss") and (gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower()
                  == "processing") and (gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(
                    os.path.join(
                        self.stat_outfolder, prefix,
                        "_".join(["stat", prefix,
                                  "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder,
                                 gff), args_tss, "tmp/merge_forward.wig",
                    "tmp/merge_reverse.wig", "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            config_file = os.path.join(input_folder,
                                       "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs, os.path.join(args_tss.wig_folder,
                                                     "tmp"), args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder, None,
                                     args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None,
                                         args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
예제 #3
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        detect = False
        for best_file in os.listdir(args_tss.auto_load):
            if best_file == "_".join(["best", prefix + ".csv"]):
                bh = open(os.path.join(args_tss.auto_load, best_file),"r" )
                lines = bh.readlines()
                bh.close()
                if len(lines[len(lines)-1].split("\t")) < 8:
                    print("Error: some information in {0} is missing. "
                          "It may be due to that \"optimize_tss_ps\" did "
                          "not finish successfully.".format(best_file))
                    log.write("Error: some information in {0} is missing. "
                              "It may be due to that \"optimize_tss_ps\" did "
                              "not finish successfully.\n".format(best_file))
                    sys.exit()
                else:
                    para_info = lines[len(lines)-1].split("\t")[1].split("_")
                    detect_all =  all(elem in para_info
                            for elem in ["he", "rh", "fa", "rf",
                                         "bh", "ef", "pf"])
                    if (not detect_all) or (len(para_info) != 14):
                        print("Error: {0} is complete. Some parameters are "
                              "missing!".format(best_file))
                        log.write("Error: {0} is complete. Some parameters "
                                  "are missing!\n".format(best_file))
                        sys.exit()
                    else:
                        detect = True
                        height = para_info[para_info.index("he") + 1]
                        height_reduction = para_info[
                            para_info.index("rh") + 1]
                        factor = para_info[para_info.index("fa") + 1]
                        factor_reduction = para_info[
                            para_info.index("rf") + 1]
                        base_height = para_info[
                            para_info.index("bh") + 1]
                        enrichment_factor = para_info[
                            para_info.index("ef") + 1]
                        processing_factor = para_info[
                            para_info.index("pf") + 1]
        if detect:
            return height, height_reduction, factor, factor_reduction, \
                   base_height, enrichment_factor, processing_factor
        else:
            print("Error: No best_{0}.csv can be found in {1}! ".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        if args_tss.genome_order is None:
            height = args_tss.height[0]
            height_reduction = args_tss.height_reduction[0]
            factor = args_tss.factor[0]
            factor_reduction = args_tss.factor_reduction[0]
            base_height = args_tss.base_height[0]
            enrichment_factor = args_tss.enrichment_factor[0]
            processing_factor = args_tss.processing_factor[0]
        else:
            if prefix not in args_tss.genome_order:
                print("Error: the parameters for {0} were not assigned!".format(
                    prefix))
                log.write("Error: the parameters for {0} were not assigned!\n".format(
                    prefix))
                sys.exit()
            else:
                index = args_tss.genome_order.index(prefix)
                height = args_tss.height[index]
                height_reduction = args_tss.height_reduction[index]
                factor = args_tss.factor[index]
                factor_reduction = args_tss.factor_reduction[index]
                base_height = args_tss.base_height[index]
                enrichment_factor = args_tss.enrichment_factor[index]
                processing_factor = args_tss.processing_factor[index]
        return height, height_reduction, factor, factor_reduction, \
               base_height, enrichment_factor, processing_factor

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._extract_best_para(args_tss, project_strain_name, log)
        else:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  factor_reduction))
        out.write("minCliffHeight = {0}\n".format(height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  height_reduction))
        out.write("minNormalHeight = {0}\n".format(base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                         None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
예제 #4
0
class TestMultiparser(unittest.TestCase):
    def setUp(self):
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        if (not os.path.exists(self.ref_folder)):
            os.mkdir(self.ref_folder)
        self.tar_folder = "tar_folder"
        if (not os.path.exists(self.tar_folder)):
            os.mkdir(self.tar_folder)

    def tearDown(self):
        if os.path.exists(self.ref_folder):
            shutil.rmtree(self.ref_folder)
        if os.path.exists(self.tar_folder):
            shutil.rmtree(self.tar_folder)

    def test_combine_fasta(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_tar, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_tar, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_ref, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_ref, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.fa")))

    def test_combine_wig(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_wig1 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_aaa.wig")
        sub_wig2 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_bbb.wig")
        sub_wig3 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_aaa.wig")
        sub_wig4 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_bbb.wig")
        wig_files = [sub_wig1, sub_wig2, sub_wig3, sub_wig4]
        example_wigs = [
            self.example.sub_f_wig1, self.example.sub_f_wig2,
            self.example.sub_r_wig1, self.example.sub_r_wig2
        ]
        for index in range(0, 4):
            with open(wig_files[index], "w") as fh:
                fh.write(example_wigs[index])
        libs = [
            "test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
            "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"
        ]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta", libs)
        self.assertTrue(
            os.path.exists(os.path.join(tmp_tar, "test_forward.wig")))
        self.assertTrue(
            os.path.exists(os.path.join(tmp_tar, "test_reverse.wig")))

    def test_combine_gff(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_tar, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_tar, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta", None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.gff")))

    def test_parser_fasta(self):
        fasta_file = os.path.join(self.ref_folder, "test.fa")
        with open(fasta_file, "w") as rh:
            rh.write(self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.fa")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.fa")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.fa_folder/aaa.fa")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.fa_folder/bbb.fa")))

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        with open(gff_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.gff")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.gff_folder/aaa.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.gff_folder/bbb.gff")))
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        with open(tss_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa_TSS.gff")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb_TSS.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "test_TSS.gff_folder/aaa_TSS.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "test_TSS.gff_folder/bbb_TSS.gff")))

    def test_parser_wig(self):
        wig_f_file = os.path.join(self.ref_folder, "test_forward.wig")
        with open(wig_f_file, "w") as rh:
            rh.write(self.example.wig_f_file)
        wig_r_file = os.path.join(self.ref_folder, "test_reverse.wig")
        with open(wig_r_file, "w") as rh:
            rh.write(self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_reverse_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_forward.wig_folder/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_forward.wig_folder/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig")))
예제 #5
0
class TestMultiparser(unittest.TestCase):

    def setUp(self):
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        if (not os.path.exists(self.ref_folder)):
            os.mkdir(self.ref_folder)
        self.tar_folder = "tar_folder"
        if (not os.path.exists(self.tar_folder)):
            os.mkdir(self.tar_folder)

    def tearDown(self):
        if os.path.exists(self.ref_folder):
            shutil.rmtree(self.ref_folder)
        if os.path.exists(self.tar_folder):
            shutil.rmtree(self.tar_folder)

    def test_combine_fasta(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_tar, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_tar, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_ref, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_ref, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.fa")))

    def test_combine_wig(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_wig1 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_aaa.wig")
        sub_wig2 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_bbb.wig")
        sub_wig3 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_aaa.wig")
        sub_wig4 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_bbb.wig")
        wig_files = [sub_wig1, sub_wig2, sub_wig3, sub_wig4]
        example_wigs = [self.example.sub_f_wig1, self.example.sub_f_wig2,
                        self.example.sub_r_wig1, self.example.sub_r_wig2]
        for index in range(0, 4):
            with open(wig_files[index], "w") as fh:
                fh.write(example_wigs[index])
        libs = ["test_forward.wig_STRAIN_aaa.wig:frag:1:a:+", "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta", libs)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test_forward.wig")))
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test_reverse.wig")))

    def test_combine_gff(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_tar, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_tar, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta", None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.gff")))

    def test_parser_fasta(self):
        fasta_file = os.path.join(self.ref_folder, "test.fa")
        with open(fasta_file, "w") as rh:
            rh.write(self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.fa")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.fa")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.fa_folder/aaa.fa")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.fa_folder/bbb.fa")))

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        with open(gff_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.gff_folder/aaa.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.gff_folder/bbb.gff")))
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        with open(tss_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/aaa_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/bbb_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test_TSS.gff_folder/aaa_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test_TSS.gff_folder/bbb_TSS.gff")))

    def test_parser_wig(self):
        wig_f_file = os.path.join(self.ref_folder, "test_forward.wig")
        with open(wig_f_file, "w") as rh:
            rh.write(self.example.wig_f_file)
        wig_r_file = os.path.join(self.ref_folder, "test_reverse.wig")
        with open(wig_r_file, "w") as rh:
            rh.write(self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_reverse_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_forward.wig_folder/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_forward.wig_folder/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig")))
예제 #6
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        prefixs = []
        detect = False
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                                input_folder,
                                "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config, log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                         None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
예제 #7
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus")
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file):
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus")
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus")
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                      out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", args_tss.program, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(args_tss.out_folder,
                                                "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_Transcriptome_assembly_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                                input_folder,
                                "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.references, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.references, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)