def run_real(self):
    """Runs the application.

    Loads the set of valid sequence IDs (when a sequences file is given),
    determines which assignment sources are accepted, and then processes
    every input file in turn, using ``"-"`` as the only input when no
    arguments were given. A new output formatter is created for each input
    and finished after that input has been processed.
    """
    opts = self.options

    # Load valid sequence IDs (if necessary); without a sequences file
    # every ID is accepted and the total length is unknown.
    if opts.sequences_file:
        self.log.info("Loading sequences from %s..." % opts.sequences_file)
        self.total_sequence_length = 0
        self.valid_sequence_ids = set()
        records = fasta.regexp_remapper(
            fasta.Parser(open_anything(opts.sequences_file)),
            opts.sequence_id_regexp)
        for record in records:
            self.valid_sequence_ids.add(record.id)
            self.total_sequence_length += len(record.seq)
    else:
        self.valid_sequence_ids = complementerset()
        self.total_sequence_length = None

    # Find which sources will be allowed: start from the explicit
    # include list (or "everything") and remove the excluded ones.
    if opts.include_sources:
        self.sources = set(opts.include_sources)
    else:
        self.sources = complementerset()
    self.sources.difference_update(opts.exclude_sources)

    if isinstance(self.sources, complementerset):
        summary = "Ignored sources: %s" % \
            ", ".join(self.sources.iterexcluded())
    else:
        summary = "Accepted sources: %s" % ", ".join(self.sources)
    self.log.info(summary)

    if not self.args:
        self.args = ["-"]

    for infile in self.args:
        # Set up the output formatter
        if self.options.print_totals:
            formatter = GenomeLevelOutputFormatter(self)
        else:
            formatter = SequenceLevelOutputFormatter(self)
        self.output_formatter = formatter

        # Process the file, then print the results
        self.process_infile(infile)
        self.output_formatter.finish()
def get_stages_from_config(self):
    """Turns to the configuration file specified at startup to
    fetch the data sources to be used in each stage of the algorithm.
    If there is no configuration file specified or it does not
    contain the corresponding keys, it will simply use a default
    stage setup which ignores HMMPanther and Gene3D in the first
    and second steps, but uses all sources in the third step.

    The method will be looking for configuration keys named like
    ``stages.1``, ``stages.2`` and so on in the
    ``analysis:iprscan_filter`` section of the config file. The value
    of each such config key must be an expression consisting of
    assignment source names and the operators ``+`` and ``-``, with
    their usual meaning of addition and exclusion. The special source
    name ``ALL`` means all possible data sources, enabling us to write
    expressions like ``ALL-HMMPanther`` (meaning all the sources except
    HMMPanther). Whitespace around source names and operators is
    ignored, and empty source names (e.g. produced by two consecutive
    operators) are skipped. Some examples:

    - ``HMMPanther`` means HMMPanther only.
    - ``ALL`` means all possible data sources.
    - ``HMMPanther+HMMPfam`` means HMMPanther or HMMPfam.
    - ``ALL-HMMPanther-Gene3D`` means all possible data sources but
      HMMPanther or Gene3D.
    - ``ALL+HMMPanther`` does not really make sense as you are extending
      all data sources with HMMPanther, so it is equivalent to ``ALL``.
      GFam will figure out what you meant anyway.

    Returns a list with one source set per stage, in stage order.
    """
    default_spec = ["ALL-HMMPanther-Gene3D", "ALL-HMMPanther-Gene3D", "ALL"]

    cfg = self.parser.config
    spec = []
    if cfg is not None:
        # Collect stages.1, stages.2, ... until the first missing key
        section = "analysis:iprscan_filter"
        idx = 1
        while cfg.has_option(section, "stages.%d" % idx):
            spec.append(cfg.get(section, "stages.%d" % idx))
            idx += 1
    if not spec:
        # No config file, or no stage keys in it: use the documented default
        spec = default_spec

    # An optional sign, optional whitespace, then a run of characters
    # that are neither "+" nor "-" (the source name, possibly padded).
    regexp = re.compile(r"([-+])?\s*([^-+]+)")

    result = []
    for item in spec:
        sources = set()
        for match in regexp.finditer(item):
            sign, name = match.groups()
            # Strip before comparing so that "ALL - X" is recognised too
            name = name.strip()
            if not name:
                continue
            if name == "ALL":
                source = complementerset()
            else:
                source = set([name])
            if sign == "-":
                sources -= source
            else:
                sources |= source
        result.append(sources)

    return result
def run_real(self):
    """Runs the application.

    Configures the overlap checker, loads the optional InterPro data and
    the optional list of valid sequence IDs, opens the optional exclusion
    log, collects the ignored sources and finally processes the single
    input file (``"-"`` when no argument was given).
    """
    opts = self.options
    AssignmentOverlapChecker.max_overlap = opts.max_overlap

    # Known InterPro IDs: loaded from a file if one was given, otherwise
    # a default-constructed InterPro object is used.
    if not opts.interpro_file:
        self.interpro = InterPro()
    else:
        self.log.info("Loading known InterPro IDs from %s..." %
                      opts.interpro_file)
        self.interpro = InterPro.FromFile(opts.interpro_file)

    # Valid sequence IDs: one per line of the gene ID file, or "all of
    # them" when no file was given.
    if not opts.gene_id_file:
        self.valid_sequence_ids = complementerset()
    else:
        self.log.info("Loading sequence IDs from %s..." % opts.gene_id_file)
        self.valid_sequence_ids = set()
        for raw_line in open_anything(opts.gene_id_file):
            self.valid_sequence_ids.add(raw_line.strip())

    # The exclusion log is opened in append mode and kept on the
    # instance; it is not closed in this method.
    if not opts.exclusions_log_file:
        self.exclusion_log = None
    else:
        self.log.info("Logging excluded sequences to %s." %
                      opts.exclusions_log_file)
        self.exclusion_log = open(opts.exclusions_log_file, "a+")

    # Every "ignored" entry may hold several whitespace-separated names
    self.ignored = set()
    for entry in opts.ignored:
        self.ignored.update(entry.split())

    if not self.args:
        self.args = ["-"]
    elif len(self.args) > 1:
        self.error("Only one input file may be given")

    self.process_infile(self.args[0])
def run_real(self):
    """Runs the application.

    Expects exactly two positional arguments: the InterPro assignment
    file and the clustering file, in this order. After both files have
    been processed, one tab-separated row is printed for every sequence
    (ID, length, number of covered residues, architecture, family size,
    architecture with positions, architecture description), with the
    architectures ordered by decreasing number of member sequences.

    When ``options.stats`` is set, summary statistics about the
    architectures, sequences and residues are written to that file.
    Percentages are reported as zero when the corresponding total is zero.
    """
    if len(self.args) != 2:
        self.error("exactly two input files are expected")

    AssignmentOverlapChecker.max_overlap = self.options.max_overlap
    AssignmentOverlapChecker.log = self.log

    if self.options.interpro_parent_child_file:
        self.log.info("Loading InterPro parent-child"
                      " assignments from %s..." %
                      self.options.interpro_parent_child_file)
        self.interpro = InterPro.FromFile(
            self.options.interpro_parent_child_file)
    else:
        self.interpro = InterPro()

    self.interpro_names = InterProNames.FromFile(
        self.options.interpro_names_file)

    # The details file is only opened here; nothing in this method
    # writes to it (presumably the processing methods do -- verify).
    if self.options.details:
        self.details_file = open(self.options.details, "w")
    else:
        self.details_file = None

    if self.options.old_table:
        self.process_old_table(self.options.old_table)
        self.using_old_table = True
    else:
        self.using_old_table = False
        self.current_cluster_id = 1

    interpro_file, clustering_file = self.args
    self.process_interpro_file(interpro_file)
    table = self.process_clustering_file(clustering_file)
    self.sort_by_domain_architecture()

    if self.options.new_domains_table:
        self.print_new_domains_table(table)

    for seqs in self.domain_archs.values():
        seqs.sort()

    # dict.items() is a view without .sort() on Python 3, so build the
    # sorted list of (architecture, members) pairs explicitly.
    self.domain_archs = sorted(self.domain_archs.items(),
                               key=lambda x: len(x[1]), reverse=True)

    for domain_arch, members in self.domain_archs:
        if domain_arch:
            arch_str = domain_arch
        else:
            arch_str = "NO_ASSIGNMENT"
            arch_str_pos = "NO_ASSIGNMENT"
            arch_desc = "NO_DESCRIPTION"

        family_length = len(members)
        for member in members:
            seq = self.seqcat[member]
            if domain_arch:
                arch_str_pos = seq.architecture_pos
                arch_desc = ";".join(
                    self.interpro_names[assignment.domain]
                    for assignment in seq.assignments)
            print("%s\t%d\t%d\t%s\t%d\t%s\t%s" %
                  (member, seq.length, seq.num_covered(), arch_str,
                   family_length, arch_str_pos, arch_desc))

    # The details file is optional, so it may be None here.
    if self.details_file is not None:
        self.details_file.close()

    if not self.options.stats:
        return

    def percentage(part, whole):
        """Returns ``part`` as a percentage of ``whole`` (0 if empty)."""
        return 100.0 * part / whole if whole else 0.0

    total_residues = 0.0
    covered_residues = 0
    covered_residues_nonnovel = 0
    nonnovel_sources = complementerset(["Novel"])

    for seq in self.seqcat.values():
        total_residues += seq.length
        covered_residues += round(seq.coverage() * seq.length)
        covered_residues_nonnovel += round(
            seq.coverage(sources=nonnovel_sources) * seq.length)

    all_archs = set(arch for arch, _ in self.domain_archs)
    # Count only the non-empty architectures; self.domain_archs is a list
    # of pairs at this point, so the emptiness test must look at the keys.
    num_archs = sum(1 for arch in all_archs if arch)

    if self.options.prefix:
        prefix = self.options.prefix
    else:
        prefix = "NOVEL"

    def split_arch(arch):
        """Splits an architecture string at ``;``, ``{`` and ``}``."""
        return [
            x for x in arch.replace("{", ";").replace("}", ";").split(";")
            if x
        ]

    def exclude_novel_domains(domain_architecture):
        """Excludes novel domains from a domain architecture and returns
        the filtered domain architecture as a tuple."""
        return tuple(a for a in split_arch(domain_architecture)
                     if not a.startswith(prefix))

    archs_without_novel = set(
        exclude_novel_domains(arch) for arch in all_archs)
    archs_without_novel.discard(())
    num_archs_without_novel = len(archs_without_novel)

    num_seqs_with_nonempty_domain_arch = sum(
        len(value) for key, value in self.domain_archs
        if key and key != "NO_ASSIGNMENT")
    num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
        len(value) for key, value in self.domain_archs
        if exclude_novel_domains(key) in archs_without_novel
        and key != "NO_ASSIGNMENT")
    num_seqs_with_nonempty_nonnovel_domain_arch = sum(
        len(value) for key, value in self.domain_archs
        if key and not any(a.startswith(prefix) for a in split_arch(key))
        and key != "NO_ASSIGNMENT")

    num_seqs = len(self.seqcat)

    # The with-statement closes the stats file even if printing fails.
    with open(self.options.stats, "w") as stats_file:
        with redirected(stdout=stats_file):
            print("Domain architectures")
            print("====================")
            print()
            print("Non-empty: %d" % num_archs)
            print("Non-empty (when ignoring novel domains): %d" %
                  num_archs_without_novel)
            print()
            print("Sequences")
            print("=========")
            print()
            print("Total: %d" % num_seqs)
            print("With at least one domain: %d (%.4f%%)" %
                  (num_seqs_with_nonempty_domain_arch,
                   percentage(num_seqs_with_nonempty_domain_arch,
                              num_seqs)))
            print("With at least one non-novel domain: %d (%.4f%%)" %
                  (num_seqs_with_nonempty_domain_arch_ignore_novel,
                   percentage(
                       num_seqs_with_nonempty_domain_arch_ignore_novel,
                       num_seqs)))
            print("With at least one domain and no novel domains: "
                  "%d (%.4f%%)" %
                  (num_seqs_with_nonempty_nonnovel_domain_arch,
                   percentage(num_seqs_with_nonempty_nonnovel_domain_arch,
                              num_seqs)))
            print()
            print("Residues")
            print("========")
            print()
            print("Total: %d" % total_residues)
            print("Covered: %d (%.4f%%)" %
                  (covered_residues,
                   percentage(covered_residues, total_residues)))
            print("Covered by non-novel: %d (%.4f%%)" %
                  (covered_residues_nonnovel,
                   percentage(covered_residues_nonnovel, total_residues)))
def run_real(self):
    """Runs the application.

    Expects exactly two positional arguments: the InterPro assignment
    file and the clustering file, in this order. After both files have
    been processed, one tab-separated row is printed for every sequence
    (ID, length, architecture, family size, architecture with positions,
    architecture description), with the architectures ordered by
    decreasing number of member sequences.

    When ``options.stats`` is set, summary statistics about the
    architectures, sequences and residues are written to that file.
    Percentages are reported as zero when the corresponding total is zero.

    The body only uses constructs that behave identically on Python 2
    and Python 3 (single-argument ``print(...)``, ``values()``,
    ``sorted(...)``).
    """
    if len(self.args) != 2:
        self.error("exactly two input files are expected")

    AssignmentOverlapChecker.max_overlap = self.options.max_overlap

    if self.options.interpro_parent_child_file:
        self.log.info("Loading InterPro parent-child assignments from %s..." %
                      self.options.interpro_parent_child_file)
        self.interpro = InterPro.FromFile(
            self.options.interpro_parent_child_file)
    else:
        self.interpro = InterPro()

    self.interpro_names = InterProNames.FromFile(
        self.options.interpro_names_file)

    # The details file is only opened here; nothing in this method
    # writes to it (presumably the processing methods do -- verify).
    if self.options.details:
        self.details_file = open(self.options.details, "w")
    else:
        self.details_file = None

    interpro_file, clustering_file = self.args
    self.process_interpro_file(interpro_file)
    self.process_clustering_file(clustering_file)
    self.sort_by_domain_architecture()

    for seqs in self.domain_archs.values():
        seqs.sort()

    # sorted() works on both the Python 2 list and the Python 3 view
    # returned by items(); like list.sort() it is a stable sort.
    self.domain_archs = sorted(self.domain_archs.items(),
                               key=lambda x: len(x[1]), reverse=True)

    for domain_arch, members in self.domain_archs:
        if domain_arch:
            arch_str = ";".join(domain_arch)
        else:
            arch_str = "NO_ASSIGNMENT"
            arch_str_pos = "NO_ASSIGNMENT"
            arch_desc = "NO_DESCRIPTION"

        family_length = len(members)
        for member in members:
            seq = self.seqcat[member]
            if domain_arch:
                arch_str_pos = ";".join(assignment.short_repr()
                                        for assignment in seq.assignments)
                arch_desc = ";".join(
                    self.interpro_names[assignment.domain]
                    for assignment in seq.assignments)
            print("%s\t%d\t%s\t%d\t%s\t%s" %
                  (member, seq.length, arch_str, family_length,
                   arch_str_pos, arch_desc))

    # The details file is optional, so it may be None here.
    if self.details_file is not None:
        self.details_file.close()

    if not self.options.stats:
        return

    def percentage(part, whole):
        """Returns ``part`` as a percentage of ``whole`` (0 if empty)."""
        return 100.0 * part / whole if whole else 0.0

    total_residues, covered_residues, covered_residues_nonnovel = 0.0, 0, 0
    nonnovel_sources = complementerset(["Novel"])

    for seq in self.seqcat.values():
        total_residues += seq.length
        covered_residues += round(seq.coverage() * seq.length)
        covered_residues_nonnovel += round(
            seq.coverage(sources=nonnovel_sources) * seq.length)

    all_archs = set(arch for arch, _ in self.domain_archs)
    # Count only the non-empty architectures; self.domain_archs is a list
    # of pairs at this point, so the emptiness test must look at the keys.
    num_archs = sum(1 for arch in all_archs if arch)

    def exclude_novel_domains(domain_architecture):
        """Excludes novel domains from a domain architecture and returns
        the filtered domain architecture as a tuple."""
        return tuple(a for a in domain_architecture
                     if not a.startswith("NOVEL"))

    archs_without_novel = set(exclude_novel_domains(arch)
                              for arch in all_archs)
    archs_without_novel.discard(())
    num_archs_without_novel = len(archs_without_novel)

    num_seqs_with_nonempty_domain_arch = sum(
        len(value) for key, value in self.domain_archs if key)
    num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
        len(value) for key, value in self.domain_archs
        if exclude_novel_domains(key) in archs_without_novel)
    num_seqs_with_nonempty_nonnovel_domain_arch = sum(
        len(value) for key, value in self.domain_archs
        if key and not any(a.startswith("NOVEL") for a in key))

    num_seqs = len(self.seqcat)

    # The with-statement closes the stats file even if printing fails.
    with open(self.options.stats, "w") as stats_file:
        with redirected(stdout=stats_file):
            print("Domain architectures")
            print("====================")
            print("")
            print("Non-empty: %d" % num_archs)
            print("Non-empty (when ignoring novel domains): %d" %
                  num_archs_without_novel)
            print("")
            print("Sequences")
            print("=========")
            print("")
            print("Total: %d" % num_seqs)
            print("With at least one domain: %d (%.4f%%)" %
                  (num_seqs_with_nonempty_domain_arch,
                   percentage(num_seqs_with_nonempty_domain_arch,
                              num_seqs)))
            print("With at least one non-novel domain: %d (%.4f%%)" %
                  (num_seqs_with_nonempty_domain_arch_ignore_novel,
                   percentage(
                       num_seqs_with_nonempty_domain_arch_ignore_novel,
                       num_seqs)))
            print("With at least one domain and no novel domains: "
                  "%d (%.4f%%)" %
                  (num_seqs_with_nonempty_nonnovel_domain_arch,
                   percentage(num_seqs_with_nonempty_nonnovel_domain_arch,
                              num_seqs)))
            print("")
            print("Residues")
            print("========")
            print("")
            print("Total: %d" % total_residues)
            print("Covered: %d (%.4f%%)" %
                  (covered_residues,
                   percentage(covered_residues, total_residues)))
            print("Covered by non-novel: %d (%.4f%%)" %
                  (covered_residues_nonnovel,
                   percentage(covered_residues_nonnovel, total_residues)))
def run_real(self):
    """Runs the application.

    Expects exactly two positional arguments: the InterPro assignment
    file and the HMMer file, in this order. After both files have been
    processed, one tab-separated row is printed for every sequence
    (ID, length, number of covered residues, architecture, family size,
    architecture with positions, architecture description), with the
    architectures ordered by decreasing number of member sequences.

    When ``options.stats`` is set, summary statistics about the
    architectures, sequences and residues are written to that file;
    a domain counts as novel there when it is in ``self.hmm_domains``.
    Percentages are reported as zero when the corresponding total is zero.
    """
    if len(self.args) != 2:
        self.error("exactly two input files are expected")

    AssignmentOverlapChecker.max_overlap = self.options.max_overlap

    if self.options.interpro_parent_child_file:
        self.log.info("Loading InterPro parent-child"
                      " assignments from %s..." %
                      self.options.interpro_parent_child_file)
        self.interpro = InterPro.FromFile(
            self.options.interpro_parent_child_file)
    else:
        self.interpro = InterPro()

    self.interpro_names = InterProNames.FromFile(
        self.options.interpro_names_file)

    # The details file is only opened here; nothing in this method
    # writes to it (presumably the processing methods do -- verify).
    if self.options.details:
        self.details_file = open(self.options.details, "w")
    else:
        self.details_file = None

    interpro_file, hmmer_file = self.args
    self.process_interpro_file(interpro_file)
    self.process_hmmer_file(hmmer_file)
    self.sort_by_domain_architecture()

    for seqs in self.domain_archs.values():
        seqs.sort()

    # dict.items() is a view without .sort() on Python 3, so build the
    # sorted list of (architecture, members) pairs explicitly.
    self.domain_archs = sorted(self.domain_archs.items(),
                               key=lambda x: len(x[1]), reverse=True)

    for domain_arch, members in self.domain_archs:
        if domain_arch:
            arch_str = domain_arch
        else:
            arch_str = "NO_ASSIGNMENT"
            arch_str_pos = "NO_ASSIGNMENT"
            arch_desc = "NO_DESCRIPTION"

        family_length = len(members)
        for member in members:
            seq = self.seqcat[member]
            if domain_arch:
                arch_str_pos = seq.architecture_pos
                arch_desc = ";".join(
                    self.interpro_names[assignment.domain]
                    for assignment in seq.assignments)
            print("%s\t%d\t%d\t%s\t%d\t%s\t%s" %
                  (member, seq.length, seq.num_covered(), arch_str,
                   family_length, arch_str_pos, arch_desc))

    # The details file is optional, so it may be None here.
    if self.details_file is not None:
        self.details_file.close()

    if not self.options.stats:
        return

    def percentage(part, whole):
        """Returns ``part`` as a percentage of ``whole`` (0 if empty)."""
        return 100.0 * part / whole if whole else 0.0

    total_residues = 0.0
    covered_residues, covered_residues_nonnovel = 0, 0
    nonnovel_sources = complementerset(["Novel"])

    for seq in self.seqcat.values():
        total_residues += seq.length
        covered_residues += round(seq.coverage() * seq.length)
        covered_residues_nonnovel += round(
            seq.coverage(sources=nonnovel_sources) * seq.length)

    all_archs = set(arch for arch, _ in self.domain_archs)
    # Count only the non-empty architectures; self.domain_archs is a list
    # of pairs at this point, so the emptiness test must look at the keys.
    num_archs = sum(1 for arch in all_archs if arch)

    def split_arch(arch):
        """Splits an architecture string at ``;``, ``{`` and ``}``."""
        return [x for x in arch.replace("{", ";").replace("}", ";")
                .split(";") if x]

    def exclude_novel_domains(domain_architecture):
        """Excludes novel domains from a domain architecture and returns
        the filtered domain architecture as a tuple."""
        return tuple(a for a in split_arch(domain_architecture)
                     if a not in self.hmm_domains)

    archs_without_novel = set(exclude_novel_domains(arch)
                              for arch in all_archs)
    archs_without_novel.discard(())
    num_archs_without_novel = len(archs_without_novel)

    num_seqs_with_nonempty_domain_arch = sum(
        len(value) for key, value in self.domain_archs
        if key and key != "NO_ASSIGNMENT")
    num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
        len(value) for key, value in self.domain_archs
        if exclude_novel_domains(key) in archs_without_novel
        and key != "NO_ASSIGNMENT")
    # The architecture is a string, so it has to be split into domain
    # names before membership in hmm_domains can be tested; iterating
    # the string directly would test single characters.
    num_seqs_with_nonempty_nonnovel_domain_arch = sum(
        len(value) for key, value in self.domain_archs
        if key and not any(a in self.hmm_domains for a in split_arch(key))
        and key != "NO_ASSIGNMENT")

    num_seqs = len(self.seqcat)

    # The with-statement closes the stats file even if printing fails.
    with open(self.options.stats, "w") as stats_file:
        with redirected(stdout=stats_file):
            print("Domain architectures")
            print("====================")
            print()
            print("Non-empty: %d" % num_archs)
            print("Non-empty (when ignoring novel domains): %d" %
                  num_archs_without_novel)
            print()
            print("Sequences")
            print("=========")
            print()
            print("Total: %d" % num_seqs)
            print("With at least one domain: %d (%.4f%%)" %
                  (num_seqs_with_nonempty_domain_arch,
                   percentage(num_seqs_with_nonempty_domain_arch,
                              num_seqs)))
            print("With at least one non-novel domain: %d (%.4f%%)" %
                  (num_seqs_with_nonempty_domain_arch_ignore_novel,
                   percentage(
                       num_seqs_with_nonempty_domain_arch_ignore_novel,
                       num_seqs)))
            # The space after "novel" was missing in the original label
            print("With at least one domain and no novel "
                  "domains: %d (%.4f%%)" %
                  (num_seqs_with_nonempty_nonnovel_domain_arch,
                   percentage(num_seqs_with_nonempty_nonnovel_domain_arch,
                              num_seqs)))
            print()
            print("Residues")
            print("========")
            print()
            print("Total: %d" % total_residues)
            print("Covered: %d (%.4f%%)" %
                  (covered_residues,
                   percentage(covered_residues, total_residues)))
            print("Covered by non-novel: %d (%.4f%%)" %
                  (covered_residues_nonnovel,
                   percentage(covered_residues_nonnovel, total_residues)))