def run(self):
    """Runs the calculation.

    Looks up the single ``CommandLineApp`` subclass defined in
    ``self.module``, instantiates it with ``self.logger`` and runs it with
    an argument list built from the global configuration file,
    ``self.extra_args`` (if any) and the ``switch.*`` / ``infile`` entries
    of ``self.parameters``. Standard input and output are redirected to
    the streams provided by ``modula.STORAGE_ENGINE`` while it runs.

    Raises:
        ValueError: if the module does not contain exactly one
            ``CommandLineApp`` subclass.
        RuntimeError: if the application returns a non-zero return code.
    """
    self.logger.info("Starting module %s", self.name)

    self.prepare()

    # Search for the CommandLineApp subclasses in the module
    apps = [
        value for value in self.module.__dict__.values()
        if isinstance(value, type) and value != CommandLineApp
        and issubclass(value, CommandLineApp)
    ]

    # Report the two failure modes separately; the same exception type is
    # kept so existing callers are unaffected.
    if not apps:
        raise ValueError("no CommandLineApp in %s" % self.name)
    if len(apps) > 1:
        raise ValueError("more than one CommandLineApp in %s" % self.name)

    # Create the application
    app = apps[0](logger=self.logger)
    args = ["-c", self.config.get("@global.config_file")]

    # Add some extra args, if any. A missing attribute raises
    # AttributeError (never NameError), so test for its presence directly.
    if not hasattr(self, "extra_args"):
        self.extra_args = []
    if self.extra_args is not None:
        args.extend(self.extra_args)

    # Every "switch.*" parameter has the form "<switch> <storage key>";
    # the key is translated to a file name by the storage engine.
    for param, value in self.parameters.items():
        if not param.startswith("switch."):
            continue
        switch, value = value.split(" ", 1)
        value = modula.STORAGE_ENGINE.get_filename(value.strip())
        args.extend([switch, value])

    # "infile" is a comma-separated list of storage keys
    if "infile" in self.parameters:
        for infile in self.parameters["infile"].split(","):
            args.append(modula.STORAGE_ENGINE.get_filename(infile.strip()))

    if "stdin" in self.parameters:
        stdin = modula.STORAGE_ENGINE.get_source(self.parameters["stdin"])
    else:
        stdin = None

    out_fname = modula.STORAGE_ENGINE.get_filename(self.name)
    stdout = modula.STORAGE_ENGINE.get_result_stream(self, mode="wb")
    try:
        with redirected(stdin=stdin, stdout=stdout):
            retcode = app.run(args)
        stdout.close()
        if retcode:
            raise RuntimeError("non-zero return code from child module")
    except Exception:
        # If an error happens, remove the output file and re-raise
        # the exception
        stdout.close()
        os.unlink(out_fname)
        raise

    self.logger.info("Finished module %s", self.name)
def print_new_domains_table(self, table):
    """Prints the new domain table.

    Writes one line per cluster to the file named by
    ``self.options.new_domains_table``: the cluster name, a tab, and the
    tab-joined entries stored for that cluster in ``table``. Clusters are
    written in sorted order of their names.

    Args:
        table: mapping from cluster name to a sequence of strings.
    """
    self.log.info("Printing the new domains table")
    # The ``with`` block closes the file even if a row cannot be written,
    # and writing directly avoids swapping the process-wide stdout.
    with open(self.options.new_domains_table, "w") as table_file:
        for cluster_name in sorted(table):
            row = cluster_name + "\t" + "\t".join(table[cluster_name])
            table_file.write(row + "\n")
def print_new_domains_table(self, table):
    """Prints the new domain table.

    The output file is the one named by ``self.options.new_domains_table``.
    Each cluster of ``table`` yields one line made of the cluster name
    followed by its entries, all separated by tabs; lines are ordered by
    cluster name.

    Args:
        table: mapping from cluster name to a sequence of strings.
    """
    self.log.info("Printing the new domains table")
    # No ``print`` statement is used here: that syntax only exists on
    # Python 2, whereas plain writes behave the same on Python 2 and 3.
    # The ``with`` block also guarantees the file is closed on errors.
    with open(self.options.new_domains_table, "w") as table_file:
        table_file.writelines(
            name + "\t" + "\t".join(table[name]) + "\n"
            for name in sorted(table)
        )
def run_real(self):
    """Runs the application.

    Expects exactly two positional arguments in ``self.args``: the
    InterPro file and the clustering file. After processing both, one
    tab-separated line per sequence is printed to standard output (id,
    length, number of covered residues, domain architecture, size of the
    architecture family, positional architecture, description). When
    ``self.options.stats`` is set, summary statistics are written to that
    file as well.
    """
    if len(self.args) != 2:
        # NOTE(review): presumably self.error() aborts the run -- confirm
        self.error("exactly two input files are expected")

    AssignmentOverlapChecker.max_overlap = self.options.max_overlap
    AssignmentOverlapChecker.log = self.log

    if self.options.interpro_parent_child_file:
        self.log.info("Loading InterPro parent-child"
                      " assignments from %s..." %
                      self.options.interpro_parent_child_file)
        self.interpro = InterPro.FromFile(
            self.options.interpro_parent_child_file)
    else:
        self.interpro = InterPro()

    self.interpro_names = InterProNames.FromFile(
        self.options.interpro_names_file)

    if self.options.details:
        self.details_file = open(self.options.details, "w")
    else:
        self.details_file = None

    if self.options.old_table:
        self.process_old_table(self.options.old_table)
        self.using_old_table = True
    else:
        self.using_old_table = False
        # NOTE(review): the cluster counter is reset only when no old
        # table is in use -- confirm process_old_table() sets it otherwise
        self.current_cluster_id = 1

    interpro_file, clustering_file = self.args
    self.process_interpro_file(interpro_file)
    table = self.process_clustering_file(clustering_file)
    self.sort_by_domain_architecture()

    if self.options.new_domains_table:
        self.print_new_domains_table(table)

    for seqs in self.domain_archs.values():
        seqs.sort()

    # From here on self.domain_archs is a list of (architecture, members)
    # pairs, largest family first. sorted() is used because dict.items()
    # has no sort() method on Python 3.
    self.domain_archs = sorted(self.domain_archs.items(),
                               key=lambda item: len(item[1]),
                               reverse=True)

    for domain_arch, members in self.domain_archs:
        if domain_arch:
            arch_str = domain_arch
        else:
            arch_str = "NO_ASSIGNMENT"
            arch_str_pos = "NO_ASSIGNMENT"
            arch_desc = "NO_DESCRIPTION"

        family_length = len(members)
        for member in members:
            seq = self.seqcat[member]
            if domain_arch:
                arch_str_pos = seq.architecture_pos
                arch_desc = ";".join(
                    self.interpro_names[assignment.domain]
                    for assignment in seq.assignments)

            print("%s\t%d\t%d\t%s\t%d\t%s\t%s" %
                  (member, seq.length, seq.num_covered(), arch_str,
                   family_length, arch_str_pos, arch_desc))

    # The details file is only opened when the option was given
    if self.details_file is not None:
        self.details_file.close()

    if self.options.stats:
        total_residues = 0.0
        covered_residues = 0
        covered_residues_nonnovel = 0
        nonnovel_sources = complementerset(["Novel"])

        for seq in self.seqcat.values():
            total_residues += seq.length
            covered_residues += round(seq.coverage() * seq.length)
            covered_residues_nonnovel += round(
                seq.coverage(sources=nonnovel_sources) * seq.length)

        all_archs = set(arch for arch, _ in self.domain_archs)
        num_archs = len(all_archs)
        # self.domain_archs is a list of pairs by now, so the empty
        # architecture has to be looked up among the keys
        if "" in all_archs:
            num_archs -= 1

        prefix = self.options.prefix or "NOVEL"

        def split_arch(arch):
            """Splits an architecture string into its domain names."""
            return [
                x for x in
                arch.replace("{", ";").replace("}", ";").split(";") if x
            ]

        def exclude_novel_domains(domain_architecture):
            """Excludes novel domains from a domain architecture and
            returns the filtered domain architecture as a tuple."""
            return tuple(a for a in split_arch(domain_architecture)
                         if not a.startswith(prefix))

        def percent(part, whole):
            """Returns part as a percentage of whole; 0.0 if whole is 0."""
            return 100.0 * part / whole if whole else 0.0

        archs_without_novel = set(
            exclude_novel_domains(arch) for arch in all_archs)
        archs_without_novel.discard(())
        num_archs_without_novel = len(archs_without_novel)

        num_seqs_with_nonempty_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key and key != "NO_ASSIGNMENT")
        num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
            len(value) for key, value in self.domain_archs
            if exclude_novel_domains(key) in archs_without_novel
            and key != "NO_ASSIGNMENT")
        num_seqs_with_nonempty_nonnovel_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key
            and not any(a.startswith(prefix) for a in split_arch(key))
            and key != "NO_ASSIGNMENT")

        num_seqs = len(self.seqcat)

        # The with block closes the stats file even if printing fails
        with open(self.options.stats, "w") as stats_file:
            with redirected(stdout=stats_file):
                print("Domain architectures")
                print("====================")
                print("")
                print("Non-empty: %d" % num_archs)
                print("Non-empty (when ignoring novel domains): %d" %
                      num_archs_without_novel)
                print("")
                print("Sequences")
                print("=========")
                print("")
                print("Total: %d" % num_seqs)
                print("With at least one domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch,
                       percent(num_seqs_with_nonempty_domain_arch,
                               num_seqs)))
                print("With at least one non-novel domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch_ignore_novel,
                       percent(
                           num_seqs_with_nonempty_domain_arch_ignore_novel,
                           num_seqs)))
                print("With at least one domain and no novel domains: "
                      "%d (%.4f%%)" %
                      (num_seqs_with_nonempty_nonnovel_domain_arch,
                       percent(num_seqs_with_nonempty_nonnovel_domain_arch,
                               num_seqs)))
                print("")
                print("Residues")
                print("========")
                print("")
                print("Total: %d" % total_residues)
                print("Covered: %d (%.4f%%)" %
                      (covered_residues,
                       percent(covered_residues, total_residues)))
                print("Covered by non-novel: %d (%.4f%%)" %
                      (covered_residues_nonnovel,
                       percent(covered_residues_nonnovel, total_residues)))
def run_real(self):
    """Runs the application.

    Expects exactly two positional arguments in ``self.args``: the
    InterPro file and the clustering file. After processing both, one
    tab-separated line per sequence is printed to standard output (id,
    length, domain architecture, size of the architecture family,
    positional architecture, description). When ``self.options.stats`` is
    set, summary statistics are written to that file as well.

    Only constructs that behave the same on Python 2 and Python 3 are
    used (single-argument ``print(...)``, ``values()``, ``sorted()``).
    """
    if len(self.args) != 2:
        # NOTE(review): presumably self.error() aborts the run -- confirm
        self.error("exactly two input files are expected")

    AssignmentOverlapChecker.max_overlap = self.options.max_overlap

    if self.options.interpro_parent_child_file:
        self.log.info(
            "Loading InterPro parent-child assignments from %s..." %
            self.options.interpro_parent_child_file)
        self.interpro = InterPro.FromFile(
            self.options.interpro_parent_child_file)
    else:
        self.interpro = InterPro()

    self.interpro_names = InterProNames.FromFile(
        self.options.interpro_names_file)

    if self.options.details:
        self.details_file = open(self.options.details, "w")
    else:
        self.details_file = None

    interpro_file, clustering_file = self.args
    self.process_interpro_file(interpro_file)
    self.process_clustering_file(clustering_file)
    self.sort_by_domain_architecture()

    for seqs in self.domain_archs.values():
        seqs.sort()

    # From here on self.domain_archs is a list of (architecture, members)
    # pairs, largest family first.
    self.domain_archs = sorted(self.domain_archs.items(),
                               key=lambda item: len(item[1]),
                               reverse=True)

    for domain_arch, members in self.domain_archs:
        if domain_arch:
            arch_str = ";".join(domain_arch)
        else:
            arch_str = "NO_ASSIGNMENT"
            arch_str_pos = "NO_ASSIGNMENT"
            arch_desc = "NO_DESCRIPTION"

        family_length = len(members)
        for member in members:
            seq = self.seqcat[member]
            if domain_arch:
                arch_str_pos = ";".join(
                    assignment.short_repr()
                    for assignment in seq.assignments)
                arch_desc = ";".join(
                    self.interpro_names[assignment.domain]
                    for assignment in seq.assignments)

            print("%s\t%d\t%s\t%d\t%s\t%s" %
                  (member, seq.length, arch_str, family_length,
                   arch_str_pos, arch_desc))

    # The details file is only opened when the option was given
    if self.details_file is not None:
        self.details_file.close()

    if self.options.stats:
        total_residues = 0.0
        covered_residues = 0
        covered_residues_nonnovel = 0
        nonnovel_sources = complementerset(["Novel"])

        for seq in self.seqcat.values():
            total_residues += seq.length
            covered_residues += round(seq.coverage() * seq.length)
            covered_residues_nonnovel += round(
                seq.coverage(sources=nonnovel_sources) * seq.length)

        all_archs = set(arch for arch, _ in self.domain_archs)
        # Count the non-empty architectures among the keys;
        # self.domain_archs itself is a list of pairs by now, so a
        # membership test against it never finds the empty architecture.
        num_archs = len([arch for arch in all_archs if arch])

        def exclude_novel_domains(domain_architecture):
            """Excludes novel domains from a domain architecture and
            returns the filtered domain architecture as a tuple."""
            return tuple(a for a in domain_architecture
                         if not a.startswith("NOVEL"))

        def percent(part, whole):
            """Returns part as a percentage of whole; 0.0 if whole is 0."""
            return 100.0 * part / whole if whole else 0.0

        archs_without_novel = set(
            exclude_novel_domains(arch) for arch in all_archs)
        archs_without_novel.discard(())
        num_archs_without_novel = len(archs_without_novel)

        num_seqs_with_nonempty_domain_arch = sum(
            len(value) for key, value in self.domain_archs if key)
        num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
            len(value) for key, value in self.domain_archs
            if exclude_novel_domains(key) in archs_without_novel)
        num_seqs_with_nonempty_nonnovel_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key and not any(a.startswith("NOVEL") for a in key))

        num_seqs = len(self.seqcat)

        # The with block closes the stats file even if printing fails
        with open(self.options.stats, "w") as stats_file:
            with redirected(stdout=stats_file):
                print("Domain architectures")
                print("====================")
                print("")
                print("Non-empty: %d" % num_archs)
                print("Non-empty (when ignoring novel domains): %d" %
                      num_archs_without_novel)
                print("")
                print("Sequences")
                print("=========")
                print("")
                print("Total: %d" % num_seqs)
                print("With at least one domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch,
                       percent(num_seqs_with_nonempty_domain_arch,
                               num_seqs)))
                print("With at least one non-novel domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch_ignore_novel,
                       percent(
                           num_seqs_with_nonempty_domain_arch_ignore_novel,
                           num_seqs)))
                print("With at least one domain and no novel domains: "
                      "%d (%.4f%%)" %
                      (num_seqs_with_nonempty_nonnovel_domain_arch,
                       percent(num_seqs_with_nonempty_nonnovel_domain_arch,
                               num_seqs)))
                print("")
                print("Residues")
                print("========")
                print("")
                print("Total: %d" % total_residues)
                print("Covered: %d (%.4f%%)" %
                      (covered_residues,
                       percent(covered_residues, total_residues)))
                print("Covered by non-novel: %d (%.4f%%)" %
                      (covered_residues_nonnovel,
                       percent(covered_residues_nonnovel, total_residues)))
def run(self):
    """Runs the calculation.

    Finds the single ``CommandLineApp`` subclass defined in
    ``self.module``, creates it with ``self.logger`` and runs it. The
    argument list consists of the global configuration file, the optional
    ``self.extra_args`` (skipped when missing or ``None``) and the
    ``switch.*`` / ``infile`` entries of ``self.parameters``. While the
    application runs, standard input and output are redirected to the
    streams provided by ``modula.STORAGE_ENGINE``.

    Raises:
        ValueError: if the module does not contain exactly one
            ``CommandLineApp`` subclass.
        RuntimeError: if the application returns a non-zero return code.
    """
    self.logger.info("Starting module %s", self.name)

    self.prepare()

    # Search for the CommandLineApp subclasses in the module
    candidates = []
    for value in self.module.__dict__.values():
        if not isinstance(value, type) or value == CommandLineApp:
            continue
        if issubclass(value, CommandLineApp):
            candidates.append(value)

    # Same exception type for both cases, but an accurate message
    if not candidates:
        raise ValueError("no CommandLineApp in %s" % self.name)
    if len(candidates) > 1:
        raise ValueError("more than one CommandLineApp in %s" % self.name)

    # Create the application
    app = candidates[0](logger=self.logger)
    args = ["-c", self.config.get("@global.config_file")]

    # Add some extra args, if any. Looking up a missing attribute raises
    # AttributeError, not NameError, so getattr() with a default is used
    # to supply the empty list.
    self.extra_args = getattr(self, "extra_args", [])
    if self.extra_args is not None:
        args.extend(self.extra_args)

    # Every "switch.*" parameter has the form "<switch> <storage key>";
    # the key is translated to a file name by the storage engine.
    for param, value in self.parameters.items():
        if not param.startswith("switch."):
            continue
        switch, value = value.split(" ", 1)
        value = modula.STORAGE_ENGINE.get_filename(value.strip())
        args.extend([switch, value])

    # "infile" is a comma-separated list of storage keys
    if "infile" in self.parameters:
        for infile in self.parameters["infile"].split(","):
            args.append(modula.STORAGE_ENGINE.get_filename(infile.strip()))

    if "stdin" in self.parameters:
        stdin = modula.STORAGE_ENGINE.get_source(self.parameters["stdin"])
    else:
        stdin = None

    out_fname = modula.STORAGE_ENGINE.get_filename(self.name)
    stdout = modula.STORAGE_ENGINE.get_result_stream(self, mode="wb")
    try:
        try:
            with redirected(stdin=stdin, stdout=stdout):
                retcode = app.run(args)
        finally:
            # Close the result stream on every path, including exceptions
            # other than RuntimeError, so the handle is never leaked
            stdout.close()
        if retcode:
            raise RuntimeError("non-zero return code from child module")
    except RuntimeError:
        # If a runtime error happens, remove the output file and re-raise
        # the exception
        os.unlink(out_fname)
        raise

    self.logger.info("Finished module %s", self.name)
def run_real(self):
    """Runs the application.

    Expects exactly two positional arguments in ``self.args``: the
    InterPro file and the HMMer file. After processing both, one
    tab-separated line per sequence is printed to standard output (id,
    length, number of covered residues, domain architecture, size of the
    architecture family, positional architecture, description). When
    ``self.options.stats`` is set, summary statistics are written to that
    file as well; domains listed in ``self.hmm_domains`` are the ones
    treated as novel there.
    """
    if len(self.args) != 2:
        # NOTE(review): presumably self.error() aborts the run -- confirm
        self.error("exactly two input files are expected")

    AssignmentOverlapChecker.max_overlap = self.options.max_overlap

    if self.options.interpro_parent_child_file:
        self.log.info("Loading InterPro parent-child"
                      " assignments from %s..." %
                      self.options.interpro_parent_child_file)
        self.interpro = InterPro.FromFile(
            self.options.interpro_parent_child_file)
    else:
        self.interpro = InterPro()

    self.interpro_names = InterProNames.FromFile(
        self.options.interpro_names_file)

    if self.options.details:
        self.details_file = open(self.options.details, "w")
    else:
        self.details_file = None

    interpro_file, hmmer_file = self.args
    self.process_interpro_file(interpro_file)
    self.process_hmmer_file(hmmer_file)
    self.sort_by_domain_architecture()

    for seqs in self.domain_archs.values():
        seqs.sort()

    # From here on self.domain_archs is a list of (architecture, members)
    # pairs, largest family first. sorted() is used because dict.items()
    # has no sort() method on Python 3.
    self.domain_archs = sorted(self.domain_archs.items(),
                               key=lambda item: len(item[1]),
                               reverse=True)

    for domain_arch, members in self.domain_archs:
        if domain_arch:
            arch_str = domain_arch
        else:
            arch_str = "NO_ASSIGNMENT"
            arch_str_pos = "NO_ASSIGNMENT"
            arch_desc = "NO_DESCRIPTION"

        family_length = len(members)
        for member in members:
            seq = self.seqcat[member]
            if domain_arch:
                arch_str_pos = seq.architecture_pos
                arch_desc = ";".join(
                    self.interpro_names[assignment.domain]
                    for assignment in seq.assignments)

            print("%s\t%d\t%d\t%s\t%d\t%s\t%s" %
                  (member, seq.length, seq.num_covered(), arch_str,
                   family_length, arch_str_pos, arch_desc))

    # The details file is only opened when the option was given
    if self.details_file is not None:
        self.details_file.close()

    if self.options.stats:
        total_residues = 0.0
        covered_residues = 0
        covered_residues_nonnovel = 0
        nonnovel_sources = complementerset(["Novel"])

        for seq in self.seqcat.values():
            total_residues += seq.length
            covered_residues += round(seq.coverage() * seq.length)
            covered_residues_nonnovel += round(
                seq.coverage(sources=nonnovel_sources) * seq.length)

        all_archs = set(arch for arch, _ in self.domain_archs)
        num_archs = len(all_archs)
        # self.domain_archs is a list of pairs by now, so the empty
        # architecture has to be looked up among the keys
        if "" in all_archs:
            num_archs -= 1

        def split_arch(arch):
            """Splits an architecture string into its domain names."""
            return [
                x for x in
                arch.replace("{", ";").replace("}", ";").split(";") if x
            ]

        def exclude_novel_domains(domain_architecture):
            """Excludes novel domains from a domain architecture and
            returns the filtered domain architecture as a tuple."""
            return tuple(a for a in split_arch(domain_architecture)
                         if a not in self.hmm_domains)

        def percent(part, whole):
            """Returns part as a percentage of whole; 0.0 if whole is 0."""
            return 100.0 * part / whole if whole else 0.0

        archs_without_novel = set(
            exclude_novel_domains(arch) for arch in all_archs)
        archs_without_novel.discard(())
        num_archs_without_novel = len(archs_without_novel)

        num_seqs_with_nonempty_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key and key != "NO_ASSIGNMENT")
        num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
            len(value) for key, value in self.domain_archs
            if exclude_novel_domains(key) in archs_without_novel
            and key != "NO_ASSIGNMENT")
        # The architecture is a string, so it must be split into domain
        # names before testing them; iterating over the string itself
        # would compare single characters against the HMM domain names.
        num_seqs_with_nonempty_nonnovel_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key
            and not any(a in self.hmm_domains for a in split_arch(key))
            and key != "NO_ASSIGNMENT")

        num_seqs = len(self.seqcat)

        # The with block closes the stats file even if printing fails
        with open(self.options.stats, "w") as stats_file:
            with redirected(stdout=stats_file):
                print("Domain architectures")
                print("====================")
                print("")
                print("Non-empty: %d" % num_archs)
                print("Non-empty (when ignoring novel domains): %d" %
                      num_archs_without_novel)
                print("")
                print("Sequences")
                print("=========")
                print("")
                print("Total: %d" % num_seqs)
                print("With at least one domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch,
                       percent(num_seqs_with_nonempty_domain_arch,
                               num_seqs)))
                print("With at least one non-novel domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch_ignore_novel,
                       percent(
                           num_seqs_with_nonempty_domain_arch_ignore_novel,
                           num_seqs)))
                # A space was missing between "novel" and "domains"
                print("With at least one domain and no novel "
                      "domains: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_nonnovel_domain_arch,
                       percent(num_seqs_with_nonempty_nonnovel_domain_arch,
                               num_seqs)))
                print("")
                print("Residues")
                print("========")
                print("")
                print("Total: %d" % total_residues)
                print("Covered: %d (%.4f%%)" %
                      (covered_residues,
                       percent(covered_residues, total_residues)))
                print("Covered by non-novel: %d (%.4f%%)" %
                      (covered_residues_nonnovel,
                       percent(covered_residues_nonnovel, total_residues)))