def process_file(self, input_file):
    """Processes the given input file that contains the domain architectures.

    Each line of *input_file* is expected to be tab-separated with the
    gene/sequence ID in column 0 and the domain architecture in column 3
    (domains separated by ``;``, insertions wrapped in ``{...}``).  Runs the
    GO overrepresentation analysis once per distinct architecture (results
    are cached), optionally writing per-architecture results to
    ``self.options.arch_file`` and per-protein results to stdout.

    :param input_file: name of (or handle to) the architecture file; it is
        opened through ``open_anything``, so compressed files and ``-`` may
        be accepted -- TODO confirm against ``open_anything``'s contract.
    """
    self.log.info("Running overrepresentation analysis")
    self.log.info("p-value = %.4f, correction method = %s",
                  self.options.confidence, self.options.correction)

    # Open the per-architecture result file up front, if requested.  When
    # --ignore is in effect we write *unfiltered* results to a side file
    # and produce the final (filtered) file at the very end.
    arch_file = None
    if self.options.arch_file:
        arch_file_name = self.options.arch_file
        if self.options.ignore:
            arch_file_name += "_unfiltered"
        arch_file = open(arch_file_name, "w")

    confidence = self.options.confidence
    if self.options.ignore:
        # Disable the significance cutoff entirely; filtering happens later
        # via ResultFileFilter.
        confidence = float("inf")
        self.log.info("Ignored the significance value."
                      " We will filter results later.")

    overrep = OverrepresentationAnalyser(
        self.go_tree, self.go_mapping,
        confidence=confidence,
        min_count=self.options.min_size,
        correction=self.options.correction)

    cache = {}              # architecture tuple -> [(term, p_value), ...]
    num_no_annotations = 0  # sequences with no overrepresented annotation
    num_no_domains = 0      # sequences with no domain assignment at all
    total_seqs = 0

    # try/finally guarantees the architecture file is closed (and flushed)
    # even if the analysis of some line raises; the original code leaked
    # the handle on error.
    try:
        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            # "{" and "}" mark domain insertions; treat them as separators
            # so the architecture becomes a flat tuple of domain names.
            prts = parts[3].replace("{", ";").replace("}", ";").split(";")
            arch = tuple(x for x in prts if x)
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                # First time we see this architecture: run the test and,
                # if requested, dump its results to the architecture file.
                cache[arch] = overrep.test_group(arch)
                if arch_file is not None:
                    arch_file.write("{}\n".format(parts[3]))  # architecture
                    for term, p_value in cache[arch]:
                        arch_file.write("    %.4f: %s (%s)\n"
                                        % (p_value, term.id, term.name))
                    arch_file.write("\n")

            if self.options.results_by_protein:
                print(gene_id)
                for term, p_value in cache[arch]:
                    print("    %.4f: %s (%s)" % (p_value, term.id, term.name))
                print()

            if not cache[arch]:
                num_no_annotations += 1
    finally:
        if arch_file is not None:
            arch_file.close()

    self.log.info("Total number of sequences processed: %d", total_seqs)
    if num_no_annotations:
        self.log.info("%d sequences have no overrepresented annots. :(",
                      num_no_annotations)
    if num_no_domains:
        self.log.info("%d sequences have no domains at all :(",
                      num_no_domains)

    if self.options.arch_file and self.options.ignore:
        # We wrote unfiltered results above; now filter them down with the
        # real significance value into the user-requested file name.
        filterer = ResultFileFilter(arch_file_name)
        filterer.filter(self.options.arch_file,
                        confidence=self.options.confidence)
def _filter_arch_file(self, unfiltered, filtered):
    """Filter *unfiltered* architecture results into *filtered*.

    Does nothing unless both ``--ignore`` and per-architecture results
    were requested; otherwise applies the configured significance
    threshold (``self.options.max_pvalue``) via ``ResultFileFilter``.
    """
    # Guard clause: filtering only makes sense when the significance
    # cutoff was deferred (--ignore) and per-arch output exists.
    if not (self.options.ignore and self.options.results_by_arch):
        return
    result_filter = ResultFileFilter(unfiltered)
    result_filter.filter(filtered, confidence=self.options.max_pvalue)
def do_run(self):
    """Runs the whole ConSAT pipeline.

    Steps, in order: domain-architecture inference, label assignment,
    overrepresentation analysis, GO-transfer-based function prediction
    (when a GOA file is configured), combination of the two prediction
    sources, and finally text weighting (when idmapping + RDF inputs are
    configured).  Each step is executed through ``self.modula`` and its
    result is copied/filtered into the configured output folder unless
    the target file already exists.
    """
    # Get the output folder name
    outfolder = self.config.get("DEFAULT", "folder.output")

    # Run and export the inferred domain architectures
    outfile = os.path.join(outfolder, "domain_architectures.tab")
    self.modula.run("find_domain_arch_with_hmms", force=self.options.force)
    if not os.path.exists(outfile):
        shutil.copy(
            self.modula.STORAGE_ENGINE.get_filename(
                "find_domain_arch_with_hmms"), outfile)
    self.log.info("Exported domain architectures to %s.", outfile)

    # Run and export the label assignment
    outfile = os.path.join(outfolder, "assigned_labels.txt")
    self.modula.run("label_assignment", force=self.options.force)
    if not os.path.exists(outfile):
        shutil.copy(self.modula.STORAGE_ENGINE.get_filename(
            "label_assignment"), outfile)
    self.log.info("Exported label assignment to %s.", outfile)

    # Run and export the overrepresentation analysis.
    # A configured GOA file means overrep + function-transfer results will
    # later be combined, so the overrep step runs with "-i" (keep
    # unfiltered results) in that case.
    there_is_combination = self.config.get("DEFAULT",
                                           "file.function.goa_file")
    overrep_detail = self.config.get("analysis:overrep", "per_protein")
    # NOTE(review): transfer_detail reads the exact same option as
    # overrep_detail; presumably it should read section
    # "analysis:function_arch" instead -- confirm before changing.
    transfer_detail = self.config.get("analysis:overrep", "per_protein")
    outfile = os.path.join(outfolder, "overrepresentation_analysis.txt")
    if not there_is_combination:
        self.modula.run("overrep", force=self.options.force)
        if not os.path.exists(outfile) and overrep_detail:
            shutil.copy(self.modula.STORAGE_ENGINE.get_filename("overrep"),
                        outfile)
            self.log.info("Exported overrepresentation analysis to %s.",
                          outfile)
    else:
        # Keep unfiltered results ("-i") and filter them here with the
        # configured confidence while exporting.
        self.modula.run("overrep", force=self.options.force,
                        extra_args=["-i"])
        if not os.path.exists(outfile) and overrep_detail:
            filterer = ResultFileFilter(
                self.modula.STORAGE_ENGINE.get_filename("overrep"))
            conf = float(self.config.get("analysis:overrep", "confidence"))
            self.log.info("Obtaining filtered file from %s",
                          self.modula.STORAGE_ENGINE.get_filename(
                              "overrep"))
            filterer.filter(outfile, confidence=conf)
            self.log.info("Exported overrepresentation analysis to %s.",
                          outfile)

    # Run the functional prediction, if we have to
    if self.config.get("DEFAULT", "file.function.goa_file"):
        if not there_is_combination:
            self.modula.run("function_arch", force=self.options.force)
            outfile = os.path.join(outfolder,
                                   "predicted_function_by_transfer.txt")
            if not os.path.exists(outfile) and transfer_detail:
                shutil.copy(
                    self.modula.STORAGE_ENGINE.get_filename(
                        "function_arch"), outfile)
                self.log.info("Exported predicted function "
                              "by transfer to %s.", outfile)
        else:
            # Combination pending: run with "-i" and filter with the
            # function_arch-specific p-value cutoff while exporting.
            self.modula.run("function_arch", force=self.options.force,
                            extra_args=["-i"])
            outfile = os.path.join(outfolder,
                                   "predicted_function_by_transfer.txt")
            if not os.path.exists(outfile) and transfer_detail:
                filterer = ResultFileFilter(
                    self.modula.STORAGE_ENGINE.get_filename(
                        "function_arch"))
                conf = float(self.config.get("analysis:function_arch",
                                             "max_pvalue"))
                filterer.filter(outfile, confidence=conf)
                self.log.info("Exported predicted function by"
                              " transfer to %s.", outfile)
    else:
        # NOTE(review): "funtion" typo below is in a runtime log string,
        # left untouched here.
        self.log.info("No GOA source file was found and therefore")
        self.log.info("no funtion transfer will be performed")

    if there_is_combination:
        # we combine the overrep and function_arch results
        infile1 = self.modula.STORAGE_ENGINE.get_filename("overrep")
        infile2 = self.modula.STORAGE_ENGINE.get_filename("function_arch")
        outfile = os.path.join(outfolder, "combined_prediction.txt")
        # confidence is 0.05 (default value)
        # TODO: add this as a parameter in the configuration file
        if not os.path.exists(outfile) and overrep_detail\
                and transfer_detail:
            combiner = ResultFileCombiner(infile1, infile2)
            combiner.combine(outfile)
        # if there are files by arch, we combine them
        if self.config.get("generated", "file.overrep.arch_file") and\
                self.config.get("generated",
                                "file.function_arch.general_arch_file"):
            # The "_unfiltered" side files are produced by the steps above
            # when run with "-i".
            infile_arch1 = self.config.get(
                "generated", "file.overrep.arch_file") + "_unfiltered"
            infile_arch2 = self.config.get(
                "generated", "file.function_arch.general_arch_file") +\
                "_unfiltered"
            outfile_arch = os.path.join(outfolder,
                                        "combined_prediction_by_arch.txt")
            if not os.path.exists(outfile_arch):
                combiner_arch = ResultFileCombiner(infile_arch1,
                                                   infile_arch2)
                combiner_arch.combine(outfile_arch)
    else:
        # the combination is a copy of the overrep file
        infile = os.path.join(outfolder, "overrepresentation_analysis.txt")
        outfile = os.path.join(outfolder, "combined_prediction.txt")
        if overrep_detail:
            shutil.copy(infile, outfile)
            # same for the overrep by arch, if it exists
            infile_arch = self.config.get("generated",
                                          "file.overrep.arch_file")
            outfile_arch = os.path.join(outfolder,
                                        "combined_prediction_by_arch.txt")
            shutil.copy(infile_arch, outfile_arch)

    # Run the words prediction, if we have to
    if self.config.get("DEFAULT", "file.idmapping") and\
            self.config.get("DEFAULT", "file.rdffile"):
        self.modula.run("get_text", force=self.options.force)
        self.log.info("Text weighting done!")
        outfile = os.path.join(outfolder, "weight_file_per_arch")
        if not os.path.exists(outfile):
            shutil.copy(self.modula.STORAGE_ENGINE.get_filename(
                "get_text"), outfile)
        self.log.info("Exported weight vectors per architecture to %s",
                      outfile)
    else:
        self.log.info("Either not idmapping or RDF file were specified")
        self.log.info("no text weighting will be performed")
def process_file(self, input_file):
    """Processes the given input file that contains the domain architectures.

    Each line is tab-separated with the gene/sequence ID in column 0 and
    the domain architecture in column 3 (domains separated by ``;``,
    insertions wrapped in ``{...}``).  The GO overrepresentation test runs
    once per distinct architecture (cached); results are optionally written
    per architecture to ``self.options.arch_file`` and per protein to
    stdout.
    """
    self.log.info("Running overrepresentation analysis")
    self.log.info("p-value = %.4f, correction method = %s",
                  self.options.confidence, self.options.correction)

    if self.options.arch_file:
        arch_file_name = self.options.arch_file
        if self.options.ignore:
            # With --ignore we write unfiltered results to a side file and
            # produce the final filtered file at the end.
            arch_file_name += "_unfiltered"
        arch_file = open(arch_file_name, "w")

    confidence = self.options.confidence
    if self.options.ignore:
        # Disable the cutoff entirely; filtering is deferred to the
        # ResultFileFilter pass below.
        confidence = float("inf")
        self.log.info("Ignored the significance value."
                      " We will filter results later.")

    overrep = OverrepresentationAnalyser(
        self.go_tree, self.go_mapping, confidence=confidence,
        min_count=self.options.min_size,
        correction=self.options.correction)

    cache = {}              # architecture tuple -> [(term, p_value), ...]
    num_no_annotations = 0  # sequences with no overrepresented annotation
    num_no_domains = 0      # sequences with no domain assignment at all
    total_seqs = 0
    for line in open_anything(input_file):
        parts = line.strip().split("\t")
        gene_id = parts[0]
        # "{"/"}" mark insertions; treat them as separators so the
        # architecture becomes a flat tuple of domain names.
        prts = parts[3].replace("{", ";").replace("}", ";").split(";")
        arch = tuple([x for x in prts if x])
        total_seqs += 1
        if arch == ("NO_ASSIGNMENT", ):
            num_no_domains += 1
            num_no_annotations += 1
            continue
        if arch not in cache:
            # First occurrence of this architecture: run the test and,
            # if requested, dump its results to the architecture file.
            cache[arch] = overrep.test_group(arch)
            if self.options.arch_file:
                arch_file.write("{}\n".format(parts[3]))  # architecture
                for term, p_value in cache[arch]:
                    line = "    %.4f: %s (%s)\n" % (p_value, term.id,
                                                    term.name)
                    arch_file.write(line)
                arch_file.write("\n")
        if self.options.results_by_protein:
            print(gene_id)
            for term, p_value in cache[arch]:
                print("    %.4f: %s (%s)" % (p_value, term.id, term.name))
            print()
        if not cache[arch]:
            num_no_annotations += 1

    self.log.info("Total number of sequences processed: %d", total_seqs)
    if num_no_annotations:
        self.log.info("%d sequences have no overrepresented annots. :(",
                      num_no_annotations)
    if num_no_domains:
        self.log.info("%d sequences have no domains at all :(",
                      num_no_domains)

    if self.options.arch_file:
        arch_file.close()
        if self.options.ignore:
            # we filter the file with the significance value
            filterer = ResultFileFilter(arch_file_name)
            filterer.filter(self.options.arch_file,
                            confidence=self.options.confidence)