Example #1
0
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures."""
        self.log.info("Running overrepresentation analysis")
        self.log.info("p-value = %.4f, correction method = %s",
                      self.options.confidence, self.options.correction)

        if self.options.arch_file:
            arch_file_name = self.options.arch_file
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            arch_file = open(arch_file_name, "w")

        confidence = self.options.confidence
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignoring the significance threshold;"
                          " results will be filtered later.")

        overrep = OverrepresentationAnalyser(
            self.go_tree,
            self.go_mapping,
            confidence=confidence,
            min_count=self.options.min_size,
            correction=self.options.correction)
        cache = {}

        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id = parts[0]
            prts = parts[3].replace("{", ";").replace("}", ";").split(";")
            arch = tuple([x for x in prts if x])
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)
                if self.options.arch_file:
                    arch_file.write("{}\n".format(parts[3]))  # architecture
                    for term, p_value in cache[arch]:
                        arch_file.write("  %.4f: %s (%s)\n"
                                        % (p_value, term.id, term.name))
                    arch_file.write("\n")
            if self.options.results_by_protein:
                print(gene_id)
                for term, p_value in cache[arch]:
                    print("  %.4f: %s (%s)" % (p_value, term.id, term.name))
                print()

            if not cache[arch]:
                num_no_annotations += 1

        self.log.info("Total number of sequences processed: %d", total_seqs)
        if num_no_annotations:
            self.log.info("%d sequences have no overrepresented annotations",
                          num_no_annotations)
        if num_no_domains:
            self.log.info("%d sequences have no domain assignments at all",
                          num_no_domains)

        if self.options.arch_file:
            arch_file.close()
            if self.options.ignore:
                # filter the unfiltered results with the real threshold
                filterer = ResultFileFilter(arch_file_name)
                filterer.filter(self.options.arch_file,
                                confidence=self.options.confidence)
Example #2
0
    def _filter_arch_file(self, unfiltered, filtered):
        if self.options.ignore and self.options.results_by_arch:
            # filter the unfiltered results with the real threshold
            filterer = ResultFileFilter(unfiltered)
            filterer.filter(filtered, confidence=self.options.max_pvalue)
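
For context, a minimal standalone illustration of the call pattern this helper
expects: results by architecture are first written to "<path>_unfiltered" (as
in Example #1) and only later rewritten under their final name with the real
p-value cut-off. The option values, the file name and the stand-in class below
are made up; the stand-in merely mimics ResultFileFilter's calling convention.

from argparse import Namespace


class StandInFilter:
    """Placeholder with ResultFileFilter's calling convention (illustration)."""

    def __init__(self, unfiltered_name):
        self.unfiltered_name = unfiltered_name

    def filter(self, filtered_name, confidence=0.05):
        print("would filter %s -> %s at p <= %g"
              % (self.unfiltered_name, filtered_name, confidence))


# Hypothetical option values; in the real tool these come from the CLI parser.
options = Namespace(ignore=True, results_by_arch=True, max_pvalue=0.05)
arch_path = "overrep_by_arch.txt"  # hypothetical output path

if options.ignore and options.results_by_arch:
    StandInFilter(arch_path + "_unfiltered").filter(
        arch_path, confidence=options.max_pvalue)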
Example #3
0
    def do_run(self):
        """Runs the whole ConSAT pipeline"""
        # Get the output folder name
        outfolder = self.config.get("DEFAULT", "folder.output")

        # Run and export the inferred domain architectures
        outfile = os.path.join(outfolder, "domain_architectures.tab")
        self.modula.run("find_domain_arch_with_hmms", force=self.options.force)
        if not os.path.exists(outfile):
            shutil.copy(
                self.modula.STORAGE_ENGINE.get_filename(
                    "find_domain_arch_with_hmms"), outfile)
            self.log.info("Exported domain architectures to %s.", outfile)

        # Run and export the label assignment
        outfile = os.path.join(outfolder, "assigned_labels.txt")
        self.modula.run("label_assignment", force=self.options.force)
        if not os.path.exists(outfile):
            shutil.copy(self.modula.STORAGE_ENGINE.get_filename(
                "label_assignment"), outfile)
            self.log.info("Exported label assignment to %s.", outfile)

        # Run and export the overrepresentation analysis
        there_is_combination = self.config.get("DEFAULT",
                                               "file.function.goa_file")
        overrep_detail = self.config.get("analysis:overrep", "per_protein")
        transfer_detail = self.config.get("analysis:overrep", "per_protein")
        outfile = os.path.join(outfolder, "overrepresentation_analysis.txt")

        if not there_is_combination:
            self.modula.run("overrep", force=self.options.force)
            if not os.path.exists(outfile) and overrep_detail:
                shutil.copy(self.modula.STORAGE_ENGINE.get_filename("overrep"),
                            outfile)
                self.log.info("Exported overrepresentation analysis to %s.",
                              outfile)
        else:
            self.modula.run("overrep", force=self.options.force,
                            extra_args=["-i"])
            if not os.path.exists(outfile) and overrep_detail:
                filterer = ResultFileFilter(
                    self.modula.STORAGE_ENGINE.get_filename("overrep"))
                conf = float(self.config.get("analysis:overrep", "confidence"))
                self.log.info("Obtaining filtered file from %s",
                              self.modula.STORAGE_ENGINE.get_filename(
                                  "overrep"))
                filterer.filter(outfile, confidence=conf)
                self.log.info("Exported overrepresentation analysis to %s.",
                              outfile)

        # Run the functional prediction, if we have to
        if self.config.get("DEFAULT", "file.function.goa_file"):
            if not there_is_combination:
                self.modula.run("function_arch", force=self.options.force)
                outfile = os.path.join(outfolder,
                                       "predicted_function_by_transfer.txt")
                if not os.path.exists(outfile) and transfer_detail:
                    shutil.copy(
                        self.modula.STORAGE_ENGINE.get_filename(
                            "function_arch"), outfile)
                    self.log.info("Exported predicted function "
                                  "by transfer to %s.", outfile)
            else:
                self.modula.run("function_arch",
                                force=self.options.force,
                                extra_args=["-i"])
                outfile = os.path.join(outfolder,
                                       "predicted_function_by_transfer.txt")
                if not os.path.exists(outfile) and transfer_detail:
                    filterer = ResultFileFilter(
                        self.modula.STORAGE_ENGINE.get_filename(
                            "function_arch"))
                    conf = float(self.config.get("analysis:function_arch",
                                                 "max_pvalue"))
                    filterer.filter(outfile, confidence=conf)
                    self.log.info("Exported predicted function by"
                                  " transfer to %s.", outfile)
        else:
            self.log.info("No GOA source file was found and therefore")
            self.log.info("no funtion transfer will be performed")

        if there_is_combination:
            # we combine the overrep and function_arch results
            infile1 = self.modula.STORAGE_ENGINE.get_filename("overrep")
            infile2 = self.modula.STORAGE_ENGINE.get_filename("function_arch")
            outfile = os.path.join(outfolder, "combined_prediction.txt")
            # confidence is 0.05 (default value)
            # TODO: add this as a parameter in the configuration file
            if not os.path.exists(outfile) and overrep_detail\
               and transfer_detail:
                combiner = ResultFileCombiner(infile1, infile2)
                combiner.combine(outfile)
            # if there are files by arch, we combine them
            if self.config.get("generated", "file.overrep.arch_file") and\
               self.config.get("generated",
                               "file.function_arch.general_arch_file"):
                infile_arch1 = self.config.get(
                    "generated", "file.overrep.arch_file") + "_unfiltered"
                infile_arch2 = self.config.get(
                    "generated", "file.function_arch.general_arch_file") +\
                    "_unfiltered"
                outfile_arch = os.path.join(outfolder,
                                            "combined_prediction_by_arch.txt")
                if not os.path.exists(outfile_arch):
                    combiner_arch = ResultFileCombiner(infile_arch1,
                                                       infile_arch2)
                    combiner_arch.combine(outfile_arch)
        else:
            # the combination is a copy of the overrep file
            infile = os.path.join(outfolder, "overrepresentation_analysis.txt")
            outfile = os.path.join(outfolder, "combined_prediction.txt")
            if overrep_detail:
                shutil.copy(infile, outfile)
            # same for the overrep by arch, if it exists
            infile_arch = self.config.get("generated",
                                          "file.overrep.arch_file")
            outfile_arch = os.path.join(outfolder,
                                        "combined_prediction_by_arch.txt")
            if infile_arch and os.path.exists(infile_arch):
                shutil.copy(infile_arch, outfile_arch)

        # Run the words prediction, if we have to
        if self.config.get("DEFAULT", "file.idmapping") and\
           self.config.get("DEFAULT", "file.rdffile"):
            self.modula.run("get_text", force=self.options.force)
            self.log.info("Text weighting done!")
            outfile = os.path.join(outfolder, "weight_file_per_arch")
            if not os.path.exists(outfile):
                shutil.copy(self.modula.STORAGE_ENGINE.get_filename(
                    "get_text"), outfile)
                self.log.info("Exported weight vectors per architecture to %s",
                              outfile)
        else:
            self.log.info("Either not idmapping or RDF file were specified")
            self.log.info("no text weighting will be performed")