Beispiel #1
0
    def _transfer_from_same_file(self, goa, arch_file):
        """Transfer function between proteins of one architecture file.

        Every architecture yielded by ``ArchReader(arch_file, cov)`` is
        tested with an ``OverrepresentationAnalyser``. Depending on
        ``self.options`` the terms are written to the per-architecture
        results file and/or printed per protein. For a protein that is
        itself annotated, the test is repeated on the remaining proteins
        of its architecture (leave-one-out).

        :param goa: annotation mapping handed to the analyser; the keys of
            ``goa.left`` are treated as the annotated proteins.
        :param arch_file: architecture file handed to ``ArchReader``.
        """
        confidence = self.options.max_pvalue
        if self.options.ignore:
            # keep every term now; the significance filter runs at the end
            confidence = float("inf")
            self.log.info("Ignored the significance value."
                          " We will filter results later.")
        ora = OverrepresentationAnalyser(self.go_tree,
                                         goa,
                                         confidence=confidence,
                                         min_count=1,
                                         correction=self.options.correction)
        cov = self.options.minimum_coverage / 100.0
        self.log.info("Transferring function from same file. Min coverage=%s",
                      str(cov))
        all_annotated = frozenset(goa.left.keys())

        # ``out`` stays None when no per-architecture file was requested
        out = None
        arch_file_name = None
        if self.options.results_by_arch:
            arch_file_name = self.options.results_by_arch
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            out = open(arch_file_name, "w")

        carriage_return = os.linesep
        twocr = os.linesep * 2
        try:
            self.log.info("Arch file: %s", arch_file)
            # for each architecture and its associated proteins...
            for arch, prots in ArchReader(arch_file, cov):
                if not prots or arch == "NO_ASSIGNMENT":
                    # nothing to transfer: emit the bare names only
                    if self.options.results_by_protein:
                        print("{}{}".format(twocr.join(prots),
                                            carriage_return))
                    if out is not None:
                        out.write("{}{}".format(arch, twocr))
                    continue
                targets = set(prots)
                annotated_prots = targets & all_annotated
                lines = carriage_return.join([
                    "  %.4f: %s (%s)" % (p_value, term.id, term.name)
                    for term, p_value in ora.test_group(targets)
                ])
                if out is not None:
                    out.write("{}{}{}{}".format(arch, carriage_return, lines,
                                                twocr))
                if self.options.results_by_protein:
                    # combinations(prots, n - 1) omits the last protein
                    # first, then the one before it, and so on; pairing it
                    # with reversed(prots) matches every subset with the
                    # protein it leaves out.
                    for rest, prot in i_zip(combinations(prots,
                                                         len(prots) - 1),
                                            reversed(prots)):
                        print(prot)
                        if prot in annotated_prots:
                            for term, p_value in ora.test_group(rest):
                                print("  %.4f: %s (%s)" %
                                      (p_value, term.id, term.name))
                        else:
                            print(lines)
                        print()
        finally:
            # release the handle even when reading or testing fails
            if out is not None:
                out.close()
        if out is not None:
            self._filter_arch_file(arch_file_name,
                                   self.options.results_by_arch)
Beispiel #2
0
    def _transfer_from_same_file(self, goa, arch_file):
        """Transfer function using only the given architecture file.

        Proteins sharing an architecture in ``arch_file`` are tested
        together with an ``OverrepresentationAnalyser``. The outcome is
        written to the file named by ``self.options.results_by_arch``
        and/or printed per protein when ``self.options.results_by_protein``
        is set.

        :param goa: annotation mapping passed to the analyser;
            ``goa.left.keys()`` is used as the set of annotated proteins.
        :param arch_file: architecture file read through ``ArchReader``.
        """
        def describe(results):
            # one formatted text line per (term, p-value) pair
            return ["  %.4f: %s (%s)" % (p_value, term.id, term.name)
                    for term, p_value in results]

        confidence = self.options.max_pvalue
        if self.options.ignore:
            # accept everything now, the real threshold is applied later
            confidence = float("inf")
            self.log.info("Ignored the significance value."
                          " We will filter results later.")
        ora = OverrepresentationAnalyser(self.go_tree, goa,
                                         confidence=confidence,
                                         min_count=1,
                                         correction=self.options.correction)
        cov = self.options.minimum_coverage / 100.0
        self.log.info("Transferring function from same file. Min coverage=%s",
                      str(cov))
        all_annotated = frozenset(goa.left.keys())

        by_arch = bool(self.options.results_by_arch)
        out = None
        if by_arch:
            arch_file_name = self.options.results_by_arch
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            out = open(arch_file_name, "w")

        carriage_return = os.linesep
        twocr = os.linesep * 2
        try:
            self.log.info("Arch file: %s", arch_file)
            # for each architecture and its associated proteins...
            for arch, prots in ArchReader(arch_file, cov):
                if not prots or arch == "NO_ASSIGNMENT":
                    # no terms can be transferred for this architecture
                    if self.options.results_by_protein:
                        print("{}{}".format(twocr.join(prots),
                                            carriage_return))
                    if by_arch:
                        out.write("{}{}".format(arch, twocr))
                    continue
                targets = set(prots)
                annotated_prots = targets & all_annotated
                lines = carriage_return.join(
                    describe(ora.test_group(targets)))
                if by_arch:
                    out.write("{}{}{}{}".format(arch, carriage_return,
                                                lines, twocr))
                if not self.options.results_by_protein:
                    continue
                # The i-th subset from combinations(prots, n - 1) lacks the
                # i-th protein counted from the end, hence reversed(prots).
                leave_one_out = i_zip(combinations(prots, len(prots) - 1),
                                      reversed(prots))
                for rest, prot in leave_one_out:
                    print(prot)
                    if prot in annotated_prots:
                        # do not let a protein vote for its own annotation
                        for text in describe(ora.test_group(rest)):
                            print(text)
                    else:
                        print(lines)
                    print()
        finally:
            # the handle used to leak whenever the loop raised
            if out is not None:
                out.close()
        if by_arch:
            self._filter_arch_file(arch_file_name,
                                   self.options.results_by_arch)
Beispiel #3
0
    def _transfer_from_both(self, goa, arch_target, arch_source):
        """Transfer from both an external file and the same file using a
        single GOA file.

        Proteins of ``arch_source`` and ``arch_target`` that share an
        architecture are pooled and tested together; results are reported
        for the architectures and proteins of ``arch_target``.

        :param goa: annotation mapping passed to the analyser;
            ``goa.left.keys()`` is used as the set of annotated proteins.
        :param arch_target: architecture file whose proteins are reported.
        :param arch_source: additional architecture file whose proteins are
            pooled with the target ones.
        """
        confidence = self.options.max_pvalue
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignored the significance value. We will filter results later.")
        ora = OverrepresentationAnalyser(self.go_tree, goa,
                                         confidence=confidence,
                                         min_count=1, correction='None')
        cov = self.options.minimum_coverage / 100.0
        self.log.info("Transferring function from both files. Min coverage=%s",
                      cov)
        all_annotated = frozenset(goa.left.keys())  # all annotated proteins
        prots_per_arch = dict()

        # ``out`` stays None when no per-architecture file was requested
        out = None
        arch_file_name = None
        if self.options.results_by_arch:
            arch_file_name = self.options.results_by_arch
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            out = open(arch_file_name, "w")

        try:
            self.log.info("\t Source architecture: %s", arch_source)
            self.log.info("\t Target (and source, as well) architecture: %s",
                          arch_target)

            for arch, prots in ArchReader(arch_source, cov):
                prots_per_arch[arch] = prots
            for arch, prots in ArchReader(arch_target, cov):
                other_prots = set()
                if arch in prots_per_arch:
                    other_prots = prots_per_arch[arch]
                targets = set(other_prots) | set(prots)
                annotated_prots = targets & all_annotated
                if not annotated_prots or arch == "NO_ASSIGNMENT":
                    # if there is no annotation for proteins in the arch...
                    if self.options.results_by_protein:
                        # Single-argument print() behaves the same on
                        # Python 2 and 3; the "%s %s" keeps the space the
                        # old two-item print statement inserted.
                        print("%s %s" % ((os.linesep * 2).join(prots),
                                         os.linesep))
                    if out is not None:
                        out.write(arch + "\n\n")
                    continue
                lines = os.linesep.join([
                    "  %.4f: %s (%s)" % (p_value, term.id, term.name)
                    for term, p_value in ora.test_group(targets)])
                if out is not None:
                    out.write(arch + "\n")
                    out.write(lines)
                    out.write("\n")
                if self.options.results_by_protein:
                    for prot in prots:
                        print(prot)
                        if prot in annotated_prots:
                            # leave the protein itself out of its own test
                            for term, p_value in ora.test_group(
                                    targets - set([prot])):
                                print("  %.4f: %s (%s)" %
                                      (p_value, term.id, term.name))
                        else:
                            print(lines)
                        # print("") is a blank line on Python 2 and 3 alike
                        print("")
        finally:
            # close the results file even if reading or testing fails
            if out is not None:
                out.close()
        if out is not None:
            self._filter_arch_file(arch_file_name, self.options.results_by_arch)
Beispiel #4
0
    def _transfer_from_other_file(self, goa, arch_target, arch_source):
        """Transfer function to ``arch_target`` from a separate source file.

        Terms are computed once for every architecture of ``arch_source``
        (except ``NO_ASSIGNMENT``) and then reported per architecture
        and/or for each protein of ``arch_target``.
        """
        confidence = self.options.max_pvalue
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignored the significance value."
                          " We will filter results later.")
        ora = OverrepresentationAnalyser(
            self.go_tree, goa,
            confidence=confidence, min_count=1, correction='None')
        cov = self.options.minimum_coverage / 100.0

        # architecture -> terms found for the source proteins
        terms_by_arch = {
            arch: ora.test_group(prots)
            for arch, prots in ArchReader(arch_source, cov)
            if arch != "NO_ASSIGNMENT"
        }

        if self.options.results_by_arch:
            arch_file_name = self.options.results_by_arch
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            with open(arch_file_name, "w") as out:
                for arch in sorted(terms_by_arch):
                    out.write(arch + "\n")
                    for term, p_value in terms_by_arch[arch]:
                        out.write("  %.4f: %s (%s)" %
                                  (p_value, term.id, term.name))
                        out.write("\n")
                    # blank line ends the record of this architecture
                    out.write("\n")

        if self.options.results_by_protein:
            for arch, prots in ArchReader(arch_target, cov):
                # unknown architectures simply have no terms to list
                found = terms_by_arch.get(arch, ())
                for prot in prots:
                    print(prot)
                    for term, p_value in found:
                        print("  %.4f: %s (%s)" %
                              (p_value, term.id, term.name))
                    print()

        if self.options.results_by_arch:
            self._filter_arch_file(arch_file_name,
                                   self.options.results_by_arch)
Beispiel #5
0
    def _transfer_from_other_file(self, goa, arch_target, arch_source):
        """Report the terms of ``arch_source`` architectures for the
        matching proteins of ``arch_target``.

        The per-architecture file and the per-protein listing are each
        produced only when the corresponding option is set.
        """
        term_format = "  %.4f: %s (%s)"

        confidence = self.options.max_pvalue
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignored the significance value."
                          " We will filter results later.")
        analyser_args = dict(confidence=confidence, min_count=1,
                             correction='None')
        ora = OverrepresentationAnalyser(self.go_tree, goa, **analyser_args)
        cov = self.options.minimum_coverage / 100.0

        goterms = {}
        for arch, prots in ArchReader(arch_source, cov):
            if arch == "NO_ASSIGNMENT":
                # proteins without architecture contribute nothing
                continue
            goterms[arch] = ora.test_group(prots)

        if self.options.results_by_arch:
            arch_file_name = self.options.results_by_arch
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            with open(arch_file_name, "w") as out:
                for arch in sorted(goterms):
                    out.write(arch + "\n")
                    for term, p_value in goterms[arch]:
                        entry = term_format % (p_value, term.id, term.name)
                        out.write(entry)
                        out.write("\n")
                    out.write("\n")

        if self.options.results_by_protein:
            for arch, prots in ArchReader(arch_target, cov):
                known = arch in goterms
                for prot in prots:
                    print(prot)
                    if known:
                        for term, p_value in goterms[arch]:
                            print(term_format %
                                  (p_value, term.id, term.name))
                    print()

        if self.options.results_by_arch:
            # apply the significance filter to the file written above
            self._filter_arch_file(arch_file_name,
                                   self.options.results_by_arch)
Beispiel #6
0
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures.

        Every line is split on tabs: column 0 is the gene identifier and
        column 2 the ``;``-separated architecture. The architecture is
        tested once (results are cached) and its terms are printed below
        the gene identifier. Counters are logged at the end.

        :param input_file: handed to ``open_anything`` and iterated line
            by line.
        """
        self.log.info("Running overrepresentation analysis")
        # lazy %-arguments: the logger formats only if the record is emitted
        self.log.info("p-value = %.4f, correction method = %s",
                      self.options.confidence, self.options.correction)

        overrep = OverrepresentationAnalyser(
            self.go_tree, self.go_mapping,
            confidence=self.options.confidence,
            min_count=self.options.min_size,
            correction=self.options.correction)
        cache = {}

        num_no_annotations = 0
        num_no_domains = 0
        total_seqs = 0

        # NOTE(review): whatever open_anything returns is never closed
        # here; confirm whether it needs explicit cleanup.
        for line in open_anything(input_file):
            parts = line.strip().split("\t")
            gene_id, arch = parts[0], tuple(parts[2].split(";"))
            total_seqs += 1

            if arch == ("NO_ASSIGNMENT", ):
                num_no_domains += 1
                num_no_annotations += 1
                continue

            if arch not in cache:
                cache[arch] = overrep.test_group(arch)
            terms = cache[arch]

            # single-argument print() behaves the same on Python 2 and 3
            print(gene_id)
            for term, p_value in terms:
                print("  %.4f: %s (%s)" % (p_value, term.id, term.name))
            print("")

            if not terms:
                num_no_annotations += 1

        self.log.info("Total number of sequences processed: %d", total_seqs)
        if num_no_annotations:
            self.log.info("%d sequences have no overrepresented annotations :(",
                          num_no_annotations)
        if num_no_domains:
            self.log.info("%d sequences have no domains at all :(",
                          num_no_domains)
Beispiel #7
0
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures.

        Every line is split on tabs: column 0 is the gene identifier and
        column 3 the architecture, whose domains are separated by ``;``,
        ``{`` or ``}``. Each distinct architecture is tested once; its
        terms are written to ``self.options.arch_file`` (if set) and
        printed per gene when ``self.options.results_by_protein`` is set.

        With ``self.options.ignore`` the analysis keeps every term, the
        output goes to ``<arch_file>_unfiltered`` and a
        ``ResultFileFilter`` produces the final file afterwards.

        :param input_file: handed to ``open_anything`` and iterated line
            by line.
        """
        self.log.info("Running overrepresentation analysis")
        self.log.info("p-value = %.4f, correction method = %s",
                      self.options.confidence, self.options.correction)

        # ``arch_file`` stays None when no architecture file was requested
        arch_file = None
        arch_file_name = None
        if self.options.arch_file:
            arch_file_name = self.options.arch_file
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            arch_file = open(arch_file_name, "w")

        try:
            confidence = self.options.confidence
            if self.options.ignore:
                confidence = float("inf")
                # only announce the deferred filtering when it applies
                self.log.info("Ignored the significance value."
                              " We will filter results later.")

            overrep = OverrepresentationAnalyser(
                self.go_tree,
                self.go_mapping,
                confidence=confidence,
                min_count=self.options.min_size,
                correction=self.options.correction)
            cache = {}

            num_no_annotations = 0
            num_no_domains = 0
            total_seqs = 0

            for line in open_anything(input_file):
                parts = line.strip().split("\t")
                gene_id = parts[0]
                prts = parts[3].replace("{", ";").replace("}", ";").split(";")
                # drop the empty pieces left by adjacent separators
                arch = tuple([x for x in prts if x])
                total_seqs += 1

                if arch == ("NO_ASSIGNMENT", ):
                    num_no_domains += 1
                    num_no_annotations += 1
                    continue

                if arch not in cache:
                    cache[arch] = overrep.test_group(arch)
                    if arch_file is not None:
                        # each architecture is written once, on first sight
                        arch_file.write("{}\n".format(parts[3]))
                        for term, p_value in cache[arch]:
                            entry = "  %.4f: %s (%s)\n" % (p_value, term.id,
                                                           term.name)
                            arch_file.write(entry)
                        arch_file.write("\n")
                if self.options.results_by_protein:
                    print(gene_id)
                    for term, p_value in cache[arch]:
                        print("  %.4f: %s (%s)" %
                              (p_value, term.id, term.name))
                    print()

                if not cache[arch]:
                    num_no_annotations += 1

            self.log.info("Total number of sequences processed: %d",
                          total_seqs)
            if num_no_annotations:
                self.log.info(
                    "%d sequences have no overrepresented annots. :(",
                    num_no_annotations)
            if num_no_domains:
                self.log.info("%d sequences have no domains at all :(",
                              num_no_domains)
        finally:
            # close the output even if parsing or testing fails
            if arch_file is not None:
                arch_file.close()

        if arch_file is not None and self.options.ignore:
            # we filter the file with the significance value
            filterer = ResultFileFilter(arch_file_name)
            filterer.filter(self.options.arch_file,
                            confidence=self.options.confidence)
Beispiel #8
0
    def _transfer_from_both(self, goa, arch_target, arch_source):
        """Transfer from both an external file and the same file using a
        single GOA file.

        Proteins of ``arch_source`` and ``arch_target`` with the same
        architecture are pooled and tested together; results are reported
        for the architectures and proteins of ``arch_target``.

        :param goa: annotation mapping passed to the analyser;
            ``goa.left.keys()`` is used as the set of annotated proteins.
        :param arch_target: architecture file whose proteins are reported.
        :param arch_source: additional architecture file whose proteins are
            pooled with the target ones.
        """
        confidence = self.options.max_pvalue
        if self.options.ignore:
            confidence = float("inf")
            self.log.info("Ignored the significance value. "
                          "We will filter results later.")
        ora = OverrepresentationAnalyser(self.go_tree,
                                         goa,
                                         confidence=confidence,
                                         min_count=1,
                                         correction='None')
        cov = self.options.minimum_coverage / 100.0
        self.log.info(
            "Transferring function from both files."
            " Min coverage=%s", str(cov))
        all_annotated = frozenset(goa.left.keys())  # all annotated proteins
        prots_per_arch = dict()

        # ``out`` stays None when no per-architecture file was requested
        out = None
        arch_file_name = None
        if self.options.results_by_arch:
            arch_file_name = self.options.results_by_arch
            if self.options.ignore:
                arch_file_name += "_unfiltered"
            out = open(arch_file_name, "w")

        try:
            self.log.info("\t Source architecture: %s", arch_source)
            self.log.info("\t Target (and source, as well) architecture: %s",
                          arch_target)

            for arch, prots in ArchReader(arch_source, cov):
                prots_per_arch[arch] = prots
            for arch, prots in ArchReader(arch_target, cov):
                other_prots = set()
                if arch in prots_per_arch:
                    other_prots = prots_per_arch[arch]
                targets = set(other_prots) | set(prots)
                annotated_prots = targets & all_annotated
                if not annotated_prots or arch == "NO_ASSIGNMENT":
                    # if there is no annotation for proteins in the arch...
                    if self.options.results_by_protein:
                        print((os.linesep * 2).join(prots))
                        print()
                    if out is not None:
                        out.write(arch + "\n\n")
                    continue
                lines = os.linesep.join([
                    "  %.4f: %s (%s)" % (p_value, term.id, term.name)
                    for term, p_value in ora.test_group(targets)
                ])
                if out is not None:
                    # NOTE(review): no blank line follows the terms here,
                    # unlike the empty-architecture branch above; confirm
                    # that the result filter accepts this layout.
                    out.write(arch + "\n")
                    out.write(lines)
                    out.write("\n")
                if self.options.results_by_protein:
                    for prot in prots:
                        print(prot)
                        if prot in annotated_prots:
                            # leave the protein itself out of its own test
                            grp = targets - set([prot])
                            for term, p_value in ora.test_group(grp):
                                print("  %.4f: %s (%s)" %
                                      (p_value, term.id, term.name))
                        else:
                            print(lines)
                        print()
        finally:
            # close the results file even if reading or testing fails
            if out is not None:
                out.close()
        if out is not None:
            self._filter_arch_file(arch_file_name,
                                   self.options.results_by_arch)
Beispiel #9
0
    def process_file(self, input_file):
        """Processes the given input file that contains the domain
        architectures.

        Lines are tab-separated; column 0 holds the gene identifier and
        column 3 the architecture, split into domains on ``;``, ``{`` and
        ``}``. Results of each distinct architecture are cached, written
        to the architecture file when ``self.options.arch_file`` is set,
        and printed per gene when ``self.options.results_by_protein`` is
        set.

        When ``self.options.ignore`` is set, all terms are kept in
        ``<arch_file>_unfiltered`` and ``ResultFileFilter`` applies the
        configured confidence afterwards.

        :param input_file: handed to ``open_anything`` and iterated line
            by line.
        """
        options = self.options
        self.log.info("Running overrepresentation analysis")
        self.log.info("p-value = %.4f, correction method = %s",
                      options.confidence, options.correction)

        write_archs = bool(options.arch_file)
        arch_file = None
        if write_archs:
            arch_file_name = options.arch_file
            if options.ignore:
                arch_file_name += "_unfiltered"
            arch_file = open(arch_file_name, "w")

        try:
            confidence = options.confidence
            if options.ignore:
                confidence = float("inf")
                # this message used to be logged even without --ignore
                self.log.info("Ignored the significance value."
                              " We will filter results later.")

            overrep = OverrepresentationAnalyser(
                self.go_tree,
                self.go_mapping,
                confidence=confidence,
                min_count=options.min_size,
                correction=options.correction)
            cache = {}

            num_no_annotations = 0
            num_no_domains = 0
            total_seqs = 0

            for line in open_anything(input_file):
                parts = line.strip().split("\t")
                gene_id = parts[0]
                raw_arch = parts[3]
                pieces = raw_arch.replace("{", ";").replace("}", ";")
                # empty pieces come from adjacent separators; skip them
                arch = tuple(x for x in pieces.split(";") if x)
                total_seqs += 1

                if arch == ("NO_ASSIGNMENT", ):
                    num_no_domains += 1
                    num_no_annotations += 1
                    continue

                if arch not in cache:
                    cache[arch] = overrep.test_group(arch)
                    if write_archs:
                        # first occurrence: record the architecture once
                        arch_file.write("{}\n".format(raw_arch))
                        for term, p_value in cache[arch]:
                            arch_file.write("  %.4f: %s (%s)\n" %
                                            (p_value, term.id, term.name))
                        arch_file.write("\n")
                terms = cache[arch]
                if options.results_by_protein:
                    print(gene_id)
                    for term, p_value in terms:
                        print("  %.4f: %s (%s)" %
                              (p_value, term.id, term.name))
                    print()

                if not terms:
                    num_no_annotations += 1

            self.log.info("Total number of sequences processed: %d",
                          total_seqs)
            if num_no_annotations:
                self.log.info(
                    "%d sequences have no overrepresented annots. :(",
                    num_no_annotations)
            if num_no_domains:
                self.log.info("%d sequences have no domains at all :(",
                              num_no_domains)
        finally:
            # never leave the architecture file open on failure
            if arch_file is not None:
                arch_file.close()

        if write_archs and options.ignore:
            # we filter the file with the significance value
            filterer = ResultFileFilter(arch_file_name)
            filterer.filter(options.arch_file,
                            confidence=options.confidence)