Exemple #1
0
    def run_real(self):
        """Run the application.

        Sets up ``self.valid_sequence_ids``, ``self.total_sequence_length``
        and ``self.sources`` from the command line options, then processes
        every input file listed in ``self.args`` (``"-"`` when no file was
        given), using a fresh output formatter for each file.
        """
        opts = self.options

        # Decide which sequence IDs are acceptable
        if not opts.sequences_file:
            # No sequences file given: use a complementer set and leave
            # the total sequence length unknown
            self.valid_sequence_ids = complementerset()
            self.total_sequence_length = None
        else:
            self.log.info("Loading sequences from %s..." %
                          opts.sequences_file)
            self.total_sequence_length = 0
            self.valid_sequence_ids = set()
            records = fasta.regexp_remapper(
                fasta.Parser(open_anything(opts.sequences_file)),
                opts.sequence_id_regexp)
            for record in records:
                self.valid_sequence_ids.add(record.id)
                self.total_sequence_length += len(record.seq)

        # Work out which sources are allowed: start from the explicitly
        # included ones (or a complementer set), then drop the exclusions
        if opts.include_sources:
            self.sources = set(opts.include_sources)
        else:
            self.sources = complementerset()
        self.sources.difference_update(opts.exclude_sources)

        if isinstance(self.sources, complementerset):
            summary = "Ignored sources: %s" % \
                ", ".join(self.sources.iterexcluded())
        else:
            summary = "Accepted sources: %s" % ", ".join(self.sources)
        self.log.info(summary)

        # Fall back to "-" when no input file was named
        if not self.args:
            self.args = ["-"]

        for infile in self.args:
            # Each input file gets its own output formatter
            if self.options.print_totals:
                formatter_class = GenomeLevelOutputFormatter
            else:
                formatter_class = SequenceLevelOutputFormatter
            self.output_formatter = formatter_class(self)
            # Process the file, then print the results
            self.process_infile(infile)
            self.output_formatter.finish()
Exemple #2
0
    def run_real(self):
        """Runs the application.

        Prepares the sequence ID filter, the total sequence length and the
        set of allowed sources from the options, then processes each input
        file in ``self.args`` (defaulting to ``"-"``) and prints its results.
        """
        # Load valid sequence IDs (if necessary)
        sequences_file = self.options.sequences_file
        if sequences_file:
            self.log.info("Loading sequences from %s..." % sequences_file)

            self.total_sequence_length = 0
            self.valid_sequence_ids = set()
            raw_records = fasta.Parser(open_anything(sequences_file))
            remapped = fasta.regexp_remapper(raw_records,
                                             self.options.sequence_id_regexp)
            for record in remapped:
                self.valid_sequence_ids.add(record.id)
                self.total_sequence_length += len(record.seq)
        else:
            self.valid_sequence_ids = complementerset()
            self.total_sequence_length = None

        # Find which sources will be allowed
        included = self.options.include_sources
        self.sources = set(included) if included else complementerset()
        self.sources.difference_update(self.options.exclude_sources)
        if isinstance(self.sources, complementerset):
            excluded_names = ", ".join(self.sources.iterexcluded())
            self.log.info("Ignored sources: %s" % excluded_names)
        else:
            accepted_names = ", ".join(self.sources)
            self.log.info("Accepted sources: %s" % accepted_names)

        if not self.args:
            self.args = ["-"]

        for infile in self.args:
            # Set up a new output formatter for this file
            formatter_class = (GenomeLevelOutputFormatter
                               if self.options.print_totals
                               else SequenceLevelOutputFormatter)
            self.output_formatter = formatter_class(self)
            # Process the file and print the results
            self.process_infile(infile)
            self.output_formatter.finish()
Exemple #3
0
    def get_stages_from_config(self):
        """Turns to the configuration file specified at startup to
        fetch the data sources to be used in each stage of the algorithm.
        If there is no configuration file specified or it does not
        contain the corresponding keys, it will simply use a default
        stage setup which ignores HMMPanther and Gene3D in the first
        and second steps, but uses all sources in the third step.

        The method will be looking for configuration keys named like
        ``stages.1``, ``stages.2`` and so on in the ``analysis:iprscan_filter``
        section of the config file. The value of each such config key must
        be an expression consisting of assignment source names and the
        operators ``+`` and ``-``, with their usual meaning of addition
        and exclusion. The special source name ``ALL`` means all possible
        data sources, enabling us to write expressions like ``ALL-HMMPanther``
        (meaning all the sources except HMMPanther). Whitespace around
        source names and operators is ignored. Some examples:

        - ``HMMPanther`` means HMMPanther only.
        - ``ALL`` means all possible data sources.
        - ``HMMPanther+HMMPfam`` means HMMPanther or HMMPfam.
        - ``ALL-HMMPanther-Gene3D`` means all possible data sources but
          HMMPanther or Gene3D.
        - ``ALL+HMMPanther`` does not really make sense as you are extending
          all data sources with HMMPanther, so it is equivalent to ``ALL``.
          GFam will figure out what you meant anyway.

        Returns a list with one set of sources (a ``set`` or a
        ``complementerset``) per stage, in stage order.
        """
        section = "analysis:iprscan_filter"

        cfg = self.parser.config
        spec = []
        if cfg is not None:
            idx = 1
            while cfg.has_option(section, "stages.%d" % idx):
                spec.append(cfg.get(section, "stages.%d" % idx))
                idx += 1

        if not spec:
            # No config file, or no ``stages.N`` keys in it: fall back to
            # the default setup promised in the docstring
            spec = ["ALL-HMMPanther-Gene3D", "ALL-HMMPanther-Gene3D",
                    "ALL"]

        # Raw string so that ``\s`` reaches the regex engine untouched
        regexp = re.compile(r"([-+])?\s*([^-+]+)")
        result = []
        for item in spec:
            sources = set()
            for match in regexp.finditer(item):
                sign, source = match.groups()
                # Strip before comparing so "ALL - X" still recognises ALL
                name = source.strip()
                if not name:
                    # Whitespace-only term (e.g. a trailing operator)
                    continue
                if name == "ALL":
                    source = complementerset()
                else:
                    source = set([name])
                # Augmented operators are kept so that mixing ``set`` and
                # ``complementerset`` keeps working as before
                if sign == "-":
                    sources -= source
                else:
                    sources |= source
            result.append(sources)

        return result
Exemple #4
0
    def get_stages_from_config(self):
        """Turns to the configuration file specified at startup to
        fetch the data sources to be used in each stage of the algorithm.
        If there is no configuration file specified or it does not
        contain the corresponding keys, it will simply use a default
        stage setup which ignores HMMPanther and Gene3D in the first
        and second steps, but uses all sources in the third step.

        The method will be looking for configuration keys named like
        ``stages.1``, ``stages.2`` and so on in the ``analysis:iprscan_filter``
        section of the config file. The value of each such config key must
        be an expression consisting of assignment source names and the
        operators ``+`` and ``-``, with their usual meaning of addition
        and exclusion. The special source name ``ALL`` means all possible
        data sources, enabling us to write expressions like ``ALL-HMMPanther``
        (meaning all the sources except HMMPanther). Whitespace around
        source names and operators is ignored. Some examples:

        - ``HMMPanther`` means HMMPanther only.
        - ``ALL`` means all possible data sources.
        - ``HMMPanther+HMMPfam`` means HMMPanther or HMMPfam.
        - ``ALL-HMMPanther-Gene3D`` means all possible data sources but
          HMMPanther or Gene3D.
        - ``ALL+HMMPanther`` does not really make sense as you are extending
          all data sources with HMMPanther, so it is equivalent to ``ALL``.
          GFam will figure out what you meant anyway.

        Returns a list with one set of sources (a ``set`` or a
        ``complementerset``) per stage, in stage order.
        """
        default_spec = ["ALL-HMMPanther-Gene3D", "ALL-HMMPanther-Gene3D", "ALL"]
        section = "analysis:iprscan_filter"

        spec = []
        cfg = self.parser.config
        if cfg is not None:
            idx = 1
            while cfg.has_option(section, "stages.%d" % idx):
                spec.append(cfg.get(section, "stages.%d" % idx))
                idx += 1

        # Use the documented defaults both when there is no config file and
        # when the config file has no ``stages.N`` keys
        if not spec:
            spec = default_spec

        # Raw string: ``\s`` is a regex escape, not a string escape
        regexp = re.compile(r"([-+])?\s*([^-+]+)")
        result = []
        for item in spec:
            sources = set()
            for match in regexp.finditer(item):
                sign, source = match.groups()
                # The captured name may carry trailing whitespace
                # ("ALL - X" yields "ALL "), so strip it before testing
                name = source.strip()
                if not name:
                    # Nothing but whitespace after an operator; ignore it
                    continue
                if name == "ALL":
                    source = complementerset()
                else:
                    source = set([name])
                # Keep the augmented operators: they are what the original
                # used to combine ``set`` and ``complementerset`` operands
                if sign == "-":
                    sources -= source
                else:
                    sources |= source
            result.append(sources)

        return result
Exemple #5
0
    def run_real(self):
        """Run the application.

        Copies the maximum allowed overlap onto ``AssignmentOverlapChecker``,
        loads the InterPro data and the valid sequence IDs, opens the
        exclusion log if requested, collects the ignored sources and finally
        processes the single input file (``"-"`` when none was given).
        """
        opts = self.options
        AssignmentOverlapChecker.max_overlap = opts.max_overlap

        # Known InterPro IDs: loaded from a file when one is given
        if not opts.interpro_file:
            self.interpro = InterPro()
        else:
            self.log.info("Loading known InterPro IDs from %s..." %
                          opts.interpro_file)
            self.interpro = InterPro.FromFile(opts.interpro_file)

        # Valid sequence IDs: one per line of the gene ID file, or a
        # complementer set when no such file was given
        if not opts.gene_id_file:
            self.valid_sequence_ids = complementerset()
        else:
            self.log.info("Loading sequence IDs from %s..." %
                          opts.gene_id_file)
            self.valid_sequence_ids = set()
            self.valid_sequence_ids.update(
                raw_id.strip() for raw_id in open_anything(opts.gene_id_file))

        # Optional log of excluded sequences, opened in append mode
        if not opts.exclusions_log_file:
            self.exclusion_log = None
        else:
            self.log.info("Logging excluded sequences to %s." %
                          opts.exclusions_log_file)
            self.exclusion_log = open(opts.exclusions_log_file, "a+")

        # Each ``ignored`` option value may hold several
        # whitespace-separated source names
        self.ignored = set()
        for entry in opts.ignored:
            self.ignored.update(entry.split())

        if not self.args:
            self.args = ["-"]
        if len(self.args) > 1:
            self.error("Only one input file may be given")

        self.process_infile(self.args[0])
Exemple #6
0
    def run_real(self):
        """Runs the application.

        Sets up the overlap checker, the InterPro data, the set of valid
        sequence IDs, the optional exclusion log and the ignored sources
        from the options, then processes exactly one input file.
        """
        AssignmentOverlapChecker.max_overlap = self.options.max_overlap

        interpro_file = self.options.interpro_file
        if interpro_file:
            self.log.info("Loading known InterPro IDs from %s..." %
                          interpro_file)
            self.interpro = InterPro.FromFile(interpro_file)
        else:
            self.interpro = InterPro()

        gene_id_file = self.options.gene_id_file
        if gene_id_file:
            self.log.info("Loading sequence IDs from %s..." %
                          gene_id_file)
            # One sequence ID per line; surrounding whitespace is dropped
            self.valid_sequence_ids = set()
            for id_line in open_anything(gene_id_file):
                self.valid_sequence_ids.add(id_line.strip())
        else:
            self.valid_sequence_ids = complementerset()

        exclusions_log_file = self.options.exclusions_log_file
        if exclusions_log_file:
            self.log.info("Logging excluded sequences to %s." %
                          exclusions_log_file)
            self.exclusion_log = open(exclusions_log_file, "a+")
        else:
            self.exclusion_log = None

        # Split every ``ignored`` value on whitespace and pool the names
        self.ignored = set()
        for ignored_spec in self.options.ignored:
            for source_name in ignored_spec.split():
                self.ignored.add(source_name)

        # Default to "-" when no input file is named; more than one is
        # reported through ``self.error``
        if not self.args:
            self.args = ["-"]
        if len(self.args) > 1:
            self.error("Only one input file may be given")

        infile = self.args[0]
        self.process_infile(infile)
Exemple #7
0
    def run_real(self):
        """Runs the application.

        Expects exactly two positional arguments: the InterPro assignment
        file and the clustering file. After processing them, prints one
        tab-separated line per sequence to the standard output, grouped by
        domain architecture with the largest groups first. The columns are
        the sequence ID, its length, the number of covered residues, the
        architecture, the size of the architecture group, the architecture
        with positions and the domain descriptions.

        When the ``stats`` option is set, summary statistics about the
        architectures, sequences and residues are written to that file.
        """
        if len(self.args) != 2:
            self.error("exactly two input files are expected")

        AssignmentOverlapChecker.max_overlap = self.options.max_overlap
        AssignmentOverlapChecker.log = self.log

        if self.options.interpro_parent_child_file:
            self.log.info("Loading InterPro parent-child"
                          " assignments from %s..." %
                          self.options.interpro_parent_child_file)
            self.interpro = InterPro.FromFile(
                self.options.interpro_parent_child_file)
        else:
            self.interpro = InterPro()

        self.interpro_names = InterProNames.FromFile(
            self.options.interpro_names_file)

        if self.options.details:
            self.details_file = open(self.options.details, "w")
        else:
            self.details_file = None

        if self.options.old_table:
            self.process_old_table(self.options.old_table)
            self.using_old_table = True
        else:
            self.using_old_table = False
            self.current_cluster_id = 1

        interpro_file, clustering_file = self.args
        self.process_interpro_file(interpro_file)
        table = self.process_clustering_file(clustering_file)
        self.sort_by_domain_architecture()

        if self.options.new_domains_table:
            self.print_new_domains_table(table)

        for seqs in self.domain_archs.values():
            seqs.sort()

        # From here on ``self.domain_archs`` is a list of
        # (architecture, members) pairs, largest group first. sorted() is
        # used because dict.items() has no sort() method on Python 3.
        self.domain_archs = sorted(self.domain_archs.items(),
                                   key=lambda item: len(item[1]),
                                   reverse=True)

        for domain_arch, members in self.domain_archs:
            if domain_arch:
                arch_str = domain_arch
            else:
                arch_str = "NO_ASSIGNMENT"
                arch_str_pos = "NO_ASSIGNMENT"
                arch_desc = "NO_DESCRIPTION"

            family_length = len(members)
            for member in members:
                seq = self.seqcat[member]
                if domain_arch:
                    arch_str_pos = seq.architecture_pos
                    arch_desc = ";".join(self.interpro_names[assignment.domain]
                                         for assignment in seq.assignments)
                print("%s\t%d\t%d\t%s\t%d\t%s\t%s" %
                      (member, seq.length, seq.num_covered(), arch_str,
                       family_length, arch_str_pos, arch_desc))

        # The details file only exists when the ``details`` option was given
        if self.details_file is not None:
            self.details_file.close()

        if not self.options.stats:
            return

        total_residues = 0.0
        covered_residues = 0
        covered_residues_nonnovel = 0
        nonnovel_sources = complementerset(["Novel"])

        for seq in self.seqcat.values():
            total_residues += seq.length
            covered_residues += round(seq.coverage() * seq.length)
            covered_residues_nonnovel += round(
                seq.coverage(sources=nonnovel_sources) * seq.length)

        all_archs = set(arch for arch, _ in self.domain_archs)
        # Count only non-empty architectures. The membership test must look
        # at the architecture keys; ``self.domain_archs`` is a list of pairs
        # by now, so testing it directly would never match.
        num_archs = sum(1 for arch in all_archs if arch)

        prefix = self.options.prefix or "NOVEL"

        def split_arch(arch):
            """Splits an architecture string at ``;``, ``{`` and ``}``
            and drops the empty pieces."""
            pieces = arch.replace("{", ";").replace("}", ";").split(";")
            return [piece for piece in pieces if piece]

        def exclude_novel_domains(domain_architecture):
            """Excludes novel domains from a domain architecture and returns
            the filtered domain architecture as a tuple."""
            return tuple(a for a in split_arch(domain_architecture)
                         if not a.startswith(prefix))

        def percentage(part, whole):
            """Returns ``part`` as a percentage of ``whole``, or zero when
            ``whole`` is zero (e.g. there are no sequences at all)."""
            if not whole:
                return 0.0
            return 100.0 * part / whole

        archs_without_novel = set(
            exclude_novel_domains(arch) for arch in all_archs)
        archs_without_novel.discard(())
        num_archs_without_novel = len(archs_without_novel)

        num_seqs_with_nonempty_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key and key != "NO_ASSIGNMENT")
        num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
            len(value) for key, value in self.domain_archs
            if exclude_novel_domains(key) in archs_without_novel
            and key != "NO_ASSIGNMENT")
        num_seqs_with_nonempty_nonnovel_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key
            and not any(a.startswith(prefix) for a in split_arch(key))
            and key != "NO_ASSIGNMENT")

        num_seqs = len(self.seqcat)

        # The ``with`` block closes the stats file even if printing fails
        with open(self.options.stats, "w") as stats_file:
            with redirected(stdout=stats_file):
                print("Domain architectures")
                print("====================")
                print()
                print("Non-empty: %d" % num_archs)
                print("Non-empty (when ignoring novel domains): %d" %
                      num_archs_without_novel)
                print()
                print("Sequences")
                print("=========")
                print()
                print("Total: %d" % num_seqs)
                print("With at least one domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch,
                       percentage(num_seqs_with_nonempty_domain_arch,
                                  num_seqs)))
                print("With at least one non-novel domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch_ignore_novel,
                       percentage(
                           num_seqs_with_nonempty_domain_arch_ignore_novel,
                           num_seqs)))
                print("With at least one domain and no novel domains: "
                      "%d (%.4f%%)" %
                      (num_seqs_with_nonempty_nonnovel_domain_arch,
                       percentage(
                           num_seqs_with_nonempty_nonnovel_domain_arch,
                           num_seqs)))
                print()
                print("Residues")
                print("========")
                print()
                print("Total: %d" % total_residues)
                print("Covered: %d (%.4f%%)" %
                      (covered_residues,
                       percentage(covered_residues, total_residues)))
                print("Covered by non-novel: %d (%.4f%%)" %
                      (covered_residues_nonnovel,
                       percentage(covered_residues_nonnovel,
                                  total_residues)))
Exemple #8
0
    def run_real(self):
        """Runs the application.

        Expects exactly two positional arguments: the InterPro assignment
        file and the clustering file. After processing them, prints one
        tab-separated line per sequence to the standard output, grouped by
        domain architecture with the largest groups first. When the
        ``stats`` option is set, summary statistics are written to that
        file.

        Every ``print`` call below takes a single argument on purpose: that
        form behaves the same as a Python 2 print statement and as the
        Python 3 print function.
        """
        if len(self.args) != 2:
            self.error("exactly two input files are expected")

        AssignmentOverlapChecker.max_overlap = self.options.max_overlap

        if self.options.interpro_parent_child_file:
            self.log.info("Loading InterPro parent-child assignments from %s..." %
                          self.options.interpro_parent_child_file)
            self.interpro = InterPro.FromFile(self.options.interpro_parent_child_file)
        else:
            self.interpro = InterPro()

        self.interpro_names = InterProNames.FromFile(self.options.interpro_names_file)

        if self.options.details:
            self.details_file = open(self.options.details, "w")
        else:
            self.details_file = None

        interpro_file, clustering_file = self.args
        self.process_interpro_file(interpro_file)
        self.process_clustering_file(clustering_file)
        self.sort_by_domain_architecture()

        for seqs in self.domain_archs.values():
            seqs.sort()

        # From here on ``self.domain_archs`` is a list of
        # (architecture, members) pairs, largest group first
        self.domain_archs = sorted(self.domain_archs.items(),
                                   key=lambda item: len(item[1]),
                                   reverse=True)

        for domain_arch, members in self.domain_archs:
            if domain_arch:
                arch_str = ";".join(domain_arch)
            else:
                arch_str = "NO_ASSIGNMENT"
                arch_str_pos = "NO_ASSIGNMENT"
                arch_desc = "NO_DESCRIPTION"

            family_length = len(members)
            for member in members:
                seq = self.seqcat[member]
                if domain_arch:
                    arch_str_pos = ";".join(assignment.short_repr()
                                            for assignment in seq.assignments)
                    arch_desc = ";".join(self.interpro_names[assignment.domain]
                                         for assignment in seq.assignments)
                print("%s\t%d\t%s\t%d\t%s\t%s" % (member, seq.length, arch_str,
                                                  family_length, arch_str_pos,
                                                  arch_desc))

        # The details file only exists when the ``details`` option was given
        if self.details_file is not None:
            self.details_file.close()

        if not self.options.stats:
            return

        total_residues, covered_residues, covered_residues_nonnovel = 0.0, 0, 0
        nonnovel_sources = complementerset(["Novel"])

        for seq in self.seqcat.values():
            total_residues += seq.length
            covered_residues += round(seq.coverage() * seq.length)
            covered_residues_nonnovel += round(
                seq.coverage(sources=nonnovel_sources) * seq.length)

        all_archs = set(arch for arch, _ in self.domain_archs)
        # Count only non-empty architectures. ``self.domain_archs`` is a
        # list of pairs by now, so a membership test on it would never
        # match an architecture key.
        num_archs = sum(1 for arch in all_archs if arch)

        def exclude_novel_domains(domain_architecture):
            """Excludes novel domains from a domain architecture and returns
            the filtered domain architecture as a tuple."""
            return tuple(a for a in domain_architecture if not a.startswith("NOVEL"))

        def percentage(part, whole):
            """Returns ``part`` as a percentage of ``whole``, or zero when
            ``whole`` is zero (e.g. there are no sequences at all)."""
            if not whole:
                return 0.0
            return 100.0 * part / whole

        archs_without_novel = set(exclude_novel_domains(arch)
                                  for arch in all_archs)
        archs_without_novel.discard(())
        num_archs_without_novel = len(archs_without_novel)

        num_seqs_with_nonempty_domain_arch = \
            sum(len(value) for key, value in self.domain_archs if key)
        num_seqs_with_nonempty_domain_arch_ignore_novel = \
            sum(len(value) for key, value in self.domain_archs
                if exclude_novel_domains(key) in archs_without_novel)
        num_seqs_with_nonempty_nonnovel_domain_arch = \
            sum(len(value) for key, value in self.domain_archs
                if key and not any(a.startswith("NOVEL") for a in key))

        num_seqs = len(self.seqcat)

        # The ``with`` block closes the stats file even if printing fails
        with open(self.options.stats, "w") as stats_file:
            with redirected(stdout=stats_file):
                print("Domain architectures")
                print("====================")
                print("")
                print("Non-empty: %d" % num_archs)
                print("Non-empty (when ignoring novel domains): %d" % num_archs_without_novel)
                print("")
                print("Sequences")
                print("=========")
                print("")
                print("Total: %d" % num_seqs)
                print("With at least one domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch,
                       percentage(num_seqs_with_nonempty_domain_arch, num_seqs)))
                print("With at least one non-novel domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch_ignore_novel,
                       percentage(num_seqs_with_nonempty_domain_arch_ignore_novel, num_seqs)))
                print("With at least one domain and no novel domains: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_nonnovel_domain_arch,
                       percentage(num_seqs_with_nonempty_nonnovel_domain_arch, num_seqs)))
                print("")
                print("Residues")
                print("========")
                print("")
                print("Total: %d" % total_residues)
                print("Covered: %d (%.4f%%)" %
                      (covered_residues, percentage(covered_residues, total_residues)))
                print("Covered by non-novel: %d (%.4f%%)" %
                      (covered_residues_nonnovel,
                       percentage(covered_residues_nonnovel, total_residues)))
    def run_real(self):
        """Runs the application.

        Expects exactly two positional arguments: the InterPro assignment
        file and the HMMer output file. After processing them, prints one
        tab-separated line per sequence to the standard output, grouped by
        domain architecture with the largest groups first. The columns are
        the sequence ID, its length, the number of covered residues, the
        architecture, the size of the architecture group, the architecture
        with positions and the domain descriptions.

        When the ``stats`` option is set, summary statistics are written to
        that file; domains listed in ``self.hmm_domains`` are treated as
        novel there.
        """
        if len(self.args) != 2:
            self.error("exactly two input files are expected")

        AssignmentOverlapChecker.max_overlap = self.options.max_overlap

        if self.options.interpro_parent_child_file:
            self.log.info("Loading InterPro parent-child"
                          " assignments from %s..." %
                          self.options.interpro_parent_child_file)
            self.interpro = InterPro.FromFile(
                self.options.interpro_parent_child_file)
        else:
            self.interpro = InterPro()

        self.interpro_names = InterProNames.FromFile(
            self.options.interpro_names_file)

        if self.options.details:
            self.details_file = open(self.options.details, "w")
        else:
            self.details_file = None

        interpro_file, hmmer_file = self.args
        self.process_interpro_file(interpro_file)
        self.process_hmmer_file(hmmer_file)
        self.sort_by_domain_architecture()

        for seqs in self.domain_archs.values():
            seqs.sort()

        # From here on ``self.domain_archs`` is a list of
        # (architecture, members) pairs, largest group first. sorted() is
        # used because dict.items() has no sort() method on Python 3.
        self.domain_archs = sorted(self.domain_archs.items(),
                                   key=lambda item: len(item[1]),
                                   reverse=True)

        for domain_arch, members in self.domain_archs:
            if domain_arch:
                arch_str = domain_arch
            else:
                arch_str = "NO_ASSIGNMENT"
                arch_str_pos = "NO_ASSIGNMENT"
                arch_desc = "NO_DESCRIPTION"

            family_length = len(members)
            for member in members:
                seq = self.seqcat[member]
                if domain_arch:
                    arch_str_pos = seq.architecture_pos
                    arch_desc = ";".join(
                        self.interpro_names[assignment.domain]
                        for assignment in seq.assignments)
                print("%s\t%d\t%d\t%s\t%d\t%s\t%s" % (member, seq.length,
                                                      seq.num_covered(),
                                                      arch_str,
                                                      family_length,
                                                      arch_str_pos,
                                                      arch_desc))

        # The details file only exists when the ``details`` option was given
        if self.details_file is not None:
            self.details_file.close()

        if not self.options.stats:
            return

        total_residues = 0.0
        covered_residues, covered_residues_nonnovel = 0, 0
        nonnovel_sources = complementerset(["Novel"])

        for seq in self.seqcat.values():
            total_residues += seq.length
            covered_residues += round(seq.coverage() * seq.length)
            covered_residues_nonnovel += round(
                seq.coverage(sources=nonnovel_sources) * seq.length)

        all_archs = set(arch for arch, _ in self.domain_archs)
        # Count only non-empty architectures. ``self.domain_archs`` is a
        # list of pairs by now, so a membership test on it would never
        # match an architecture key.
        num_archs = sum(1 for arch in all_archs if arch)

        def split_arch(arch):
            """Splits an architecture string at ``;``, ``{`` and ``}``
            and drops the empty pieces."""
            pieces = arch.replace("{", ";").replace("}", ";").split(";")
            return [piece for piece in pieces if piece]

        def exclude_novel_domains(domain_architecture):
            """Excludes novel domains from a domain architecture and returns
            the filtered domain architecture as a tuple."""
            return tuple(a for a in split_arch(domain_architecture)
                         if a not in self.hmm_domains)

        def percentage(part, whole):
            """Returns ``part`` as a percentage of ``whole``, or zero when
            ``whole`` is zero (e.g. there are no sequences at all)."""
            if not whole:
                return 0.0
            return 100.0 * part / whole

        archs_without_novel = set(exclude_novel_domains(arch)
                                  for arch in all_archs)
        archs_without_novel.discard(())
        num_archs_without_novel = len(archs_without_novel)

        num_seqs_with_nonempty_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key and key != "NO_ASSIGNMENT")
        num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
            len(value) for key, value in self.domain_archs
            if exclude_novel_domains(key) in archs_without_novel
            and key != "NO_ASSIGNMENT")
        # The architecture key is a string, so it has to be split into
        # domains first; iterating the key itself would test single
        # characters against ``self.hmm_domains``
        num_seqs_with_nonempty_nonnovel_domain_arch = sum(
            len(value) for key, value in self.domain_archs
            if key
            and not any(a in self.hmm_domains for a in split_arch(key))
            and key != "NO_ASSIGNMENT")

        num_seqs = len(self.seqcat)

        # The ``with`` block closes the stats file even if printing fails
        with open(self.options.stats, "w") as stats_file:
            with redirected(stdout=stats_file):
                print("Domain architectures")
                print("====================")
                print()
                print("Non-empty: %d" % num_archs)
                print("Non-empty (when ignoring novel domains): %d" %
                      num_archs_without_novel)
                print()
                print("Sequences")
                print("=========")
                print()
                print("Total: %d" % num_seqs)
                print("With at least one domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch,
                       percentage(num_seqs_with_nonempty_domain_arch,
                                  num_seqs)))
                print("With at least one non-novel domain: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_domain_arch_ignore_novel,
                       percentage(
                           num_seqs_with_nonempty_domain_arch_ignore_novel,
                           num_seqs)))
                # A space was missing between "novel" and "domains" here
                print("With at least one domain and no novel "
                      "domains: %d (%.4f%%)" %
                      (num_seqs_with_nonempty_nonnovel_domain_arch,
                       percentage(
                           num_seqs_with_nonempty_nonnovel_domain_arch,
                           num_seqs)))
                print()
                print("Residues")
                print("========")
                print()
                print("Total: %d" % total_residues)
                print("Covered: %d (%.4f%%)" %
                      (covered_residues,
                       percentage(covered_residues, total_residues)))
                print("Covered by non-novel: %d (%.4f%%)" %
                      (covered_residues_nonnovel,
                       percentage(covered_residues_nonnovel,
                                  total_residues)))