Ejemplo n.º 1
0
    def run(self):
        """Run the command line application wrapped by this module.

        Looks up the single ``CommandLineApp`` subclass defined in
        ``self.module``, builds its argument list from the global
        configuration file, ``self.extra_args`` and the ``switch.*`` and
        ``infile`` parameters, then runs it with standard input and output
        redirected to the streams provided by ``modula.STORAGE_ENGINE``.

        Raises:
            ValueError: if ``self.module`` does not define exactly one
                ``CommandLineApp`` subclass.
            RuntimeError: if the application returns a non-zero code.
        """
        self.logger.info("Starting module %s", self.name)

        self.prepare()

        # Search for the CommandLineApp subclasses defined in the module
        candidates = [
            value for value in self.module.__dict__.values()
            if isinstance(value, type) and value != CommandLineApp
            and issubclass(value, CommandLineApp)
        ]

        # Report the two failure modes separately so the message is accurate
        if not candidates:
            raise ValueError("no CommandLineApp in %s" % self.name)
        if len(candidates) > 1:
            raise ValueError("more than one CommandLineApp in %s" % self.name)

        # Create the application
        app = candidates[0](logger=self.logger)
        args = ["-c", self.config.get("@global.config_file")]

        # Add some extra args, if any. A missing attribute raises
        # AttributeError (never NameError), so probe it with hasattr().
        if not hasattr(self, "extra_args"):
            self.extra_args = []
        if self.extra_args is not None:
            args.extend(self.extra_args)

        # Every "switch.*" parameter holds "<switch> <storage key>"
        for param, value in self.parameters.items():
            if not param.startswith("switch."):
                continue
            switch, value = value.split(" ", 1)
            value = modula.STORAGE_ENGINE.get_filename(value.strip())
            args.extend([switch, value])

        # "infile" is a comma-separated list of storage keys
        if "infile" in self.parameters:
            for infile in self.parameters["infile"].split(","):
                infile = modula.STORAGE_ENGINE.get_filename(infile.strip())
                args.append(infile)

        if "stdin" in self.parameters:
            stdin = modula.STORAGE_ENGINE.get_source(self.parameters["stdin"])
        else:
            stdin = None

        out_fname = modula.STORAGE_ENGINE.get_filename(self.name)
        stdout = modula.STORAGE_ENGINE.get_result_stream(self, mode="wb")
        try:
            try:
                with redirected(stdin=stdin, stdout=stdout):
                    retcode = app.run(args)
            finally:
                # Close the result stream exactly once, whatever happens
                stdout.close()
            if retcode:
                raise RuntimeError("non-zero return code from child module")
        except Exception:
            # If an error happens, remove the output file and re-raise
            # the exception
            os.unlink(out_fname)
            raise

        self.logger.info("Finished module %s", self.name)
Ejemplo n.º 2
0
 def print_new_domains_table(self, table):
     """Print the new domains table to ``self.options.new_domains_table``.

     One line is written per key of ``table``, in sorted key order: the key
     followed by the tab-separated items of ``table[key]``.
     """
     self.log.info("Printing the new domains table")
     # The context manager closes the file even if printing fails
     with open(self.options.new_domains_table, "w") as table_file:
         with redirected(stdout=table_file):
             for cluster_name in sorted(table.keys()):
                 print(cluster_name + "\t" + "\t".join(table[cluster_name]))
Ejemplo n.º 3
0
 def print_new_domains_table(self, table):
     """Print the new domains table to ``self.options.new_domains_table``.

     One line is written per key of ``table``, in sorted key order: the key
     followed by the tab-separated items of ``table[key]``.
     """
     self.log.info("Printing the new domains table")

     # The context manager closes the file even if printing fails
     with open(self.options.new_domains_table, "w") as table_file:
         with redirected(stdout=table_file):
             for cluster_name in sorted(table.keys()):
                 # A single parenthesised argument prints identically on
                 # Python 2 and Python 3
                 print(cluster_name + "\t" + "\t".join(table[cluster_name]))
Ejemplo n.º 4
0
    def run_real(self):
        """Run the application.

        Expects exactly two positional arguments (the InterPro file and the
        clustering file). After processing them, one tab-separated line per
        sequence is printed to standard output, grouped by domain
        architecture with the largest groups first. Depending on the
        options, the new domains table and a statistics report are written
        as well.
        """
        if len(self.args) != 2:
            self.error("exactly two input files are expected")

        AssignmentOverlapChecker.max_overlap = self.options.max_overlap
        AssignmentOverlapChecker.log = self.log

        if self.options.interpro_parent_child_file:
            self.log.info("Loading InterPro parent-child"
                          " assignments from %s..." %
                          self.options.interpro_parent_child_file)
            self.interpro = InterPro.FromFile(
                self.options.interpro_parent_child_file)
        else:
            self.interpro = InterPro()

        self.interpro_names = InterProNames.FromFile(
            self.options.interpro_names_file)

        if self.options.details:
            self.details_file = open(self.options.details, "w")
        else:
            self.details_file = None

        if self.options.old_table:
            self.process_old_table(self.options.old_table)
            self.using_old_table = True
        else:
            self.using_old_table = False
            self.current_cluster_id = 1

        interpro_file, clustering_file = self.args
        self.process_interpro_file(interpro_file)
        table = self.process_clustering_file(clustering_file)
        self.sort_by_domain_architecture()

        if self.options.new_domains_table:
            self.print_new_domains_table(table)

        for seqs in self.domain_archs.values():
            seqs.sort()

        # dict.items() is a view without .sort() on Python 3, so build the
        # sorted list of (architecture, members) pairs explicitly; the
        # largest families come first.
        self.domain_archs = sorted(self.domain_archs.items(),
                                   key=lambda x: len(x[1]), reverse=True)

        for domain_arch, members in self.domain_archs:
            if domain_arch:
                arch_str = domain_arch
            else:
                arch_str = "NO_ASSIGNMENT"
                arch_str_pos = "NO_ASSIGNMENT"
                arch_desc = "NO_DESCRIPTION"

            family_length = len(members)
            for member in members:
                seq = self.seqcat[member]
                if domain_arch:
                    arch_str_pos = seq.architecture_pos
                    arch_desc = ";".join(self.interpro_names[assignment.domain]
                                         for assignment in seq.assignments)
                print("%s\t%d\t%d\t%s\t%d\t%s\t%s" %
                      (member, seq.length, seq.num_covered(), arch_str,
                       family_length, arch_str_pos, arch_desc))

        # The details file only exists when the option was given
        if self.details_file is not None:
            self.details_file.close()

        if self.options.stats:
            total_residues = 0.0
            covered_residues = 0
            covered_residues_nonnovel = 0
            nonnovel_sources = complementerset(["Novel"])

            for seq in self.seqcat.values():
                total_residues += seq.length
                covered_residues += round(seq.coverage() * seq.length)
                covered_residues_nonnovel += round(
                    seq.coverage(sources=nonnovel_sources) * seq.length)

            all_archs = set(arch for arch, _ in self.domain_archs)
            # Count the non-empty architectures only. self.domain_archs is
            # a list of pairs at this point, so testing it for "" would
            # never match; filter the architecture keys instead.
            num_archs = sum(1 for arch in all_archs if arch)

            if self.options.prefix:
                prefix = self.options.prefix
            else:
                prefix = "NOVEL"

            def split_arch(arch):
                """Split an architecture string on ";", "{" and "}",
                dropping the empty pieces."""
                return [
                    x for x in arch.replace("{", ";").replace("}", ";").split(
                        ";") if x
                ]

            def exclude_novel_domains(domain_architecture):
                """Excludes novel domains from a domain architecture and returns
                the filtered domain architecture as a tuple."""
                return tuple(a for a in split_arch(domain_architecture)
                             if not a.startswith(prefix))

            def percent(part, whole):
                """Return part as a percentage of whole, or 0.0 when whole
                is zero (no sequences or no residues)."""
                return 100.0 * part / whole if whole else 0.0

            archs_without_novel = set(
                exclude_novel_domains(arch) for arch in all_archs)
            archs_without_novel.discard(())
            num_archs_without_novel = len(archs_without_novel)

            num_seqs_with_nonempty_domain_arch = sum(
                len(value) for key, value in self.domain_archs
                if key and key != "NO_ASSIGNMENT")
            num_seqs_with_nonempty_domain_arch_ignore_novel = \
                sum(len(value) for key, value in self.domain_archs
                    if exclude_novel_domains(key) in archs_without_novel
                    and key != "NO_ASSIGNMENT")
            num_seqs_with_nonempty_nonnovel_domain_arch = \
                sum(len(value) for key, value in self.domain_archs
                    if key and not any(a.startswith(prefix) for a in
                                       split_arch(key)) and
                    key != "NO_ASSIGNMENT")

            num_seqs = len(self.seqcat)

            # The context manager closes the report even if printing fails
            with open(self.options.stats, "w") as stats_file:
                with redirected(stdout=stats_file):
                    print("Domain architectures")
                    print("====================")
                    print()
                    print("Non-empty: %d" % num_archs)
                    print("Non-empty (when ignoring novel domains): %d" %
                          num_archs_without_novel)
                    print()
                    print("Sequences")
                    print("=========")
                    print()
                    print("Total: %d" % num_seqs)
                    print("With at least one domain: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_domain_arch,
                           percent(num_seqs_with_nonempty_domain_arch,
                                   num_seqs)))
                    print("With at least one non-novel domain: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_domain_arch_ignore_novel,
                           percent(
                               num_seqs_with_nonempty_domain_arch_ignore_novel,
                               num_seqs)))
                    print("With at least one domain and no novel domains: "
                          "%d (%.4f%%)" %
                          (num_seqs_with_nonempty_nonnovel_domain_arch,
                           percent(num_seqs_with_nonempty_nonnovel_domain_arch,
                                   num_seqs)))
                    print()
                    print("Residues")
                    print("========")
                    print()
                    print("Total: %d" % total_residues)
                    print("Covered: %d (%.4f%%)" %
                          (covered_residues,
                           percent(covered_residues, total_residues)))
                    print("Covered by non-novel: %d (%.4f%%)" %
                          (covered_residues_nonnovel,
                           percent(covered_residues_nonnovel,
                                   total_residues)))
Ejemplo n.º 5
0
    def run_real(self):
        """Run the application.

        Expects exactly two positional arguments (the InterPro file and the
        clustering file). After processing them, one tab-separated line per
        sequence is printed to standard output, grouped by domain
        architecture with the largest groups first. A statistics report is
        written as well when the ``stats`` option is set.

        The body only uses syntax that behaves identically on Python 2 and
        Python 3 (single-argument ``print(...)``, ``sorted()``, ``values()``).
        """
        if len(self.args) != 2:
            self.error("exactly two input files are expected")

        AssignmentOverlapChecker.max_overlap = self.options.max_overlap

        if self.options.interpro_parent_child_file:
            self.log.info("Loading InterPro parent-child assignments from %s..." %
                          self.options.interpro_parent_child_file)
            self.interpro = InterPro.FromFile(self.options.interpro_parent_child_file)
        else:
            self.interpro = InterPro()

        self.interpro_names = InterProNames.FromFile(self.options.interpro_names_file)

        if self.options.details:
            self.details_file = open(self.options.details, "w")
        else:
            self.details_file = None

        interpro_file, clustering_file = self.args
        self.process_interpro_file(interpro_file)
        self.process_clustering_file(clustering_file)
        self.sort_by_domain_architecture()

        for seqs in self.domain_archs.values():
            seqs.sort()

        # Turn the mapping into a list of (architecture, members) pairs,
        # largest families first.
        self.domain_archs = sorted(self.domain_archs.items(),
                                   key=lambda x: len(x[1]), reverse=True)

        for domain_arch, members in self.domain_archs:
            if domain_arch:
                arch_str = ";".join(domain_arch)
            else:
                arch_str = "NO_ASSIGNMENT"
                arch_str_pos = "NO_ASSIGNMENT"
                arch_desc = "NO_DESCRIPTION"

            family_length = len(members)
            for member in members:
                seq = self.seqcat[member]
                if domain_arch:
                    arch_str_pos = ";".join(assignment.short_repr()
                                            for assignment in seq.assignments)
                    arch_desc = ";".join(
                        self.interpro_names[assignment.domain]
                        for assignment in seq.assignments)
                print("%s\t%d\t%s\t%d\t%s\t%s" %
                      (member, seq.length, arch_str, family_length,
                       arch_str_pos, arch_desc))

        # The details file only exists when the option was given
        if self.details_file is not None:
            self.details_file.close()

        if self.options.stats:
            total_residues, covered_residues, covered_residues_nonnovel = 0.0, 0, 0
            nonnovel_sources = complementerset(["Novel"])

            for seq in self.seqcat.values():
                total_residues += seq.length
                covered_residues += round(seq.coverage() * seq.length)
                covered_residues_nonnovel += round(
                    seq.coverage(sources=nonnovel_sources) * seq.length)

            all_archs = set(arch for arch, _ in self.domain_archs)
            # Count the non-empty architectures only. self.domain_archs is
            # a list of pairs at this point, so testing it for "" would
            # never match; filter the architecture keys instead.
            num_archs = sum(1 for arch in all_archs if arch)

            def exclude_novel_domains(domain_architecture):
                """Excludes novel domains from a domain architecture and returns
                the filtered domain architecture as a tuple."""
                return tuple(a for a in domain_architecture
                             if not a.startswith("NOVEL"))

            def percent(part, whole):
                """Return part as a percentage of whole, or 0.0 when whole
                is zero (no sequences or no residues)."""
                return 100.0 * part / whole if whole else 0.0

            archs_without_novel = set(exclude_novel_domains(arch)
                                      for arch in all_archs)
            archs_without_novel.discard(())
            num_archs_without_novel = len(archs_without_novel)

            num_seqs_with_nonempty_domain_arch = \
                sum(len(value) for key, value in self.domain_archs if key)
            num_seqs_with_nonempty_domain_arch_ignore_novel = \
                sum(len(value) for key, value in self.domain_archs
                    if exclude_novel_domains(key) in archs_without_novel)
            num_seqs_with_nonempty_nonnovel_domain_arch = \
                sum(len(value) for key, value in self.domain_archs
                    if key and not any(a.startswith("NOVEL") for a in key))

            num_seqs = len(self.seqcat)

            # The context manager closes the report even if printing fails
            with open(self.options.stats, "w") as stats_file:
                with redirected(stdout=stats_file):
                    print("Domain architectures")
                    print("====================")
                    print("")
                    print("Non-empty: %d" % num_archs)
                    print("Non-empty (when ignoring novel domains): %d" %
                          num_archs_without_novel)
                    print("")
                    print("Sequences")
                    print("=========")
                    print("")
                    print("Total: %d" % num_seqs)
                    print("With at least one domain: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_domain_arch,
                           percent(num_seqs_with_nonempty_domain_arch,
                                   num_seqs)))
                    print("With at least one non-novel domain: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_domain_arch_ignore_novel,
                           percent(
                               num_seqs_with_nonempty_domain_arch_ignore_novel,
                               num_seqs)))
                    print("With at least one domain and no novel domains: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_nonnovel_domain_arch,
                           percent(num_seqs_with_nonempty_nonnovel_domain_arch,
                                   num_seqs)))
                    print("")
                    print("Residues")
                    print("========")
                    print("")
                    print("Total: %d" % total_residues)
                    print("Covered: %d (%.4f%%)" %
                          (covered_residues,
                           percent(covered_residues, total_residues)))
                    print("Covered by non-novel: %d (%.4f%%)" %
                          (covered_residues_nonnovel,
                           percent(covered_residues_nonnovel,
                                   total_residues)))
Ejemplo n.º 6
0
    def run(self):
        """Run the command line application wrapped by this module.

        Looks up the single ``CommandLineApp`` subclass defined in
        ``self.module``, builds its argument list from the global
        configuration file, ``self.extra_args`` and the ``switch.*`` and
        ``infile`` parameters, then runs it with standard input and output
        redirected to the streams provided by ``modula.STORAGE_ENGINE``.

        Raises:
            ValueError: if ``self.module`` does not define exactly one
                ``CommandLineApp`` subclass.
            RuntimeError: if the application returns a non-zero code.
        """
        self.logger.info("Starting module %s", self.name)

        self.prepare()

        # Search for the CommandLineApp subclasses defined in the module
        candidates = [
            value for value in self.module.__dict__.values()
            if isinstance(value, type) and value != CommandLineApp
            and issubclass(value, CommandLineApp)
        ]

        # Report the two failure modes separately so the message is accurate
        if not candidates:
            raise ValueError("no CommandLineApp in %s" % self.name)
        if len(candidates) > 1:
            raise ValueError("more than one CommandLineApp in %s" % self.name)

        # Create the application
        app = candidates[0](logger=self.logger)
        args = ["-c", self.config.get("@global.config_file")]

        # Add some extra args, if any. A missing attribute raises
        # AttributeError (never NameError), so probe it with hasattr().
        if not hasattr(self, "extra_args"):
            self.extra_args = []
        if self.extra_args is not None:
            args.extend(self.extra_args)

        # Every "switch.*" parameter holds "<switch> <storage key>"
        for param, value in self.parameters.items():
            if not param.startswith("switch."):
                continue
            switch, value = value.split(" ", 1)
            value = modula.STORAGE_ENGINE.get_filename(value.strip())
            args.extend([switch, value])

        # "infile" is a comma-separated list of storage keys
        if "infile" in self.parameters:
            for infile in self.parameters["infile"].split(","):
                infile = modula.STORAGE_ENGINE.get_filename(infile.strip())
                args.append(infile)

        if "stdin" in self.parameters:
            stdin = modula.STORAGE_ENGINE.get_source(self.parameters["stdin"])
        else:
            stdin = None

        out_fname = modula.STORAGE_ENGINE.get_filename(self.name)
        stdout = modula.STORAGE_ENGINE.get_result_stream(self, mode="wb")
        try:
            try:
                with redirected(stdin=stdin, stdout=stdout):
                    retcode = app.run(args)
            finally:
                # Close the result stream exactly once, whatever happens
                stdout.close()
            if retcode:
                raise RuntimeError("non-zero return code from child module")
        except RuntimeError:
            # If a runtime error happens, remove the output file and
            # re-raise the exception.
            # NOTE(review): other exception types leave the output file in
            # place - confirm that this is intended.
            os.unlink(out_fname)
            raise

        self.logger.info("Finished module %s", self.name)
    def run_real(self):
        """Run the application.

        Expects exactly two positional arguments (the InterPro file and the
        HMMer file). After processing them, one tab-separated line per
        sequence is printed to standard output, grouped by domain
        architecture with the largest groups first. A statistics report is
        written as well when the ``stats`` option is set.
        """
        if len(self.args) != 2:
            self.error("exactly two input files are expected")

        AssignmentOverlapChecker.max_overlap = self.options.max_overlap

        if self.options.interpro_parent_child_file:
            self.log.info("Loading InterPro parent-child"
                          " assignments from %s..." %
                          self.options.interpro_parent_child_file)
            self.interpro = InterPro.FromFile(
                self.options.interpro_parent_child_file)
        else:
            self.interpro = InterPro()

        self.interpro_names = InterProNames.FromFile(
            self.options.interpro_names_file)

        if self.options.details:
            self.details_file = open(self.options.details, "w")
        else:
            self.details_file = None

        interpro_file, hmmer_file = self.args
        self.process_interpro_file(interpro_file)
        self.process_hmmer_file(hmmer_file)
        self.sort_by_domain_architecture()

        for seqs in self.domain_archs.values():
            seqs.sort()

        # dict.items() is a view without .sort() on Python 3, so build the
        # sorted list of (architecture, members) pairs explicitly; the
        # largest families come first.
        self.domain_archs = sorted(self.domain_archs.items(),
                                   key=lambda x: len(x[1]), reverse=True)

        for domain_arch, members in self.domain_archs:
            if domain_arch:
                # The architecture is used as the output string directly
                arch_str = domain_arch
            else:
                arch_str = "NO_ASSIGNMENT"
                arch_str_pos = "NO_ASSIGNMENT"
                arch_desc = "NO_DESCRIPTION"

            family_length = len(members)
            for member in members:
                seq = self.seqcat[member]
                if domain_arch:
                    arch_str_pos = seq.architecture_pos
                    arch_desc = ";".join(
                        self.interpro_names[assignment.domain]
                        for assignment in seq.assignments)
                print("%s\t%d\t%d\t%s\t%d\t%s\t%s" %
                      (member, seq.length, seq.num_covered(), arch_str,
                       family_length, arch_str_pos, arch_desc))

        # The details file only exists when the option was given
        if self.details_file is not None:
            self.details_file.close()

        if self.options.stats:
            total_residues = 0.0
            covered_residues, covered_residues_nonnovel = 0, 0
            nonnovel_sources = complementerset(["Novel"])

            for seq in self.seqcat.values():
                total_residues += seq.length
                covered_residues += round(seq.coverage() * seq.length)
                covered_residues_nonnovel += round(
                    seq.coverage(sources=nonnovel_sources) * seq.length)

            all_archs = set(arch for arch, _ in self.domain_archs)
            # Count the non-empty architectures only. self.domain_archs is
            # a list of pairs at this point, so testing it for "" would
            # never match; filter the architecture keys instead.
            num_archs = sum(1 for arch in all_archs if arch)

            def split_arch(arch):
                """Split an architecture string on ";", "{" and "}",
                dropping the empty pieces."""
                pieces = arch.replace("{", ";").replace("}", ";").split(";")
                return [x for x in pieces if x]

            def exclude_novel_domains(domain_architecture):
                """Excludes novel domains from a domain architecture and returns
                the filtered domain architecture as a tuple."""
                return tuple(a for a in split_arch(domain_architecture)
                             if a not in self.hmm_domains)

            def percent(part, whole):
                """Return part as a percentage of whole, or 0.0 when whole
                is zero (no sequences or no residues)."""
                return 100.0 * part / whole if whole else 0.0

            archs_without_novel = set(exclude_novel_domains(arch)
                                      for arch in all_archs)
            archs_without_novel.discard(())
            num_archs_without_novel = len(archs_without_novel)
            num_seqs_with_nonempty_domain_arch = sum(
                len(value) for key, value in self.domain_archs if key
                and key != "NO_ASSIGNMENT")
            num_seqs_with_nonempty_domain_arch_ignore_novel = sum(
                len(value) for key, value in self.domain_archs
                if exclude_novel_domains(key) in archs_without_novel
                and key != "NO_ASSIGNMENT")
            # The architecture is a string, so it must be split into domain
            # names first; iterating it directly would test single
            # characters against self.hmm_domains.
            num_seqs_with_nonempty_nonnovel_domain_arch = sum(
                len(value) for key, value in self.domain_archs
                if key
                and not any(a in self.hmm_domains for a in split_arch(key))
                and key != "NO_ASSIGNMENT")

            num_seqs = len(self.seqcat)

            # The context manager closes the report even if printing fails
            with open(self.options.stats, "w") as stats_file:
                with redirected(stdout=stats_file):
                    print("Domain architectures")
                    print("====================")
                    print()
                    print("Non-empty: %d" % num_archs)
                    print("Non-empty (when ignoring novel domains): %d" %
                          num_archs_without_novel)
                    print()
                    print("Sequences")
                    print("=========")
                    print()
                    print("Total: %d" % num_seqs)
                    print("With at least one domain: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_domain_arch,
                           percent(num_seqs_with_nonempty_domain_arch,
                                   num_seqs)))
                    print("With at least one non-novel domain: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_domain_arch_ignore_novel,
                           percent(
                               num_seqs_with_nonempty_domain_arch_ignore_novel,
                               num_seqs)))
                    # The trailing space after "novel" keeps the two
                    # adjacent literals from fusing into "noveldomains"
                    print("With at least one domain and no novel "
                          "domains: %d (%.4f%%)" %
                          (num_seqs_with_nonempty_nonnovel_domain_arch,
                           percent(num_seqs_with_nonempty_nonnovel_domain_arch,
                                   num_seqs)))
                    print()
                    print("Residues")
                    print("========")
                    print()
                    print("Total: %d" % total_residues)
                    print("Covered: %d (%.4f%%)" %
                          (covered_residues,
                           percent(covered_residues, total_residues)))
                    print("Covered by non-novel: %d (%.4f%%)" %
                          (covered_residues_nonnovel,
                           percent(covered_residues_nonnovel,
                                   total_residues)))