Exemple #1
0
    def split_path(cls, path):
        """
        Parse a cluster filename into its component parts.

        Only the basename of `path` is matched. It is expected to look like
        ``<sfam_id><join><cluster_type><join><cluster_num>[<desc>][<suffix>]``
        where each join is ``-`` or ``__``, for example
        ``1.10.8.10-ff-1234.reduced.sto``.

        Returns:
            dict with the named regex groups (``sfam_id``, ``join_char``,
            ``cluster_type``, ``cluster_num``, ``desc``, ``suffix``) plus
            ``path`` (the directory part of `path`). Captured values are
            strings; ``suffix`` is None when the filename has no extension.

        Raises:
            err.NoMatchesError: the basename does not match the pattern.
        """
        dirname, basename = os.path.split(path)

        pattern = (
            r'(?P<sfam_id>[0-9.]+)'      # superfamily id, e.g. 1.10.8.10
            r'(?P<join_char>-|__)'       # first separator (captured)
            r'(?P<cluster_type>\w+)'     # cluster type, e.g. ff
            r'(?:-|__)'                  # second separator (not captured)
            r'(?P<cluster_num>[0-9]+)'   # cluster number, e.g. 1234
            r'(?P<desc>.*?)'             # optional description, e.g. .reduced
            r'(?P<suffix>\.\w+)?$'       # optional extension, e.g. .sto
        )

        matched = re.match(pattern, basename)
        if matched is None:
            raise err.NoMatchesError(
                'failed to parse cluster details from filename "{}"'.format(
                    basename))

        # Named groups first, then the directory the file lives in.
        return dict(matched.groupdict(), path=dirname)
Exemple #2
0
    def search_by_domain_id(self, domain_id):
        """Return the filename of the FunFam alignment containing the domain id.

        Runs ``grep -l -R`` under ``self.base_dir``, restricted to files
        matching ``self.ff_tmpl`` (with its ``__PLACEHOLDER__`` tokens turned
        into ``*``), looking for lines that start with `domain_id`.

        Args:
            domain_id: domain identifier; must pass ``is_valid_domain_id``.

        Returns:
            Path (as printed by grep) of the single matching alignment file.

        Raises:
            err.InvalidInputError: `domain_id` is not a valid domain id.
            err.NoMatchesError: grep exited with status 1 (no matches).
            subprocess.CalledProcessError: grep exited with any other
                non-zero status (logged, then re-raised).
            FileNotFoundError: grep could not be run or its output could not
                be decoded, or it returned no filenames.
            err.GeneralError: more than one file contains the domain id.
        """
        if not is_valid_domain_id(domain_id):
            raise err.InvalidInputError('{} is not a valid domain id'.format(
                repr(domain_id)))

        # replace template placeholders with '*'
        glob_path = re.sub(r'__([A-Z_]+)__', '*', self.ff_tmpl)
        grep_args = (self.grep_path, '--include', glob_path, '-l',
                     '^' + domain_id, '-R', self.base_dir)
        grep_cmd = " ".join(grep_args)
        LOG.debug("search_by_domain_id: sys: %s", grep_cmd)

        try:
            # note: check_output returns bytes (not strings)
            grep_out = subprocess.check_output(grep_args).decode('ascii')
        except subprocess.CalledProcessError as e:
            if e.returncode == 1:
                # grep telling us it didn't find any matches
                raise err.NoMatchesError(
                    'failed to find domain id {} with cmd {}'.format(
                        domain_id, str(e.cmd))) from e
            LOG.error(
                'CMD: %s\nCODE: %s\nOUTPUT: %s\nSTDERR: "%s"\nSTDOUT: "%s"\n',
                e.cmd, e.returncode, e.output, e.stderr, e.stdout)
            raise
        except Exception as e:
            # Deliberately `Exception`, not a bare `except`: KeyboardInterrupt
            # and SystemExit must propagate instead of being reported as a
            # missing file. The original cause is kept via chaining.
            raise FileNotFoundError(
                "Encountered error trying to find domain_id '{}' (grep: `{}`)".
                format(domain_id, grep_cmd)) from e

        ff_files = grep_out.splitlines()

        if not ff_files:
            raise FileNotFoundError(
                "Failed to find FunFam alignment for domain_id '{}' (grep: `{}`)"
                .format(domain_id, grep_cmd))
        if len(ff_files) > 1:
            raise err.GeneralError(
                "Found more than one FunFam file ({}) containing the domain id '{}' (grep: `{}`):\n{}\n"
                .format(
                    len(ff_files),
                    domain_id,
                    grep_cmd,
                    "\n".join(ff_files),
                ))

        LOG.debug("search_by_domain_id: found funfam alignment %s",
                  repr(ff_files[0]))

        return ff_files[0]
Exemple #3
0
    def funfam_id_from_file(self, ff_file):
        """Extract a FunfamID from a file name, based on ``self.ff_tmpl``.

        The template is treated as literal text containing the placeholders
        ``__SFAM__`` (superfamily id) and ``__FF_NUM__`` (funfam number).
        Literal parts are regex-escaped so that characters such as ``.`` in
        an extension only match themselves. The match is anchored at the
        start of the basename of `ff_file` but not at the end.

        Args:
            ff_file: path or file name of a FunFam alignment.

        Returns:
            FunfamID built from the matched superfamily id and the funfam
            number (converted to int).

        Raises:
            err.NoMatchesError: the file name does not match the template.
        """
        placeholder_patterns = {
            '__SFAM__': r'(?P<sfam_id>[0-9\.]+)',
            '__FF_NUM__': r'(?P<ff_num>[0-9]+)',
        }

        # Split with a capturing group so the placeholders are kept as
        # separate pieces; everything else is literal template text.
        pieces = re.split(r'(__SFAM__|__FF_NUM__)', self.ff_tmpl)
        ff_re = ''.join(
            placeholder_patterns[piece]
            if piece in placeholder_patterns else re.escape(piece)
            for piece in pieces)

        filename = os.path.basename(ff_file)
        m = re.match(ff_re, filename)
        if not m:
            raise err.NoMatchesError(
                "failed to match template '{}' against filename '{}'".format(
                    ff_re, filename))

        return FunfamID(sfam_id=m.group('sfam_id'),
                        cluster_num=int(m.group('ff_num')))