def split_path(cls, path):
    """
    Break a cluster file path down into its named components.

    Only the base name of `path` is parsed; it is expected to look like
    `1.10.8.10-ff-1234.reduced.sto` (with either `-` or `__` as separator).

    Returns:
        dict holding the regex groups `sfam_id`, `join_char`,
        `cluster_type`, `cluster_num`, `desc` and `suffix` (`suffix` is
        None when the filename has no extension), plus `path` set to the
        directory part of `path`.

    Raises:
        err.NoMatchesError: the base name does not match the expected layout.
    """
    filename = os.path.basename(path)

    cluster_file_pattern = (
        r'(?P<sfam_id>[0-9.]+)'      # superfamily id, e.g. 1.10.8.10
        r'(?P<join_char>-|__)'       # separator: '-' or '__'
        r'(?P<cluster_type>\w+)'     # cluster type, e.g. ff
        r'(?:-|__)'                  # separator again (not captured)
        r'(?P<cluster_num>[0-9]+)'   # cluster number, e.g. 1234
        r'(?P<desc>.*?)'             # free text, e.g. .reduced (may be empty)
        r'(?P<suffix>\.\w+)?$'       # extension, e.g. .sto (optional)
    )

    found = re.match(cluster_file_pattern, filename)
    if found is None:
        raise err.NoMatchesError(
            'failed to parse cluster details from filename "{}"'.format(
                filename))

    # groupdict() keys first, then the directory under 'path'
    return dict(found.groupdict(), path=os.path.dirname(path))
def search_by_domain_id(self, domain_id):
    """
    Return the filename of the FunFam alignment containing the domain id.

    Runs a recursive `grep -l` below `self.base_dir`, restricted (via
    `--include`) to files matching `self.ff_tmpl` with its `__NAME__`
    placeholders replaced by `*`, looking for lines that start with
    `domain_id`.

    Args:
        domain_id: domain id to search for; must pass `is_valid_domain_id`.

    Returns:
        The single matching file path, as printed by grep.

    Raises:
        err.InvalidInputError: `domain_id` is not a valid domain id.
        err.NoMatchesError: grep exited with status 1 (no matches).
        subprocess.CalledProcessError: grep exited with any other non-zero
            status (details are logged before re-raising).
        FileNotFoundError: grep could not be run for any other reason (the
            original error is chained), or it printed no file names.
        err.GeneralError: more than one file contains the domain id.
    """
    if not is_valid_domain_id(domain_id):
        raise err.InvalidInputError('{} is not a valid domain id'.format(
            repr(domain_id)))

    # replace template placeholders with '*'
    glob_path = re.sub(r'__([A-Z_]+)__', '*', self.ff_tmpl)

    grep_args = (self.grep_path, '--include', glob_path,
                 '-l', '^' + domain_id, '-R', self.base_dir)
    grep_cmd = " ".join(grep_args)
    LOG.debug("search_by_domain_id: sys: " + grep_cmd)

    try:
        # note: this returns bytes (not strings)
        raw_out = subprocess.check_output(grep_args)
    except subprocess.CalledProcessError as e:
        if e.returncode == 1:
            # grep telling us it didn't find any matches
            raise err.NoMatchesError(
                'failed to find domain id {} with cmd {}'.format(
                    domain_id, str(e.cmd))) from e
        LOG.error(
            'CMD: {}\nCODE: {}\nOUTPUT: {}\nSTDERR: "{}"\nSTDOUT: "{}"\n'
            .format(e.cmd, e.returncode, e.output, e.stderr, e.stdout))
        raise
    except Exception as e:
        # Only ordinary errors (e.g. the grep executable is missing) are
        # translated; KeyboardInterrupt / SystemExit propagate untouched.
        raise FileNotFoundError(
            "Encountered error trying to find domain_id '{}' (grep: `{}`)".
            format(domain_id, grep_cmd)) from e

    # grep prints file names: decode them the way the OS encodes paths so
    # that non-ASCII directory or file names do not break the lookup.
    ff_files = os.fsdecode(raw_out).splitlines()

    if not ff_files:
        raise FileNotFoundError(
            "Failed to find FunFam alignment for domain_id '{}' (grep: `{}`)"
            .format(domain_id, grep_cmd))

    if len(ff_files) > 1:
        raise err.GeneralError(
            "Found more than one FunFam file ({}) containing the domain id '{}' (grep: `{}`):\n{}\n"
            .format(
                len(ff_files),
                domain_id,
                grep_cmd,
                "\n".join(ff_files),
            ))

    LOG.debug("search_by_domain_id: found funfam alignment {}".format(
        repr(ff_files[0])))

    return ff_files[0]
def funfam_id_from_file(self, ff_file):
    """
    Build a FunfamID from a FunFam file name, using `self.ff_tmpl`.

    The `__SFAM__` and `__FF_NUM__` placeholders in the template are
    swapped for capturing groups and the result is matched against the
    start of the base name of `ff_file`.

    NOTE(review): the rest of the template is used as a regular
    expression as-is (it is not escaped), and the match is not anchored
    at the end of the filename.

    Returns:
        FunfamID built from the captured superfamily id and the captured
        cluster number (converted to int).

    Raises:
        err.NoMatchesError: the filename does not match the template.
    """
    name = os.path.basename(ff_file)

    # turn the filename template into a regex with named groups
    pattern = self.ff_tmpl.replace(
        '__SFAM__', r'(?P<sfam_id>[0-9\.]+)'
    ).replace(
        '__FF_NUM__', r'(?P<ff_num>[0-9]+)'
    )

    found = re.match(pattern, name)
    if found is None:
        raise err.NoMatchesError(
            "failed to match template '{}' against filename '{}'".format(
                pattern, name))

    return FunfamID(
        sfam_id=found.group('sfam_id'),
        cluster_num=int(found.group('ff_num')),
    )