Exemple #1
0
    def __init__(self, base_dir, *, ff_tmpl='__SFAM__-ff-__FF_NUM__.sto'):
        """Store the base directory and the alignment filename template.

        Args:
            base_dir: stored unchanged as ``self.base_dir``.
            ff_tmpl: filename template; may only contain ASCII letters and
                the characters ``_``, ``.``, ``-`` and ``*``.

        Raises:
            err.InvalidInputError: if ``ff_tmpl`` contains any other character.
        """
        self.base_dir = base_dir

        # look for the first character outside the permitted set
        illegal_char = re.search(r'[^a-zA-Z_.\-*]', ff_tmpl)
        if illegal_char is not None:
            raise err.InvalidInputError(
                "ff_tmpl contains unexpected characters")

        self.ff_tmpl = ff_tmpl
Exemple #2
0
    def search_by_domain_id(self, domain_id):
        """Return the filename of the FunFam alignment containing the domain id.

        Runs ``grep -l -R`` over ``self.base_dir``, restricted (``--include``)
        to files matching ``self.ff_tmpl`` with every ``__PLACEHOLDER__``
        replaced by ``*``, looking for a line that starts with ``domain_id``.

        Args:
            domain_id: domain id to search for; must pass
                ``is_valid_domain_id``.

        Returns:
            The single matching file path, as printed by grep.

        Raises:
            err.InvalidInputError: ``domain_id`` is not a valid domain id.
            err.NoMatchesError: grep exited with status 1 (no matches).
            subprocess.CalledProcessError: grep exited with any other
                non-zero status (logged, then re-raised unchanged).
            FileNotFoundError: grep could not be run, its output could not
                be decoded, or it listed no files.
            err.GeneralError: more than one file contains the domain id.
        """
        if not is_valid_domain_id(domain_id):
            raise err.InvalidInputError('{} is not a valid domain id'.format(
                repr(domain_id)))

        # replace template placeholders with '*'
        glob_path = re.sub(r'__([A-Z_]+)__', '*', self.ff_tmpl)
        grep_args = (self.grep_path, '--include', glob_path, '-l',
                     '^' + domain_id, '-R', self.base_dir)
        # built once: reused by the debug log and every error message below
        grep_cmd = " ".join(grep_args)
        LOG.debug("search_by_domain_id: sys: %s", grep_cmd)

        try:
            # note: check_output returns bytes (not strings)
            grep_out = subprocess.check_output(grep_args).decode('ascii')
        except subprocess.CalledProcessError as e:
            if e.returncode == 1:
                # grep telling us it didn't find any matches
                raise err.NoMatchesError(
                    'failed to find domain id {} with cmd {}'.format(
                        domain_id, str(e.cmd))) from e
            LOG.error(
                'CMD: %s\nCODE: %s\nOUTPUT: %s\nSTDERR: "%s"\nSTDOUT: "%s"\n',
                e.cmd, e.returncode, e.output, e.stderr, e.stdout)
            raise
        except Exception as e:
            # Deliberately not a bare `except:` so KeyboardInterrupt and
            # SystemExit propagate; the original cause is kept via chaining.
            raise FileNotFoundError(
                "Encountered error trying to find domain_id '{}' (grep: `{}`)".
                format(domain_id, grep_cmd)) from e

        ff_files = grep_out.splitlines()

        if not ff_files:
            raise FileNotFoundError(
                "Failed to find FunFam alignment for domain_id '{}' (grep: `{}`)"
                .format(domain_id, grep_cmd))
        if len(ff_files) > 1:
            raise err.GeneralError(
                "Found more than one FunFam file ({}) containing the domain id '{}' (grep: `{}`):\n{}\n"
                .format(
                    len(ff_files),
                    domain_id,
                    grep_cmd,
                    "\n".join(ff_files),
                ))

        LOG.debug("search_by_domain_id: found funfam alignment %r",
                  ff_files[0])

        return ff_files[0]
Exemple #3
0
    def get_by_id(cls, aa_str):
        """Look up an AminoAcid object by its 1- or 3-character code.

        The argument is converted with ``str()`` and upper-cased before the
        lookup: 1-character codes are looked up in ``cls._aa_by_one`` and
        3-character codes in ``cls._aa_by_three``.

        Raises:
            err.InvalidInputError: the code is neither 1 nor 3 characters long.
            KeyError: the code has a valid length but is not in the table.
        """
        code = str(aa_str)
        n_chars = len(code)

        if n_chars == 1:
            return cls._aa_by_one[code.upper()]

        if n_chars == 3:
            return cls._aa_by_three[code.upper()]

        raise err.InvalidInputError(
            "expected either 1- or 3-character amino acid id (not: '{}')".
            format(code))
Exemple #4
0
    def run_alignment(self,
                      alignment,
                      *,
                      column_gap=None,
                      group_gap=None,
                      mclachlan=False):
        """Runs `groupsim` against a given alignment.

        A copy of the alignment (``alignment.copy()``) is written to a
        temporary FASTA file with the cluster id appended to every sequence
        id, `groupsim` is run on that file as a subprocess, and its standard
        output is parsed into a `GroupsimResult`. The temporary file is
        removed before returning.

        Args:
            alignment: alignment whose sequences all have a cluster id set.
            column_gap: value passed to groupsim's ``-c`` option; falls back
                to ``self.column_gap`` when not given. Must be strictly
                between 0 and 1.
            group_gap: value passed to groupsim's ``-g`` option; falls back
                to ``self.group_gap`` when not given. Must be strictly
                between 0 and 1.
            mclachlan: if true, pass ``-m self.mclachlan_path`` to groupsim
                and use a maximum score of 5 (instead of 1) when parsing.

        Returns:
            The `GroupsimResult` built from groupsim's standard output.

        Raises:
            err.InvalidInputError: a sequence has no cluster id.
            AssertionError: a gap value is outside the open interval (0, 1).
            CalledProcessError: groupsim exited with a non-zero status.
            FileNotFoundError: the groupsim process could not be run.
        """
        # local import: only needed to clean up the temporary file
        import os

        # mclachan max score is 5: normalise to 0-1 before storing
        maxscore = 5 if mclachlan else 1

        if not column_gap:
            column_gap = self.column_gap

        if not group_gap:
            group_gap = self.group_gap

        column_gap = float(column_gap)
        group_gap = float(group_gap)

        assert 0 < column_gap < 1, "column_gap must be between 0 and 1"
        assert 0 < group_gap < 1, "group_gap must be between 0 and 1"

        # write out the alignment with funfam numbers appended to sequence ids
        # >1ebgB02/127-436|7431
        aln_copy = alignment.copy()
        for seq in aln_copy.seqs:
            if not seq.cluster_id:
                raise err.InvalidInputError((
                    "need to set_cluster_id() on alignment sequences before running groupsim: {}"
                ).format(seq.__dict__))
            seq.set_uid('{}|{}'.format(seq.uid, str(seq.cluster_id)))

        source_ids = {s.cluster_id for s in aln_copy.seqs}

        # lower-case aa -> gaps
        # '.' -> '-'
        for s in aln_copy.seqs:
            s.set_all_gap_chars(gap_char='-')
            s.set_lower_case_to_gap(gap_char='-')

        # Only the file *name* is needed (write_fasta opens it itself), so
        # release our own handle straight away instead of leaking it.
        with tempfile.NamedTemporaryFile(mode='w+',
                                         delete=False,
                                         suffix=".fa") as fasta_tmp:
            fasta_tmp_filename = fasta_tmp.name

        try:
            aln_copy.write_fasta(fasta_tmp_filename)

            # use the resolved (possibly overridden) gap values, not the
            # instance defaults
            groupsim_args = [
                self.python2path, self.groupsim_path, '-c', column_gap, '-g',
                group_gap
            ]

            if mclachlan:
                groupsim_args.extend(['-m', self.mclachlan_path])

            groupsim_args.append(fasta_tmp_filename)
            groupsim_args.extend(source_ids)

            groupsim_args = [str(a) for a in groupsim_args]

            LOG.debug("running groupsim: sys: %s", " ".join(groupsim_args))

            try:
                p = Popen(groupsim_args,
                          stdout=PIPE,
                          stderr=PIPE,
                          universal_newlines=True)
                groupsim_out, groupsim_err = p.communicate()
            except Exception as exc:
                # not a bare `except:` so KeyboardInterrupt/SystemExit
                # propagate; the original cause is kept via chaining
                raise FileNotFoundError(
                    "Encountered unexpected error running GroupSim: `{}`".
                    format(" ".join(groupsim_args))) from exc

            # Popen/communicate never raise CalledProcessError themselves,
            # so a failed run has to be detected from the exit status.
            if p.returncode != 0:
                LOG.error(
                    'CMD: %s\nCODE: %s\nOUTPUT: %s\nSTDERR: "%s"\nSTDOUT: "%s"\n',
                    groupsim_args, p.returncode, groupsim_out, groupsim_err,
                    groupsim_out)
                raise CalledProcessError(p.returncode,
                                         groupsim_args,
                                         output=groupsim_out,
                                         stderr=groupsim_err)
        finally:
            # best-effort cleanup: never mask the real outcome
            try:
                os.remove(fasta_tmp_filename)
            except OSError:
                LOG.debug("failed to remove temporary file %s",
                          fasta_tmp_filename)

        gs_io = io.StringIO(groupsim_out)

        return GroupsimResult.from_io(gs_io, maxscore=maxscore)