# Example #1
def remove_clan_overlaps(pfam_table):
    """
    Remove overlapping Pfam hits from the same Pfam clan.

    Hits are grouped by query sequence and clan. Whenever two hits of
    one group overlap in their aligned region (``ali_from`` to
    ``ali_to``, both inclusive), the hit with the lower domain bitscore
    is discarded. Currently overlaps can only be resolved by domain
    bitscore (presumably mirroring the clan filtering done by Pfam's
    own scan tooling -- confirm).

    .. todo::

        is bitscore the most sensible choice if different length hits?

    Parameters
    ----------
    pfam_table : pd.DataFrame
        Pfam hit table as generated by pfam_hits() function
        (must contain Pfam clan annotation).

    Returns
    -------
    pd.DataFrame
        Pfam hit table with lower-scoring overlaps removed
    """
    # could make this a parameter, if switching to E-values
    # we would have to change the sorting order of the DataFrame
    # and the sign of the comparison further below.
    score = "domain_score"

    # group by sequence ID and clan to resolve overlaps
    grouped = pfam_table.sort_values(
        by=score, ascending=False
    ).groupby(
        by=["query_name", "clan_id"], as_index=False, sort=False
    )

    # store index value of all entries to discard
    remove_hits = []

    for (_query_name, clan_name), grp in grouped:
        # safety check here that we are not grouping hits that are
        # not in the same clan (missing value) if pandas ever changed
        # the behaviour of groupby to not iterate through groups
        # with missing values. Otherwise, we would have to skip the group.
        assert clan_name.startswith("CL")

        # a single hit cannot overlap with anything
        if len(grp) < 2:
            continue

        # materialise each hit once (index label, half-open aligned
        # range, score) instead of re-running iterrows() per outer row
        rows = [
            (
                idx,
                (int(hit["ali_from"]), int(hit["ali_to"]) + 1),
                float(hit[score]),
            )
            for idx, hit in grp.iterrows()
        ]

        # go through all pairwise combinations of hits; the label
        # comparison makes sure each unordered pair is visited once
        for idx1, range1, score1 in rows:
            for idx2, range2, score2 in rows:
                if idx1 < idx2 and range_overlap(range1, range2) > 0:
                    # discard the lower-scoring hit of the pair; on a
                    # tie the hit with the larger index label goes
                    if score1 >= score2:
                        remove_hits.append(idx2)
                    else:
                        remove_hits.append(idx1)

    return pfam_table.loc[~pfam_table.index.isin(remove_hits)]
# Example #2
    def by_alignment(self, min_overlap=20, reduce_chains=False, **kwargs):
        """
        Find structures by sequence alignment between
        query sequence and sequences in PDB.

        Parameters
        ----------
        min_overlap : int, optional (default: 20)
            Require at least this many aligned positions
            with the target structure
        reduce_chains : bool, optional (default: False)
            If true, keep only first chain per PDB ID
            (i.e. remove redundant occurrences of same
            protein in PDB structures). Should be set to
            False to identify homomultimeric contacts.
        **kwargs
            Defines the behaviour of find_homologs() function
            used to find homologs by sequence alignment:

            - which alignment method is used
              (pdb_alignment_method: {"jackhmmer", "hmmsearch"},
              default: "jackhmmer"),
            - parameters passed into the protocol for the selected
              alignment method (evcouplings.align.jackhmmer_search or
              the corresponding hmmsearch-based protocol).
              Default parameters are set in the HMMER_CONFIG string in this
              module, other parameters will need to be overridden; these
              minimally are:

              - for pdb_alignment_method == "jackhmmer":

                - sequence_id : str, identifier of target sequence
                - jackhmmer : str, path to jackhmmer binary if not on path

              - for pdb_alignment_method == "hmmsearch":

                - sequence_id : str, identifier of target sequence
                - raw_focus_alignment_file : str, path to input alignment file
                - hmmbuild : str, path to hmmbuild binary if not on path
                - hmmsearch : str, path to search binary if not on path

            - additionally, if "prefix" is given,
              individual mappings will be saved to files suffixed
              by the respective key in mapping table.

        Returns
        -------
        SIFTSResult
            Record of hits and mappings found for this
            query sequence by alignment. See by_pdb_id()
            for detailed explanation of fields.

        Raises
        ------
        ValueError
            If no SIFTS sequence file is available
            (self.sequence_file is None)
        """
        def _create_mapping(r):
            # map one hit segment (row r) from PDB seqres numbering
            # to query numbering; returns (dict, full mapping table)
            _, query_start, query_end = parse_header(ali.ids[0])

            # create mapping from query into PDB Uniprot sequence
            # A_i will be query sequence indices, A_j Uniprot sequence indices
            m = map_indices(
                ali[0], query_start, query_end,
                ali[r["alignment_id"]],
                r["alignment_start"], r["alignment_end"]
            )

            # create mapping from PDB Uniprot into seqres numbering
            # j will be Uniprot sequence index, k seqres index;
            # need to convert to strings since other mapping has
            # indices as strings
            n = pd.DataFrame({
                "j": list(range(r["uniprot_start"], r["uniprot_end"] + 1)),
                "k": list(range(r["resseq_start"], r["resseq_end"] + 1)),
            }).astype(str)

            # join over Uniprot indices (i.e. j);
            # get rid of any position that is not aligned
            mn = m.merge(n, on="j", how="inner").dropna()

            # extract final mapping from seqres (k) to query (i)
            map_ = dict(zip(mn.k, mn.i))

            return map_, mn

        if self.sequence_file is None:
            raise ValueError("Need to have SIFTS sequence file. "
                             "Create using create_sequence_file() "
                             "method or constructor.")

        # NOTE(review): forwarding of **kwargs restored from the
        # docstring ("Defines the behaviour of find_homologs()") --
        # confirm against the original call
        ali, hits = find_homologs(
            sequence_database=self.sequence_file, **kwargs
        )

        # merge with internal table to identify overlap of
        # aligned regions and regions with structural coverage
        hits = hits.merge(self.table, on="uniprot_ac", suffixes=("", "_"))

        # add 1 to end of range since overlap function treats
        # ends as exclusive, while ends here are inclusive
        hits["overlap"] = [
            range_overlap(
                (r["uniprot_start"], r["uniprot_end"] + 1),
                (r["alignment_start"], r["alignment_end"] + 1)
            )
            for _, r in hits.iterrows()
        ]

        # collect complete index mappings in here...
        mappings = {}
        # ... as well as dataframe rows for assignment of hit to mapping
        mapping_rows = []

        # index mappings are only written to disk if a prefix is given
        prefix = kwargs.get("prefix", None)

        # complication: if there are multiple segments per hit and chain, we should
        # reduce these into a single mapping (even though split mappings
        # are possible in principle) so we can count unique number of hits etc.
        hit_columns = ["alignment_id", "pdb_id", "pdb_chain"]
        for i, (hit, grp) in enumerate(hits.groupby(hit_columns)):
            agg_mapping = {}
            segment_dfs = []
            # go through each segment
            for _, r in grp.iterrows():
                # compute mapping for that particular segment
                map_j, map_j_df = _create_mapping(r)

                # add to overall mapping dictionary for this hit
                agg_mapping.update(map_j)
                segment_dfs.append(map_j_df)

            # store assignment of group to mapping index
            mapping_rows.append(list(hit) + [i, len(grp) > 1])

            mappings[i] = agg_mapping

            # store index mappings if filename prefix is given
            if prefix is not None:
                # DataFrame.append() no longer exists in pandas >= 2.0,
                # so segment tables are concatenated in one go
                agg_df = pd.concat(segment_dfs).rename(
                    columns={
                        "j": "uniprot_of_pdb_index",
                        "A_j": "uniprot_of_pdb_residue",
                        "k": "pdb_seqres_index",
                    }
                )

                # NOTE(review): index=False is an assumption (row index
                # of the merged mapping carries no information) -- confirm
                agg_df.to_csv(
                    "{}_mapping{}.csv".format(prefix, i), index=False
                )

        # create dataframe from mapping rows
        # NOTE(review): "mapping_index" is what the code below reads;
        # the name of the multi-segment flag column is assumed -- confirm
        mapping_df = pd.DataFrame(
            mapping_rows,
            columns=hit_columns + ["mapping_index", "grouped_segments"]
        )

        # now group again, to aggregate full hit dataframe
        def _agg_type(x):
            # pick aggregation function based on column name
            if x == "overlap":
                return "sum"
            elif x.endswith("_start"):
                return "min"
            elif x.endswith("end"):
                return "max"
            else:
                return "first"

        agg_types = OrderedDict([(c, _agg_type(c)) for c in hits.columns
                                 if c not in hit_columns])

        # only aggregate if we have anything to aggregate,
        # otherwise pandas drops the index columns
        # alignment_id, pdb_id, pdb_chain and things go
        # wrong horribly in the following join
        if len(hits) > 0:
            hits_grouped = hits.groupby(hit_columns).agg(
                agg_types
            ).reset_index()
        else:
            hits_grouped = hits

        # join with mapping information
        hits_grouped = hits_grouped.merge(mapping_df, on=hit_columns)

        # remove hits with too little residue coverage
        hits_grouped = hits_grouped.query("overlap >= @min_overlap")

        # assign() returns a new frame, so the column can change dtype
        # without writing into the result of query()
        hits_grouped = hits_grouped.assign(
            bitscore=pd.to_numeric(hits_grouped["bitscore"], errors="coerce")
        )
        hits_grouped = hits_grouped.sort_values(by="bitscore", ascending=False)

        # if requested, only keep one chain per PDB;
        # sort by score before this to keep best hit
        if reduce_chains:
            hits_grouped = hits_grouped.groupby("pdb_id").first().reset_index()
            # sort again, just to be sure...
            hits_grouped = hits_grouped.sort_values(by="bitscore",
                                                    ascending=False)

        # remove any zombie mappings we did not keep in table
        kept_indices = set(hits_grouped.mapping_index.values)
        mappings = {
            idx: map_
            for idx, map_ in mappings.items()
            if idx in kept_indices
        }

        return SIFTSResult(hits_grouped, mappings)