def remove_clan_overlaps(pfam_table):
    """
    Resolve overlapping Pfam hits that belong to the same clan on
    the same sequence, keeping the higher-scoring hit of each
    overlapping pair (intended as the counterpart of PfamScan.pl).

    Overlaps are currently resolved by domain bitscore only.

    .. todo::

        is bitscore the most sensible choice if different length hits?

    .. note::

        Every pair of hits in a group is compared, including hits that
        already lost against another hit. A hit can therefore be
        discarded because it overlaps a hit that is itself discarded.

    Parameters
    ----------
    pfam_table : pd.DataFrame
        Pfam hit table as generated by pfam_hits() function
        (must contain Pfam clan annotation). Columns read here:
        query_name, clan_id, domain_score, ali_from, ali_to.

    Returns
    -------
    pd.DataFrame
        Rows of pfam_table that did not lose any overlap comparison
    """
    # Column used to decide which hit of an overlapping pair wins.
    # If this ever becomes a parameter (e.g. to use E-values), the
    # sort direction and the comparison below have to be flipped.
    score = "domain_score"

    # one group per (sequence, clan) combination
    ordered = pfam_table.sort_values(by=score, ascending=False)
    per_clan = ordered.groupby(
        by=["query_name", "clan_id"], as_index=False, sort=False
    )

    # index labels of all hits that lost a comparison
    discard = []

    for (_, clan), members in per_clan:
        # Guard against pandas ever yielding groups for missing clan
        # values: such hits are not in a common clan and must not be
        # compared with each other.
        assert clan.startswith("CL")

        rows = list(members.iterrows())

        for idx_a, hit_a in rows:
            for idx_b, hit_b in rows:
                # look at each unordered pair exactly once
                if not idx_a < idx_b:
                    continue

                # alignment ends are inclusive, range_overlap expects
                # exclusive ends, hence the + 1
                shared = range_overlap(
                    (int(hit_a["ali_from"]), int(hit_a["ali_to"]) + 1),
                    (int(hit_b["ali_from"]), int(hit_b["ali_to"]) + 1),
                )
                if not shared > 0:
                    continue

                # ties are decided in favour of the hit with the
                # smaller index label
                if float(hit_a[score]) >= float(hit_b[score]):
                    loser = idx_b
                else:
                    loser = idx_a

                discard.append(loser)

    keep = ~pfam_table.index.isin(discard)
    return pfam_table.loc[keep]
def by_alignment(self, min_overlap=20, reduce_chains=False, **kwargs):
    """
    Find structures by sequence alignment between
    query sequence and sequences in PDB.

    Parameters
    ----------
    min_overlap : int, optional (default: 20)
        Require at least this many aligned positions
        with the target structure
    reduce_chains : bool, optional (default: False)
        If True, keep only the best-scoring chain per PDB ID
        (i.e. remove redundant occurrences of same protein
        in PDB structures). Should be set to False to
        identify homomultimeric contacts.
    **kwargs
        Defines the behaviour of find_homologs() function
        used to find homologs by sequence alignment:

        - which alignment method is used
          (pdb_alignment_method: {"jackhmmer", "hmmsearch"},
          default: "jackhmmer"),
        - parameters passed into the protocol for the selected
          alignment method (evcouplings.align.jackhmmer_search or
          evcouplings.align.hmmbuild_and_search).

          Default parameters are set in the HMMER_CONFIG string in
          this module, other parameters will need to be overridden;
          these minimally are:

          - for pdb_alignment_method == "jackhmmer":

            - sequence_id : str, identifier of target sequence
            - jackhmmer : str, path to jackhmmer binary if not
              on path

          - for pdb_alignment_method == "hmmsearch":

            - sequence_id : str, identifier of target sequence
            - raw_focus_alignment_file : str, path to input
              alignment file
            - hmmbuild : str, path to hmmbuild binary if not
              on path
            - hmmsearch : str, path to search binary if not
              on path

        - additionally, if "prefix" is given, individual mappings
          will be saved to files suffixed by the respective key in
          mapping table.

    Returns
    -------
    SIFTSResult
        Record of hits and mappings found for this
        query sequence by alignment. See by_pdb_id()
        for detailed explanation of fields.

    Raises
    ------
    ValueError
        If self.sequence_file is None
    """
    if self.sequence_file is None:
        raise ValueError(
            "Need to have SIFTS sequence file. "
            "Create using create_sequence_file() "
            "method or constructor."
        )

    def _create_mapping(r):
        """
        Build the seqres -> query index mapping for one hit segment
        (one row of the hit table); returns the mapping as a dict
        together with the underlying joined table.
        """
        _, query_start, query_end = parse_header(ali.ids[0])

        # create mapping from query into PDB Uniprot sequence
        # A_i will be query sequence indices, A_j Uniprot sequence indices
        m = map_indices(
            ali[0], query_start, query_end,
            ali[r["alignment_id"]], r["alignment_start"], r["alignment_end"]
        )

        # create mapping from PDB Uniprot into seqres numbering
        # j will be Uniprot sequence index, k seqres index;
        # indices are converted to strings since the other mapping
        # has its indices as strings
        n = pd.DataFrame({
            "j": list(range(r["uniprot_start"], r["uniprot_end"] + 1)),
            "k": list(range(r["resseq_start"], r["resseq_end"] + 1)),
        }).astype(str)

        # join over Uniprot indices (i.e. j);
        # get rid of any position that is not aligned
        mn = m.merge(n, on="j", how="inner").dropna()

        # extract final mapping from seqres (k) to query (i)
        map_ = dict(zip(mn["k"], mn["i"]))

        return map_, mn

    ali, hits = find_homologs(
        sequence_database=self.sequence_file, **kwargs
    )

    # merge with internal table to identify overlap of
    # aligned regions and regions with structural coverage
    hits = hits.merge(self.table, on="uniprot_ac", suffixes=("", "_"))

    # add 1 to end of range since overlap function treats
    # ends as exclusive, while ends here are inclusive
    hits["overlap"] = [
        range_overlap((uni_start, uni_end + 1), (ali_start, ali_end + 1))
        for uni_start, uni_end, ali_start, ali_end in zip(
            hits["uniprot_start"], hits["uniprot_end"],
            hits["alignment_start"], hits["alignment_end"],
        )
    ]

    # index mappings are only written to disk if a filename
    # prefix is given
    prefix = kwargs.get("prefix", None)

    # collect complete index mappings in here...
    mappings = {}

    # ... as well as dataframe rows for assignment of hit to mapping
    mapping_rows = []

    # complication: if there are multiple segments per hit and chain,
    # we should reduce these into a single mapping (even though split
    # mappings are possible in principle) so we can count unique
    # number of hits etc.
    hit_columns = ["alignment_id", "pdb_id", "pdb_chain"]

    for i, (hit, grp) in enumerate(hits.groupby(hit_columns)):
        agg_mapping = {}
        segment_dfs = []

        # go through each segment
        for _, r in grp.iterrows():
            # compute mapping for that particular segment
            map_j, map_j_df = _create_mapping(r)

            # add to overall mapping dictionary for this hit
            agg_mapping.update(map_j)
            segment_dfs.append(map_j_df)

        # store assignment of group to mapping index
        mapping_rows.append(list(hit) + [i, len(grp) > 1])
        mappings[i] = agg_mapping

        if prefix is not None:
            # DataFrame.append no longer exists in pandas >= 2.0,
            # so segments are combined with pd.concat instead
            agg_df = pd.concat(segment_dfs).rename(
                columns={
                    "j": "uniprot_of_pdb_index",
                    "A_j": "uniprot_of_pdb_residue",
                    "k": "pdb_seqres_index",
                }
            )

            agg_df.to_csv(
                "{}_mapping{}.csv".format(prefix, i), index=False
            )

    # create dataframe from mapping rows
    mapping_df = pd.DataFrame(
        mapping_rows,
        columns=hit_columns + ["mapping_index", "grouped_segments"]
    )

    # now group again, to aggregate full hit dataframe
    def _agg_type(x):
        """Pick the aggregation function for a hit table column."""
        if x == "overlap":
            return "sum"
        elif x.endswith("_start"):
            return "min"
        elif x.endswith("end"):
            return "max"
        else:
            return "first"

    agg_types = OrderedDict(
        [(c, _agg_type(c)) for c in hits.columns if c not in hit_columns]
    )

    # only aggregate if we have anything to aggregate,
    # otherwise pandas drops the index columns
    # alignment_id, pdb_id, pdb_chain and things go
    # wrong horribly in the following join
    if len(hits) > 0:
        hits_grouped = hits.groupby(hit_columns).agg(
            agg_types
        ).reset_index()
    else:
        hits_grouped = hits

    # join with mapping information
    hits_grouped = hits_grouped.merge(mapping_df, on=hit_columns)

    # remove hits with too little residue coverage
    hits_grouped = hits_grouped.query("overlap >= @min_overlap")

    # assign() returns a new frame with a properly typed column,
    # rather than writing into the result of query() through .loc
    hits_grouped = hits_grouped.assign(
        bitscore=pd.to_numeric(hits_grouped["bitscore"], errors="coerce")
    )
    hits_grouped = hits_grouped.sort_values(by="bitscore", ascending=False)

    # if requested, only keep one chain per PDB;
    # table is sorted by score at this point, so the first
    # row per PDB ID is the best hit
    if reduce_chains:
        # drop_duplicates keeps whole rows; groupby().first() would
        # instead take the first non-null value of every column and
        # could combine values from different chains into one row
        best = hits_grouped.drop_duplicates(subset="pdb_id", keep="first")

        # keep pdb_id as leading column and a fresh index, as before
        hits_grouped = best[
            ["pdb_id"] + [c for c in best.columns if c != "pdb_id"]
        ].reset_index(drop=True)

    # remove any zombie mappings we did not keep in table
    kept_indices = set(hits_grouped["mapping_index"])
    mappings = {
        idx: map_ for idx, map_ in mappings.items()
        if idx in kept_indices
    }

    return SIFTSResult(hits_grouped, mappings)