def from_file(cls, filename, file_format="pdb"): """ Initialize structure from PDB/mmCIF file Parameters ---------- filename : str Path of file file_format : {"pdb", "cif"}, optional (default: "pdb") Format of structure (old PDB format or mmCIF) Returns ------- ClassicPDB Initialized PDB structure """ try: if file_format == "pdb": from Bio.PDB import PDBParser parser = PDBParser(QUIET=True) elif file_format == "cif": from Bio.PDB import FastMMCIFParser parser = FastMMCIFParser(QUIET=True) else: raise InvalidParameterError( "Invalid file_format, valid options are: pdb, cif" ) structure = parser.get_structure("", filename) return cls(structure) except FileNotFoundError as e: raise ResourceError( "Could not find file {}".format(filename) ) from e
def from_id(cls, pdb_id):
    """
    Initialize structure by PDB ID (fetches
    structure from RCSB servers)

    Parameters
    ----------
    pdb_id : str
        PDB identifier (e.g. 1hzx)

    Returns
    -------
    PDB
        initialized PDB structure
    """
    from urllib.error import URLError
    from Bio.PDB import PDBList

    pdblist = PDBList()

    try:
        # download PDB file to temporary directory
        pdb_file = pdblist.retrieve_pdb_file(pdb_id, pdir=tempdir())
        return cls.from_file(pdb_file, file_format="pdb")
    except URLError as e:
        raise ResourceError(
            "Could not fetch PDB data for {}".format(pdb_id)
        ) from e
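# Illustrative sketch (assumed import paths, not from the original source):
# fetching a structure by PDB identifier and handling a failed download.
def _example_fetch_structure_by_id(pdb_id="1hzx"):
    # hypothetical import paths for the class defining from_id() and the error type
    from evcouplings.compare.pdb import ClassicPDB
    from evcouplings.utils import ResourceError

    try:
        return ClassicPDB.from_id(pdb_id)
    except ResourceError:
        # e.g. no network connection or RCSB servers unreachable
        return None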
def _insert_file(self, filename, parent_id):
    """
    Insert file from filesystem into database

    Parameters
    ----------
    filename : str
        Path to file that is to be inserted
    parent_id : bson.ObjectId
        MongoDB identifier of job document this file is linked to

    Returns
    -------
    dict
        Dictionary with keys "filename" (original file path) and
        "fs_id" (ObjectId of inserted file in GridFS)
    """
    def _insert():
        with open(filename, "rb") as f:
            return self.fs.put(
                f,
                parent_id=parent_id,
                job_id=self.job_id,
                filename=filename,
                time_saved=datetime.utcnow()
            )

    try:
        id_ = self._retry_query(_insert)
    except OSError as e:
        raise ResourceError(
            "Could not read {} for storing in MongoDB backend".format(
                filename
            )
        ) from e

    return {"filename": filename, "fs_id": id_}
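# Illustrative sketch of how the private helper above might be used from within
# the same storage class (the surrounding backend object and job document are
# assumptions for illustration only).
def _example_store_result_files(backend, job_document_id, result_files):
    # backend: instance of the MongoDB result-storage class defining _insert_file()
    # job_document_id: bson.ObjectId of the job document the files belong to
    # result_files: list of file paths produced by a pipeline run
    stored = [
        backend._insert_file(path, parent_id=job_document_id)
        for path in result_files
    ]
    # each entry has the form {"filename": <path>, "fs_id": <GridFS ObjectId>}
    return stored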
def fetch_sequence(sequence_id, sequence_file,
                   sequence_download_url, out_file):
    """
    Fetch sequence either from database based on identifier, or from
    input sequence file.

    Parameters
    ----------
    sequence_id : str
        Identifier of sequence that should be retrieved
    sequence_file : str
        File containing sequence. If None, sequence will be downloaded
        from sequence_download_url
    sequence_download_url : str
        URL from which to download missing sequence. Must contain "{}"
        at the position where sequence ID will be inserted into
        download URL (using str.format).
    out_file : str
        Output file in which sequence will be stored, if sequence_file
        does not exist.

    Returns
    -------
    str
        Path of file with stored sequence (can be sequence_file or out_file)
    tuple (str, str)
        Identifier of sequence as stored in file, and sequence
    """
    if sequence_file is None:
        get(
            sequence_download_url.format(sequence_id),
            out_file,
            allow_redirects=True
        )
    else:
        # if we have sequence file, try to copy it
        try:
            copy(sequence_file, out_file)
        except FileNotFoundError:
            raise ResourceError(
                "sequence_file does not exist: {}".format(sequence_file)
            )

    # also make sure input file has something in it
    verify_resources("Input sequence missing", out_file)

    with open(out_file) as f:
        seq = next(read_fasta(f))

    return out_file, seq
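# Illustrative call of fetch_sequence() (sketch only; the URL template and file
# names are examples of the required "{}" placeholder format, not necessarily the
# values used by the pipeline).
def _example_fetch_sequence():
    # no local sequence file, so the sequence is downloaded into out_file
    out_file, (seq_id, seq) = fetch_sequence(
        sequence_id="P0A910",
        sequence_file=None,
        sequence_download_url="https://rest.uniprot.org/uniprotkb/{}.fasta",
        out_file="P0A910.fasta"
    )
    return out_file, seq_id, seq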
def from_file(cls, filename):
    """
    Initialize structure from MMTF file

    Parameters
    ----------
    filename : str
        Path of MMTF file

    Returns
    -------
    PDB
        initialized PDB structure
    """
    try:
        return cls(parse(filename))
    except FileNotFoundError as e:
        raise ResourceError(
            "Could not find file {}".format(filename)
        ) from e
def from_id(cls, pdb_id):
    """
    Initialize structure by PDB ID (fetches
    structure from RCSB servers)

    Parameters
    ----------
    pdb_id : str
        PDB identifier (e.g. 1hzx)

    Returns
    -------
    PDB
        initialized PDB structure
    """
    try:
        return cls(fetch(pdb_id))
    except HTTPError as e:
        raise ResourceError(
            "Could not fetch MMTF data for {}".format(pdb_id)
        ) from e
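# Illustrative sketch covering both MMTF-based constructors above (the import
# path of the MMTF-backed PDB class is an assumption).
def _example_mmtf_structures():
    from evcouplings.compare.pdb import PDB

    # parse a previously downloaded MMTF file...
    local_structure = PDB.from_file("1hzx.mmtf")

    # ...or fetch the same entry directly from RCSB
    remote_structure = PDB.from_id("1hzx")

    return local_structure, remote_structure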
def fetch_uniprot_mapping(ids, from_="ACC", to="ACC", format="fasta"):
    """
    Fetch data from UniProt ID mapping service
    (e.g. download set of sequences)

    Parameters
    ----------
    ids : list(str)
        List of UniProt identifiers for which to retrieve mapping
    from_ : str, optional (default: "ACC")
        Source identifier (i.e. contained in "ids" list)
    to : str, optional (default: "ACC")
        Target identifier (to which source should be mapped)
    format : str, optional (default: "fasta")
        Output format to request from UniProt server

    Returns
    -------
    str
        Response from UniProt server
    """
    params = {
        "from": from_,
        "to": to,
        "format": format,
        "query": " ".join(ids)
    }
    url = UNIPROT_MAPPING_URL
    r = requests.post(url, data=params)

    if r.status_code != requests.codes.ok:
        raise ResourceError(
            "Invalid status code ({}) for URL: {}".format(
                r.status_code, url
            )
        )

    return r.text
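# Illustrative call of fetch_uniprot_mapping() (sketch only): download FASTA
# sequences for a handful of UniProt accessions using the defaults above; the
# accessions and output path are example values.
def _example_download_uniprot_sequences():
    accessions = ["P0A910", "P69905", "P68871"]
    fasta_text = fetch_uniprot_mapping(accessions)  # from_="ACC", to="ACC", format="fasta"

    # the return value is the raw server payload; write it out for downstream tools
    with open("uniprot_sequences.fasta", "w") as f:
        f.write(fasta_text)

    return fasta_text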
def run_plmc(alignment, couplings_file, param_file=None,
             focus_seq=None, alphabet=None, theta=None,
             scale=None, ignore_gaps=False, iterations=None,
             lambda_h=None, lambda_J=None, lambda_g=None,
             cpu=None, binary="plmc"):
    """
    Run plmc on sequence alignment and store
    files with model parameters and pair couplings.

    Parameters
    ----------
    alignment : str
        Path to input sequence alignment
    couplings_file : str
        Output path for file with evolutionary couplings
        (folder will be created)
    param_file : str, optional (default: None)
        Output path for binary file containing model
        parameters (folder will be created)
    focus_seq : str, optional (default: None)
        Name of focus sequence, if None, non-focus mode
        will be used
    alphabet : str, optional (default: None)
        Alphabet for model inference. If None, standard
        amino acid alphabet including gap will be used.
        First character in string corresponds to gap
        character (relevant for ignore_gaps).
    theta : float, optional (default: None)
        Sequences with pairwise identity >= theta
        will be clustered and their sequence weights
        downweighted as 1 / num_cluster_members.
        Important: Note that plmc will be parametrized using
        1 - theta. If None, default value in plmc will be used,
        which corresponds to theta=0.8 (plmc setting 0.2).
    scale : float, optional (default: None)
        Scale weights of clusters by this value.
        If None, default value in plmc (1.0) will be used
    ignore_gaps : bool, optional (default: False)
        Exclude gaps from parameter inference. Gap
        character is first character of alphabet
        parameter.
    iterations : int, optional (default: None)
        Maximum iterations for optimization.
    lambda_h : float, optional (default: None)
        L2 regularization strength on fields.
        If None, plmc default will be used.
    lambda_J : float, optional (default: None)
        L2 regularization strength on couplings.
        If None, plmc default will be used.
    lambda_g : float, optional (default: None)
        Group L1 regularization strength on couplings.
        If None, plmc default will be used.
    cpu : int, optional (default: None)
        Number of cores to use for running plmc.
        Note that plmc has to be compiled in openmp
        mode to be runnable with multiple cores.
        Can also be set to "max".
    binary : str, optional (default: "plmc")
        Path to plmc binary

    Returns
    -------
    PlmcResult
        namedtuple containing output files and
        parsed fields from console output of plmc

    Raises
    ------
    ExternalToolError
    """
    create_prefix_folders(couplings_file)

    # Make sure input alignment exists
    verify_resources(
        "Alignment file does not exist", alignment
    )

    cmd = [
        binary,
        "-c", couplings_file,
    ]

    # store eij file if explicitly requested
    if param_file is not None:
        create_prefix_folders(param_file)
        cmd += ["-o", param_file]

    # focus sequence mode and ID
    if focus_seq is not None:
        # TODO: for now split exclude sequence
        # region from focus seq name, otherwise
        # plmc does not remap names. If this
        # behaviour changes in plmc, remove the
        # following line.
        focus_seq = focus_seq.split("/")[0]
        cmd += ["-f", focus_seq]

    # exclude gaps from calculation?
    if ignore_gaps:
        cmd += ["-g"]

    # maximum number of iterations
    if iterations is not None:
        cmd += ["-m", str(iterations)]

    # set custom alphabet
    # (first character is gap by default in nogap mode)
    if alphabet is not None:
        cmd += ["-a", alphabet]

    # sequence reweighting
    if theta is not None:
        # transform into plmc convention (1 - theta)
        theta = 1.0 - theta
        cmd += ["-t", str(theta)]

    # cluster weight
    if scale is not None:
        cmd += ["-s", str(scale)]

    # L2 regularization weight for fields
    if lambda_h is not None:
        cmd += ["-lh", str(lambda_h)]

    # L2 regularization weight for pair couplings
    if lambda_J is not None:
        cmd += ["-le", str(lambda_J)]

    # Group L1 regularization weight for pair couplings
    if lambda_g is not None:
        cmd += ["-lg", str(lambda_g)]

    # Number of cores to use for calculation
    if cpu is not None:
        cmd += ["-n", str(cpu)]

    # finally also add input alignment (main parameter)
    cmd += [alignment]

    # TODO: for now do not check returncode because sometimes
    # returncode == -11 (segfault) despite successful calculation
    return_code, stdout, stderr = run(cmd, check_returncode=False)

    # TODO: remove this segfault-hunting output once fixed
    if return_code != 0:
        # if not a segfault, still raise exception
        if return_code != -11:
            from evcouplings.utils.system import ExternalToolError
            raise ExternalToolError(
                "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                    cmd, return_code, stdout, stderr
                )
            )

        print("PLMC NON-ZERO RETURNCODE:", return_code)
        print(cmd)
        print(" ".join(cmd))
        print("stdout:", stdout)
        print("stderr:", stderr)

    iter_df, out_fields = parse_plmc_log(stderr)

    # also check we actually calculated couplings...
    if not valid_file(couplings_file):
        raise ResourceError(
            "plmc returned no couplings: stdout={} stderr={} file={}".format(
                stdout, stderr, couplings_file
            )
        )

    # ... and parameter file, if requested
    if param_file and not valid_file(param_file):
        raise ResourceError(
            "plmc returned no parameter file: stdout={} stderr={} file={}".format(
                stdout, stderr, param_file
            )
        )

    return PlmcResult(
        couplings_file, param_file,
        iter_df, *out_fields
    )
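# Illustrative invocation of run_plmc() (sketch only; file paths and parameter
# values are example choices, not recommendations from the original source).
def _example_run_plmc():
    result = run_plmc(
        alignment="output/align/RASH_HUMAN.a2m",
        couplings_file="output/couplings/RASH_HUMAN.txt",
        param_file="output/couplings/RASH_HUMAN.params",
        focus_seq="RASH_HUMAN/1-166",   # region suffix is stripped internally
        theta=0.8,                      # passed to plmc as -t 0.2
        lambda_J=16.2,
        cpu=2,
        binary="plmc"
    )
    # result is a PlmcResult namedtuple with the output file paths and the
    # statistics parsed from the plmc console output
    return result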
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file)
        )

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # if alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123): {}".format(region)
            )

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]

        # check if we have a predefined sequence database;
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and kwargs.get("evalues", None) is not None:
        raise InvalidParameterError(
            "Can not specify bitscore and E-value threshold at the same time."
        )

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [
                (float(t) if "." in t else int(t)) for t in T
            ]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds)
            )

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
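# Illustrative call of substitute_config() (sketch only; the keyword names mirror
# the CONFIG_MAP keys above, the base config file path is a placeholder, and a
# "config" entry is always required).
def _example_substitute_config():
    updated = substitute_config(
        config="config.txt",       # base configuration file to update
        prefix="output/RASH",      # -> config["global"]["prefix"]
        protein="RASH_HUMAN",      # -> config["global"]["sequence_id"]
        region="1-166",            # parsed into [1, 166]
        bitscores="0.3,0.5,0.7"    # multiple thresholds -> batch of jobs
    )
    return updated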
def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100):
    """
    Create FASTA sequence file containing all UniProt sequences
    of proteins in SIFTS. This file is required for
    homology-based structure identification and index remapping.
    This function will also automatically associate the sequence
    file with the SIFTS object.

    Parameters
    ----------
    output_file : str
        Path at which to store sequence file
    chunk_size : int, optional (default: 1000)
        Retrieve sequences from UniProt in chunks of this size
        (too large chunks cause the mapping service to stall)
    max_retries : int, optional (default: 100)
        Allow this many retries when fetching sequences from
        UniProt ID mapping service, which unfortunately often
        suffers from connection failures.
    """
    ids = self.table.uniprot_ac.unique().tolist()

    # retrieve sequences in chunks since ID mapping service
    # tends to fail on large requests
    id_chunks = [
        ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)
    ]

    # store individual retrieved chunks as list of strings
    seq_chunks = []

    # keep track of how many retries were necessary and
    # abort if number exceeds max_retries
    num_retries = 0

    for ch in id_chunks:
        # fetch sequence chunk;
        # if there is a problem retry as long as we stay within
        # maximum number of retries
        while True:
            try:
                seqs = fetch_uniprot_mapping(ch)
                break
            except requests.ConnectionError as e:
                # count as failed try
                num_retries += 1

                # if we retried too often, abort
                if num_retries > max_retries:
                    raise ResourceError(
                        "Could not fetch sequences for SIFTS mapping tables from UniProt since "
                        "maximum number of retries after connection errors was exceeded. Retry "
                        "at a later time, or call SIFTS.create_sequence_file() with a higher value "
                        "for max_retries."
                    ) from e

        # rename identifiers in sequence file, so
        # we can circumvent UniProt sequence identifiers
        # being prefixed by hmmer if a hit has exactly the
        # same identifier as the query sequence
        seqs = seqs.replace(
            ">sp|", ">evsp|"
        ).replace(
            ">tr|", ">evtr|"
        )

        assert seqs.endswith("\n")

        # store for writing
        seq_chunks.append(seqs)

    # store sequences to FASTA file in one go at the end
    with open(output_file, "w") as f:
        f.write("".join(seq_chunks))

    self.sequence_file = output_file

    # add UniProt ID column to SIFTS table
    self._add_uniprot_ids()
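# Illustrative sketch of creating the SIFTS sequence file (the import path, file
# paths, and constructor arguments are assumptions; consult the actual SIFTS
# class for its exact __init__ signature).
def _example_create_sifts_sequence_file():
    from evcouplings.compare.sifts import SIFTS

    # hypothetical path to a previously downloaded SIFTS mapping table
    s = SIFTS("data/pdb_chain_uniprot_plus.csv")
    s.create_sequence_file("data/sifts_uniprot_sequences.fasta")

    # the object now carries the sequence file for later homology searches
    return s.sequence_file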