def load_structures(pdb_ids, structure_dir=None, raise_missing=True):
    """
    Load PDB structures from files / web

    Parameters
    ----------
    pdb_ids : Iterable
        List / iterable containing PDB identifiers
        to be loaded.
    structure_dir : str, optional (default: None)
        Path to directory with structures. Structure
        filenames must be in the format 5p21.mmtf.
        If a file cannot be found, will try to fetch
        from web instead.
    raise_missing : bool, optional (default: True)
        Raise a ResourceError exception if any of the
        PDB IDs cannot be loaded. If False, missing
        entries will be ignored.

    Returns
    -------
    structures : dict(str -> PDB)
        Dictionary containing loaded structures.
        Keys (PDB identifiers) will be lower-case.

    Raises
    ------
    ResourceError
        Raised if raise_missing is True and any of the
        given PDB IDs cannot be loaded.
    """
    # collect loaded structures in dict(id -> PDB)
    structures = {}

    # load structure by structure
    for pdb_id in set(pdb_ids):
        pdb_id = pdb_id.lower()

        has_file = False
        if structure_dir is not None:
            structure_file = path.join(structure_dir, pdb_id + ".mmtf")
            has_file = valid_file(structure_file)

        try:
            # see if we can load locally from disk
            if has_file:
                structures[pdb_id] = PDB.from_file(structure_file)
            else:
                # otherwise fetch from web
                structures[pdb_id] = PDB.from_id(pdb_id)
        except (ResourceError, UnicodeDecodeError):
            # ResourceError: invalid PDB ID
            # UnicodeDecodeError: sporadic problem with mmtf library
            if raise_missing:
                raise

    return structures
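
# Usage sketch (illustrative only, not part of the module): load two
# structures, preferring a hypothetical local cache directory. With
# raise_missing=False, IDs that cannot be loaded are silently skipped.
def _example_load_structures():
    structures = load_structures(
        ["5P21", "1AKE"],
        structure_dir="structures/",
        raise_missing=False
    )
    # keys are lower-case PDB identifiers, e.g. "5p21"
    return sorted(structures)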
def create_archive(config, outcfg, output_file):
    """
    Create archive of files generated by pipeline

    Parameters
    ----------
    config : dict-like
        Input configuration of job. Uses
        config["management"]["archive"] (list of keys
        used to index outcfg) to determine
        which files should be added to archive
    outcfg : dict-like
        Output configuration of job
    output_file : str
        Store archive file to this path
    """
    # determine keys (corresponding to files) in
    # outcfg that should be stored
    outkeys = config.get("management", {}).get("archive", None)

    # if no output keys are requested, nothing to do
    if outkeys is None or len(outkeys) == 0:
        return

    # create archive
    with tarfile.open(output_file, "w:gz") as tar:
        # add files based on keys one by one
        for k in outkeys:
            # skip missing keys or ones not defined
            if k not in outcfg or outcfg[k] is None:
                continue

            # distinguish between files and lists of files
            if k.endswith("files"):
                for f in outcfg[k]:
                    if valid_file(f):
                        tar.add(f)
            else:
                if valid_file(outcfg[k]):
                    tar.add(outcfg[k])
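
# Usage sketch (illustrative only): archive one single file and one list of
# files. Keys and paths are hypothetical; note the convention that keys
# ending in "files" are treated as lists of paths.
def _example_create_archive():
    config = {"management": {"archive": ["ec_file", "alignment_files"]}}
    outcfg = {
        "ec_file": "job_CouplingScores.csv",
        "alignment_files": ["job.a2m", "job_raw.fasta"],
    }
    create_archive(config, outcfg, "job_archive.tar.gz")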
def __init__(self, sifts_table_file, sequence_file=None):
    """
    Create new SIFTS mapper from mapping table.

    Note that creation of the mapping files, if not existing,
    takes a while.

    Parameters
    ----------
    sifts_table_file : str
        Path to *corrected* SIFTS pdb_chain_uniprot.csv
        To generate this file, point to an empty file path.
    sequence_file : str, optional (default: None)
        Path to file containing all UniProt sequences
        in SIFTS (used for homology-based identification
        of structures).
        Note: This file can be created using the
        create_sequence_file() method.
    """
    # test if table exists, if not, download and modify
    if not valid_file(sifts_table_file):
        self._create_mapping_table(sifts_table_file)

    self.table = pd.read_csv(sifts_table_file, comment="#")

    # the final table still has some entries where lengths
    # do not match, remove these
    self.table = self.table.query(
        "(resseq_end - resseq_start) == (uniprot_end - uniprot_start)"
    )

    self.sequence_file = sequence_file

    # if path for sequence file given, but not there, create
    if sequence_file is not None and not valid_file(sequence_file):
        self.create_sequence_file(sequence_file)

    # add UniProt ID column if we have sequence mapping
    # from FASTA file
    if self.sequence_file is not None:
        self._add_uniprot_ids()
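
# Usage sketch (illustrative only): create a SIFTS mapper. Paths are
# hypothetical, and the import location is assumed; if the table file does
# not exist yet, the corrected mapping table (and, if requested, the
# sequence file) is generated first, which can take a while.
def _example_sifts_mapper():
    from evcouplings.compare import SIFTS  # assumed import path

    return SIFTS(
        "data/pdb_chain_uniprot.csv",
        sequence_file="data/sifts_uniprot_sequences.fa"
    )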
def _protein_monomer_plot(ali_table, data):
    """
    Plot summary statistics for a protein_monomer run across
    domain inclusion thresholds (coverage, sequence identity
    and gap distributions, significant ECs, EC precision)

    Parameters
    ----------
    ali_table : pandas.DataFrame
        Job statistics table, one row per subjob
    data : dict
        Mapping from (domain_threshold, subjob prefix) to
        dict with result files of that subjob (keys:
        identities, frequencies, minimum_column_coverage)

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure with summary plots
    """
    import seaborn as sns
    sns.set_palette("Paired", len(ali_table), None)
    FONTSIZE = 16

    # set up plot and grid
    fig = plt.figure(figsize=(15, 15))
    gridsize = (3, 2)
    ax_cov = plt.subplot2grid(gridsize, (0, 0), colspan=1)
    ax_distr = plt.subplot2grid(gridsize, (0, 1), colspan=1)
    ax_gaps = plt.subplot2grid(gridsize, (1, 0), colspan=2)
    ax_sig = plt.subplot2grid(gridsize, (2, 0), colspan=1)
    ax_comp = plt.subplot2grid(gridsize, (2, 1), colspan=1)

    # 1) Number of sequences, coverage
    l_seqs = ax_cov.plot(
        ali_table.domain_threshold,
        ali_table.N_eff / ali_table.num_cov,
        "ok-", label="# Sequences"
    )
    ax_cov.set_xlabel("Domain inclusion threshold")
    ax_cov.set_ylabel("# effective sequences / L")
    ax_cov.set_title("Sequences and coverage", fontsize=FONTSIZE)
    ax_cov.legend(loc="lower left")

    ax_cov2 = ax_cov.twinx()
    l_cov = ax_cov2.plot(
        ali_table.domain_threshold,
        ali_table.num_cov / ali_table.seqlen,
        "o-", label="Coverage", color="#2079b4"
    )
    ax_cov2.set_ylabel("Coverage (% of region)")
    ax_cov2.legend(loc="lower right")
    ax_cov2.set_ylim(0, 1)

    # 2) sequence identity & coverage distributions
    for (domain_threshold, subjob), subdata in sorted(data.items()):
        # sequence identities to query
        if valid_file(subdata["identities"]):
            ids = pd.read_csv(subdata["identities"]).identity_to_query.dropna()

            # note: the former "normed" keyword was removed from
            # matplotlib, "density" is the replacement
            ax_distr.hist(
                ids, histtype="step", range=(0, 1.0), bins=100,
                density=True, cumulative=True, linewidth=3,
                label=str(domain_threshold)
            )

            ali_table.loc[
                ali_table.prefix == subjob, "average_identity"
            ] = ids.mean()

        # coverage distribution
        if valid_file(subdata["frequencies"]):
            freqs = pd.read_csv(subdata["frequencies"])
            ax_gaps.plot(
                freqs.i, 1 - freqs.loc[:, "-"], "o",
                linewidth=3, label=str(domain_threshold)
            )

            mincov = subdata["minimum_column_coverage"]
            if mincov > 1:
                mincov /= 100

            ax_gaps.axhline(mincov, ls="--", color="k")

    ax_distr.set_xlabel("% sequence identity to query")
    ax_distr.set_title("Sequence identity distribution", fontsize=FONTSIZE)
    ax_distr.set_xlim(0, 1)
    ax_distr.set_ylim(0, 1)
    ax_distr.legend()

    ax_gaps.set_title("Gap statistics", fontsize=FONTSIZE)
    ax_gaps.set_xlabel("Sequence index")
    ax_gaps.set_ylabel("Column coverage (1 - % gaps)")
    ax_gaps.autoscale(enable=True, axis="x", tight=True)
    ax_gaps.set_ylim(0, 1)
    ax_gaps.legend(loc="best")

    # number of significant ECs, EC precision
    if "num_significant" in ali_table.columns:
        ax_sig.plot(
            ali_table.domain_threshold,
            ali_table.num_significant / ali_table.num_cov,
            "ok-"
        )
    ax_sig.set_title("Significant ECs", fontsize=FONTSIZE)
    ax_sig.set_xlabel("Domain inclusion threshold")
    ax_sig.set_ylabel("Fraction of significant ECs (% of L)")

    if "precision" in ali_table.columns:
        ax_comp.plot(ali_table.domain_threshold, ali_table.precision, "ok-")
    ax_comp.set_title("Comparison to 3D (top L ECs)", fontsize=FONTSIZE)
    ax_comp.set_xlabel("Domain inclusion threshold")
    ax_comp.set_ylabel("EC precision")
    ax_comp.set_ylim(0, 1)

    return fig
def jackhmmer_search(**kwargs):
    """
    Protocol:

    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "sequence_id", "sequence_file",
            "sequence_download_url", "region", "first_index",
            "use_bitscores", "domain_threshold", "sequence_threshold",
            "database", "iterations", "cpu", "nobias", "reuse_alignment",
            "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
            "extract_annotation"
        ]
    )
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"],
        kwargs["sequence_file"],
        kwargs["sequence_download_url"],
        full_sequence_file
    )

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(
        full_seq,
        kwargs["sequence_id"],
        kwargs["region"],
        kwargs["first_index"],
        target_sequence_file
    )

    # run jackhmmer... allow to reuse pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist",
            ali["alignment"], ali["domtblout"]
        )
    else:
        # otherwise, we have to run the alignment;
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"],
            kwargs["sequence_threshold"],
            kwargs["domain_threshold"],
            len(cut_seq)
        )

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use /dev/null option of jackhmmer function
        # to make no assumption about operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment(
            "aa", kwargs["sequence_id"],
            region_start, region_end,
            range(region_start, region_end + 1)
        ).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(
        kwargs["sequence_id"], region_start, region_end
    )

    return outcfg
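
# Usage sketch (illustrative only): a minimal kwargs set for
# jackhmmer_search(), mirroring the check_required list above. All values,
# paths and the download URL are hypothetical; note that the "database"
# entry names another kwarg holding the actual database path, which is
# resolved via kwargs[kwargs["database"]].
def _example_jackhmmer_search():
    return jackhmmer_search(
        prefix="output/job",
        sequence_id="RASH_HUMAN",
        sequence_file=None,  # fetch via download URL instead
        sequence_download_url="https://rest.uniprot.org/uniprotkb/{}.fasta",
        region=None,
        first_index=1,
        use_bitscores=True,
        domain_threshold=0.5,
        sequence_threshold=0.5,
        database="uniref90",
        uniref90="databases/uniref90.fasta",
        iterations=5,
        cpu=2,
        nobias=False,
        reuse_alignment=True,
        checkpoints_hmm=False,
        checkpoints_ali=False,
        jackhmmer="jackhmmer",
        extract_annotation=True,
    )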
def _make_hmmsearch_raw_fasta(alignment_result, prefix):
    """
    HMMsearch results do not contain the query sequence,
    so we must construct a raw_fasta file with the query
    sequence as the first hit, to ensure proper numbering.
    The search result is filtered to only contain the columns
    with match states to the HMM, which have a one-to-one
    mapping to the query sequence.

    Parameters
    ----------
    alignment_result : dict
        Alignment result dictionary, output by run_hmmsearch
    prefix : str
        Prefix for file creation

    Returns
    -------
    str
        Path to raw focus alignment file
    """
    def _add_gaps_to_query(query_sequence_ali, ali):
        # get the index of columns that do not contain match states
        # (match states are indicated by an x)
        gap_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x != "x"
        ]
        # get the index of columns that contain match states
        match_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x"
        ]

        # ensure that the length of the match states
        # matches the length of the sequence
        if len(match_index) != query_sequence_ali.L:
            raise ValueError(
                "HMMsearch result {} does not have a one-to-one"
                " mapping to the query sequence columns".format(
                    alignment_result["raw_alignment_file"]
                )
            )

        gapped_query_sequence = ""
        seq = list(query_sequence_ali.matrix[0, :])

        # loop through every position in the HMMsearch hits
        for i in range(len(ali.annotation["GC"]["RF"])):
            # if that position should be a gap, add a gap
            if i in gap_index:
                gapped_query_sequence += "-"
            # if that position should be a letter, pop the next
            # letter in the query sequence
            else:
                gapped_query_sequence += seq.pop(0)

        new_sequence_ali = Alignment.from_dict({
            query_sequence_ali.ids[0]: gapped_query_sequence
        })
        return new_sequence_ali

    # open the sequence file
    with open(alignment_result["target_sequence_file"]) as a:
        query_sequence_ali = Alignment.from_file(a, format="fasta")

    # if the provided alignment is empty, just return the target sequence
    raw_focus_alignment_file = prefix + "_raw.fasta"
    if not valid_file(alignment_result["raw_alignment_file"]):
        # write the query sequence to a fasta file
        with open(raw_focus_alignment_file, "w") as of:
            query_sequence_ali.write(of)

        # return the path to the alignment file
        return raw_focus_alignment_file

    # else, open the HMM search result
    with open(alignment_result["raw_alignment_file"]) as a:
        ali = Alignment.from_file(a, format="stockholm")

    # make sure that the stockholm alignment contains the match annotation
    if not ("GC" in ali.annotation and "RF" in ali.annotation["GC"]):
        raise ValueError(
            "Stockholm alignment {} missing RF"
            " annotation of match states".format(
                alignment_result["raw_alignment_file"]
            )
        )

    # add insertions to the query sequence in order to preserve correct
    # numbering of match sequences
    gapped_sequence_ali = _add_gaps_to_query(query_sequence_ali, ali)

    # write a new alignment file with the query sequence as
    # the first entry
    with open(raw_focus_alignment_file, "w") as of:
        gapped_sequence_ali.write(of)
        ali.write(of)

    return raw_focus_alignment_file
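
# Worked toy example (illustrative only) of the gap-insertion logic in
# _add_gaps_to_query: for match-state annotation "xx.x", the query "ACD"
# becomes "AC-D", i.e. a gap is emitted at every non-match column.
def _example_add_gaps():
    rf = "xx.x"
    seq = list("ACD")
    gapped = "".join("-" if c != "x" else seq.pop(0) for c in rf)
    assert gapped == "AC-D"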
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and
    search against a sequence database using hmmsearch.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is the starting point of the pipeline;
        # check if input alignment actually exists
        verify_resources(
            "Input alignment does not exist",
            input_alignment_file
        )

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file)
                )

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined"
            )

        # first, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id
                )
            )

        # identify what (non-gap) columns to keep for focus;
        # this should be all columns in the raw_focus_alignment_file,
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given."
            )

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)
        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for the hhfilter run,
        # because the target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the gap threshold for inclusion in HMMs built by hmmbuild
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(
        kwargs,
        [
            "prefix", "sequence_id", "alignment_file",
            "use_bitscores", "domain_threshold", "sequence_threshold",
            "database", "cpu", "nobias", "reuse_alignment",
            "hmmbuild", "hmmsearch"
        ]
    )
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow to reuse pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist",
            ali["alignment"], ali["domtblout"]
        )
    else:
        # otherwise, we have to run the alignment;
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"],
            kwargs["sequence_threshold"],
            kwargs["domain_threshold"],
            sequence_length
        )

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # only item from hmmsearch_result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment(
            "aa", kwargs["sequence_id"],
            region_start, region_end,
            range(region_start, region_end + 1)
        ).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(
        kwargs["sequence_id"], region_start, region_end
    )

    return outcfg
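
# Illustrative note on the focus header convention used above: focus
# sequences are named "<id>/<region_start>-<region_end>". A toy sketch of
# the round trip (the real parsing is done by parse_header):
def _example_focus_header():
    header = "{}/{}-{}".format("RASH_HUMAN", 1, 166)
    id_, region = header.split("/")
    start, end = map(int, region.split("-"))
    assert (id_, start, end) == ("RASH_HUMAN", 1, 166)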
def infer_plmc(**kwargs):
    """
    Run EC computation on alignment. This function contains
    the functionality shared between monomer and complex EC
    inference.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # the following are passed through stage...
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not set it as a
        # plmc parameter: the plmc default has some implementation
        # advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq
            if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )
    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(iter_table_file)

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_valid_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"],
            outcfg["region_start"],
            *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    return outcfg, ecs, segments
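
# Worked example (illustrative only) of the lambda_J_times_Lq scaling above:
# with the 21-letter protein alphabet (q = 21, gap included) and a model
# length of L = 100, lambda_J is multiplied by (q - 1) * (L - 1).
def _example_lambda_scaling():
    q, L, lambda_J = 21, 100, 0.01
    scaled = lambda_J * (q - 1) * (L - 1)
    assert abs(scaled - 19.8) < 1e-9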
def create_archive(config, outcfg, prefix):
    """
    Create archive of files generated by pipeline

    Parameters
    ----------
    config : dict-like
        Input configuration of job. Uses
        config["management"]["archive"] (list of keys
        used to index outcfg) to determine
        which files should be added to archive
    outcfg : dict-like
        Output configuration of job
    prefix : str
        Prefix of job, will be used to define archive
        file path (prefix + archive type-specific extension)
    """
    # allowed output archive formats
    ALLOWED_FORMATS = ["targz", "zip"]

    # determine selected output format, default is .tar.gz
    archive_format = config.get("management", {}).get("archive_format", "targz")

    # determine keys (corresponding to files) in
    # outcfg that should be stored
    archive_keys = config.get("management", {}).get("archive", None)

    # if no files selected for archiving, return immediately
    if archive_keys is None:
        return

    # check if selected format is valid
    if archive_format not in ALLOWED_FORMATS:
        raise InvalidParameterError(
            "Invalid format for output archive: {}. ".format(archive_format) +
            "Valid options are: " + ", ".join(ALLOWED_FORMATS)
        )

    # create explicit list of files that would go into archive
    # and are valid files
    archive_files = [
        (file_path, file_key, idx)
        for (file_path, file_key, idx) in
        iterate_files(outcfg, subset=archive_keys)
        if valid_file(file_path)
    ]

    # if there are no files, don't create archive
    if len(archive_files) == 0:
        return

    if archive_format == "targz":
        final_archive_file = prefix + ".tar.gz"
        with tarfile.open(final_archive_file, "w:gz") as tar:
            for (file_path, file_key, idx) in archive_files:
                tar.add(file_path)
    elif archive_format == "zip":
        final_archive_file = prefix + ".zip"
        with zipfile.ZipFile(
            final_archive_file, "w", zipfile.ZIP_DEFLATED
        ) as zip_:
            for (file_path, file_key, idx) in archive_files:
                zip_.write(file_path)

    return final_archive_file
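
# Usage sketch (illustrative only): same config shape as the tar.gz variant,
# but selecting the zip format; the returned path is prefix + ".zip". All
# keys and paths are hypothetical.
def _example_create_zip_archive():
    config = {
        "management": {"archive": ["ec_file"], "archive_format": "zip"}
    }
    outcfg = {"ec_file": "output/job_CouplingScores.csv"}
    return create_archive(config, outcfg, "output/job")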
def run_jobs(configs, global_config, overwrite=False, workdir=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    overwrite : bool, optional (default: False)
        If True, allows overwriting results of a previous
        run with the same prefix
    workdir : str, optional (default: None)
        Workdir in which to run job (will combine
        workdir and prefix in joint path)
    """
    python = executable
    pipeline_path = path.abspath(pipeline.__file__)
    summarize_path = path.abspath(summarize.__file__)

    cmd_base = "{} {}".format(python, pipeline_path)
    summ_base = "{} {}".format(python, summarize_path)

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage)."
        )

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to working
    # directory (above, we allow submission from an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(
        summ_base,
        global_config["pipeline"],
        global_config["global"]["prefix"],
        " ".join(config_files)
    )

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(
        global_config["environment"]["engine"],
        db_path=out_prefix + "_job_database.txt"
    )

    # collect individual submitted jobs here
    commands = []

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # set job status in database to pending
        pipeline.update_job_status(job_cfg, status=database.EStatus.PEND)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            [
                "{} {}".format(cmd_base, job_cfg_file),
                summ_cmd
            ],
            name=job_prefix,
            environment=env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            }
        )

        # store job for later dependency creation
        commands.append(cmd)

        # finally, submit job
        submitter.submit(cmd)

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()
def cns_dgsa_fold(residues, ec_pairs, prefix, config_file=None,
                  secstruct_column="sec_struct_3state",
                  num_structures=20, min_cycles=5,
                  log_level=None, binary="cns"):
    """
    Predict 3D structure coordinates using distance geometry
    and simulated annealing-based folding protocol

    Parameters
    ----------
    residues : pandas.DataFrame
        Table containing positions (column i), residue
        type (column A_i), and secondary structure for
        each position
    ec_pairs : pandas.DataFrame
        Table with EC pairs that will be turned into distance
        restraints (with columns i, j, A_i, A_j)
    prefix : str
        Prefix for output files (can include directories).
        Folders will be created automatically.
    config_file : str, optional (default: None)
        Path to config file with folding settings. If None,
        will use default settings included in package
        (restraints.yml).
    secstruct_column : str, optional (default: sec_struct_3state)
        Column name in residues dataframe from which secondary
        structure will be extracted (has to be H, E, or C).
    num_structures : int, optional (default: 20)
        Number of trial structures to generate
    min_cycles : int, optional (default: 5)
        Number of minimization cycles at end of protocol
    log_level : {None, "quiet", "verbose"}, optional (default: None)
        Don't keep CNS log files, or switch to different degrees
        of verbosity ("verbose" needed to obtain violation information)
    binary : str, optional (default: "cns")
        Path of CNS binary

    Returns
    -------
    final_models : dict
        Mapping from model name to path of model
    """
    def _run_inp(inp_str, output_prefix):
        with open(output_prefix + ".inp", "w") as f:
            f.write(inp_str)

        if log_level is not None:
            log_file = output_prefix + ".log"
        else:
            log_file = None

        run_cns(inp_str, log_file=log_file, binary=binary)

    # make sure output directory exists
    create_prefix_folders(prefix)

    # CNS doesn't like paths above a certain length, so we
    # will change into working directory and keep paths short.
    # For this reason, extract path and filename prefix
    dir_, rootname = path.split(prefix)
    cwd = os.getcwd()

    if dir_ != "":
        os.chdir(dir_)

    # create restraints (EC pairs and secondary structure-based)
    ec_tbl = rootname + "_couplings.tbl"
    ss_dist_tbl = rootname + "_ss_distance.tbl"
    ss_angle_tbl = rootname + "_ss_angle.tbl"

    ec_dist_restraints(
        ec_pairs, ec_tbl, cns_dist_restraint, config_file
    )

    secstruct_dist_restraints(
        residues, ss_dist_tbl, cns_dist_restraint,
        config_file, secstruct_column
    )

    secstruct_angle_restraints(
        residues, ss_angle_tbl, cns_dihedral_restraint,
        config_file, secstruct_column
    )

    # create sequence file
    seq = "".join(residues.A_i)
    seq_file = rootname + ".seq"
    cns_seq_file(seq, seq_file)

    # set up input files for folding

    # make molecular topology file (will be written to mtf_file)
    mtf_file = rootname + ".mtf"
    _run_inp(
        cns_mtf_inp(
            seq_file, mtf_file,
            first_index=residues.i.min(),
            disulfide_bridges=None
        ),
        mtf_file
    )

    # make extended PDB file (will be in extended_file)
    extended_file = rootname + "_extended.pdb"
    _run_inp(cns_extended_inp(mtf_file, extended_file), extended_file)

    # fold using dg_sa protocol (filenames will have suffixes _1, _2, ...);
    # have to pass either quiet or verbose to CNS (but will not store
    # log file if log_level is None)
    if log_level is None:
        dgsa_log_level = "quiet"
    else:
        dgsa_log_level = log_level

    _run_inp(
        cns_dgsa_inp(
            extended_file, mtf_file, rootname,
            ec_tbl, ss_dist_tbl, ss_angle_tbl,
            num_structures=num_structures,
            log_level=dgsa_log_level
        ),
        rootname + "_dgsa"
    )

    # add hydrogen atoms and minimize (for all
    # generated candidate structures from dg_sa);
    # keep track of final predicted structures
    final_models = {}

    for i in range(1, num_structures + 1):
        input_root = "{}_{}".format(rootname, i)
        input_model = input_root + ".pdb"

        # check if we actually got the model from dg_sa
        if not valid_file(input_model):
            continue

        # run generate_easy protocol to add hydrogen atoms
        easy_pdb = input_root + "_h.pdb"
        easy_mtf = input_root + "_h.mtf"
        _run_inp(
            cns_generate_easy_inp(input_model, easy_pdb, easy_mtf),
            input_root + "_h"
        )

        # then minimize
        min_pdb = input_root + "_hMIN.pdb"
        _run_inp(
            cns_minimize_inp(easy_pdb, easy_mtf, min_pdb, num_cycles=min_cycles),
            input_root + "_hMIN"
        )

        if valid_file(min_pdb):
            final_models[min_pdb] = path.join(dir_, min_pdb)

    # change back into original directory
    os.chdir(cwd)

    return final_models
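
# Usage sketch (illustrative only): fold a toy four-residue peptide from two
# hypothetical EC pairs. A real run uses the tables produced by the folding
# stage and requires a working CNS installation.
def _example_cns_dgsa_fold():
    import pandas as pd

    residues = pd.DataFrame({
        "i": [1, 2, 3, 4],
        "A_i": ["M", "K", "L", "V"],
        "sec_struct_3state": ["C", "H", "H", "C"],
    })
    ec_pairs = pd.DataFrame({
        "i": [1, 2], "A_i": ["M", "K"],
        "j": [3, 4], "A_j": ["L", "V"],
    })
    return cns_dgsa_fold(
        residues, ec_pairs, "fold/job",
        num_structures=2, binary="cns"
    )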
def run_plmc(alignment, couplings_file, param_file=None,
             focus_seq=None, alphabet=None, theta=None,
             scale=None, ignore_gaps=False, iterations=None,
             lambda_h=None, lambda_J=None, lambda_g=None,
             cpu=None, binary="plmc"):
    """
    Run plmc on sequence alignment and store
    files with model parameters and pair couplings.

    Parameters
    ----------
    alignment : str
        Path to input sequence alignment
    couplings_file : str
        Output path for file with evolutionary couplings
        (folder will be created)
    param_file : str, optional (default: None)
        Output path for binary file containing model
        parameters (folder will be created)
    focus_seq : str, optional (default: None)
        Name of focus sequence, if None, non-focus mode
        will be used
    alphabet : str, optional (default: None)
        Alphabet for model inference. If None, standard
        amino acid alphabet including gap will be used.
        First character in string corresponds to gap
        character (relevant for ignore_gaps).
    theta : float, optional (default: None)
        Sequences with pairwise identity >= theta
        will be clustered and their sequence weights
        downweighted as 1 / num_cluster_members.
        Important: Note that plmc will be parametrized
        using 1 - theta. If None, default value
        in plmc will be used, which corresponds to
        theta=0.8 (plmc setting 0.2).
    scale : float, optional (default: None)
        Scale weights of clusters by this value.
        If None, default value in plmc (1.0) will be used.
    ignore_gaps : bool, optional (default: False)
        Exclude gaps from parameter inference. Gap
        character is first character of alphabet
        parameter.
    iterations : int, optional (default: None)
        Maximum iterations for optimization.
    lambda_h : float, optional (default: None)
        l2 regularization strength on fields.
        If None, plmc default will be used.
    lambda_J : float, optional (default: None)
        l2-regularization strength on couplings.
        If None, plmc default will be used.
    lambda_g : float, optional (default: None)
        group l1-regularization strength on couplings.
        If None, plmc default will be used.
    cpu : int, optional (default: None)
        Number of cores to use for running plmc.
        Note that plmc has to be compiled in openmp
        mode to be runnable with multiple cores.
        Can also be set to "max".
    binary : str, optional (default: "plmc")
        Path to plmc binary

    Returns
    -------
    PlmcResult
        namedtuple containing output files and
        parsed fields from console output of plmc

    Raises
    ------
    ExternalToolError
    """
    create_prefix_folders(couplings_file)

    # make sure input alignment exists
    verify_resources(
        "Alignment file does not exist", alignment
    )

    cmd = [
        binary,
        "-c", couplings_file,
    ]

    # store eij file if explicitly requested
    if param_file is not None:
        create_prefix_folders(param_file)
        cmd += ["-o", param_file]

    # focus sequence mode and ID
    if focus_seq is not None:
        # TODO: for now split exclude sequence
        # region from focus seq name, otherwise
        # plmc does not remap names. If this
        # behaviour changes in plmc, remove the
        # following line.
        focus_seq = focus_seq.split("/")[0]
        cmd += ["-f", focus_seq]

    # exclude gaps from calculation?
    if ignore_gaps:
        cmd += ["-g"]

    # maximum number of iterations, can also be "max"
    if iterations is not None:
        cmd += ["-m", str(iterations)]

    # set custom alphabet
    # (first character is gap by default in nogap mode)
    if alphabet is not None:
        cmd += ["-a", alphabet]

    # sequence reweighting
    if theta is not None:
        # transform into plmc convention (1-theta)
        theta = 1.0 - theta
        cmd += ["-t", str(theta)]

    # cluster weight
    if scale is not None:
        cmd += ["-s", str(scale)]

    # L2 regularization weight for fields
    if lambda_h is not None:
        cmd += ["-lh", str(lambda_h)]

    # L2 regularization weight for pair couplings
    if lambda_J is not None:
        cmd += ["-le", str(lambda_J)]

    # Group L1 regularization weight for pair couplings
    if lambda_g is not None:
        cmd += ["-lg", str(lambda_g)]

    # Number of cores to use for calculation
    if cpu is not None:
        cmd += ["-n", str(cpu)]

    # finally also add input alignment (main parameter)
    cmd += [alignment]

    # TODO: for now do not check returncode because sometimes
    # returncode == -11 (segfault) despite successful calculation
    return_code, stdout, stderr = run(cmd, check_returncode=False)

    # TODO: remove this segfault-hunting output once fixed
    if return_code != 0:
        # if not a segfault, still raise exception
        if return_code != -11:
            from evcouplings.utils.system import ExternalToolError
            raise ExternalToolError(
                "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                    cmd, return_code, stdout, stderr
                )
            )

        print("PLMC NON-ZERO RETURNCODE:", return_code)
        print(cmd)
        print(" ".join(cmd))
        print("stdout:", stdout)
        print("stderr:", stderr)

    iter_df, out_fields = parse_plmc_log(stderr)

    # also check we actually calculated couplings...
    if not valid_file(couplings_file):
        raise ResourceError(
            "plmc returned no couplings: stdout={} stderr={} file={}".format(
                stdout, stderr, couplings_file
            )
        )

    # ... and parameter file, if requested
    if param_file and not valid_file(param_file):
        raise ResourceError(
            "plmc returned no parameter file: stdout={} stderr={} file={}".format(
                stdout, stderr, param_file
            )
        )

    return PlmcResult(
        couplings_file, param_file,
        iter_df, *out_fields
    )
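
# Usage sketch (illustrative only): run plmc in focus mode on an alignment.
# Paths and values are hypothetical; note that theta=0.8 is passed to the
# plmc binary as -t 0.2, following the 1 - theta convention described above.
def _example_run_plmc():
    return run_plmc(
        "align/job.a2m",
        "couplings/job_ECs.txt",
        param_file="couplings/job.model",
        focus_seq="RASH_HUMAN/1-166",
        theta=0.8,
        lambda_h=0.01,
        lambda_J=19.8,
        cpu=2,
        binary="plmc",
    )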
def standard(**kwargs):
    """
    Protocol:

    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction;
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query(
        "abs(i - j) > {}".format(kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get from EC values (could be misleading if
            # EC list is truncated, so only second option)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs, "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of plots to make
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (ecs_fold.iloc[:c], "_{}".format(c))
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory; after folding,
    # final models will be moved to main folding dir. Depending
    # on cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map back to original key from remapped_pdb_files
            # as a key for this list
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
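
# Worked example (illustrative only) of the EC-count sub-run grid above:
# float parameters are read as fractions of the number of sites, integers
# as absolute EC counts. With 100 sites, lowest=0.5, highest=1.25 and
# increase=5 yields folding runs with 50, 55, ..., 125 ECs.
def _example_ec_count_grid():
    from math import ceil

    num_sites = 100

    def _discrete_count(x):
        if isinstance(x, float):
            x = ceil(x * num_sites)
        return int(x)

    lowest, highest, step = map(_discrete_count, (0.5, 1.25, 5))
    counts = list(range(lowest, highest + 1, step))
    assert counts[0] == 50 and counts[-1] == 125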
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file)
        )

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # if alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123): {}".format(region)
            )

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]

        # check if we have a predefined sequence database;
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and \
            kwargs.get("evalues", None) is not None:
        raise InvalidParameterError(
            "Can not specify bitscore and E-value threshold at the same time."
        )

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [
                (float(t) if "." in t else int(t)) for t in T
            ]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds)
            )

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
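
# Worked example (illustrative only) of the threshold handling above: a
# comma-separated bitscore list expands into one "batch" sub-config per
# threshold, with subjob suffixes "_b<t>" (or "_e<t>" for E-values).
def _example_threshold_batch():
    thresholds, bitscore = "0.3, 0.5", True
    T = thresholds.replace(" ", "").split(",")
    x_cast = [float(t) if "." in t else int(t) for t in T]
    batch = {
        ("_b" if bitscore else "_e") + str(t): {
            "align": {"domain_threshold": t, "sequence_threshold": t}
        }
        for t in x_cast
    }
    assert set(batch) == {"_b0.3", "_b0.5"}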
def run_jobs(configs, global_config, overwrite=False, workdir=None,
             abort_on_error=True, environment=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    overwrite : bool, optional (default: False)
        If True, allows overwriting previous run of the same
        config, otherwise will fail if results from previous
        execution are present
    workdir : str, optional (default: None)
        Workdir in which to run job (will combine
        workdir and prefix in joint path)
    abort_on_error : bool, optional (default: True)
        Abort entire job submission if error occurs for one
        of the jobs by propagating RuntimeError
    environment : str, optional (default: None)
        Allow to pass value for environment parameter of
        submitter, will override environment.configuration
        from global_config (e.g., for setting environment
        variables like passwords)

    Returns
    -------
    job_ids : dict
        Mapping from subjob prefix (keys in configs parameter)
        to identifier returned by submitter for each of the jobs
        that was *successfully* submitted (i.e. missing keys
        from configs param indicate these jobs could not be
        submitted).

    Raises
    ------
    RuntimeError
        If error encountered during submission and
        abort_on_error is True
    """
    cmd_base = environ.get("EVCOUPLINGS_RUNCFG_APP") or "evcouplings_runcfg"
    summ_base = environ.get(
        "EVCOUPLINGS_SUMMARIZE_APP"
    ) or "evcouplings_summarize"

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage)."
        )

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to working
    # directory (above, we allow submission from an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(
        summ_base,
        global_config["pipeline"],
        global_config["global"]["prefix"],
        " ".join(config_files)
    )

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(
        global_config["environment"]["engine"],
        db_path=out_prefix + "_job_database.txt"
    )

    # collect individual submitted jobs here
    commands = []

    # record subjob IDs returned by submitter for each job
    job_ids = {}

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            [
                "{} {}".format(cmd_base, job_cfg_file),
                summ_cmd
            ],
            name=job_prefix,
            environment=environment or env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            }
        )

        # store job for later dependency creation
        commands.append(cmd)

        tracker = get_result_tracker(job_cfg)

        try:
            # finally, submit job
            current_job_id = submitter.submit(cmd)

            # store run identifier returned by submitter
            # TODO: consider storing current_job_id using tracker right away
            job_ids[job] = current_job_id

            # set job status in database to pending
            tracker.update(status=EStatus.PEND)
        except RuntimeError as e:
            # set job as failed in database
            tracker.update(status=EStatus.FAIL, message=str(e))

            # fail entire job submission if requested
            if abort_on_error:
                raise

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()

    # return job identifiers
    return job_ids
def protein_complex(prefix, configs): """ Create results summary for run using protein_complex pipeline """ # TODO: this is only designed to work with skewnormal threshold MIN_PROBABILITY = 0.9 # number of inter ECs to check for precision NUM_INTER = 5 # TODO: create segments global variable and import FIRST_SEGMENT = "A_1" SECOND_SEGMENT = "B_1" ali_table = pd.DataFrame() prefix_to_cfgs = {} data = defaultdict(lambda: defaultdict()) # go through all config files for cfg_file in configs: # check if the file exists and has contents # since run might not yet have finished or crashed if valid_file(cfg_file): # job input configuration C = read_config_file(cfg_file) sub_prefix = C["global"]["prefix"] sub_index = (sub_prefix) final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX if not valid_file(final_state_cfg): continue # read final output state of job R = read_config_file(final_state_cfg) data[sub_index]["identities"] = R["identities_file"] data[sub_index]["frequencies"] = R["frequencies_file"] data[sub_index]["minimum_column_coverage"] = C["concatenate"][ "minimum_column_coverage"] stat_file = R["statistics_file"] ec_file = R.get("ec_file", "") ec_comp_file = R.get("ec_compared_longrange_file", "") concat_stat_file = R.get("concatentation_statistics_file", "") first_stat_file = R.get("first_statistics_file", "") second_stat_file = R.get("second_statistics_file", "") prefix_to_cfgs[(sub_prefix)] = (C, R) # read and modify alignment statistics if valid_file(stat_file): # get alignment stats for current job stat_df = pd.read_csv(stat_file) n_eff = R["effective_sequences"] if n_eff is not None: stat_df.loc[0, "N_eff"] = n_eff L = stat_df.loc[0, "num_cov"] # try to get concatenation statistics in addition if valid_file(concat_stat_file): concat_stat_df = pd.read_csv(concat_stat_file) # get and save n sequences per monomer aln n_seqs_1 = concat_stat_df.loc[0, "num_seqs_1"] n_seqs_2 = concat_stat_df.loc[0, "num_seqs_2"] stat_df.loc[0, "first_n_seqs"] = int(n_seqs_1) stat_df.loc[0, "second_n_seqs"] = int(n_seqs_2) # get and save median n paralogs per monomer aln n_paralogs_1 = concat_stat_df.loc[ 0, "median_num_per_species_1"] n_paralogs_2 = concat_stat_df.loc[ 0, "median_num_per_species_2"] stat_df.loc[0, "median_num_per_species_1"] = n_paralogs_1 stat_df.loc[0, "median_num_per_species_2"] = n_paralogs_2 # try to get number of significant ECs in addition if valid_file(ec_file): ecs = pd.read_csv(ec_file) #number of significant monomer Ecs min_seq_dist = C["compare"]["min_sequence_distance"] num_sig = len( ecs.query( "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY" )) # number of inter-protein ECs significant num_sig_inter = len( ecs.query( "segment_i != segment_j and probability >= @MIN_PROBABILITY" )) stat_df.loc[0, "num_significant"] = int(num_sig) #rank of top inter contact top_inter_rank = ecs.query( "segment_i != segment_j").index[0] stat_df.loc[0, "top_inter_rank"] = int(top_inter_rank) # try to get EC precision in addition if valid_file(ec_comp_file): ec_comp = pd.read_csv(ec_comp_file) ec_comp_1 = ec_comp.query( "segment_i == segment_j == @FIRST_SEGMENT") ec_comp_2 = ec_comp.query( "segment_i == segment_j == @SECOND_SEGMENT") ec_comp_inter = ec_comp.query("segment_i != segment_j") # use the monomer statistics files to figure out how many sites in each monomer if valid_file(first_stat_file) and valid_file( second_stat_file): stats_1 = pd.read_csv(first_stat_file) L_1 = L = stats_1.loc[0, "num_cov"] stats_2 = pd.read_csv(second_stat_file) L_2 = L = stats_2.loc[0, "num_cov"] # 
                        # precision of monomer 1
                        stat_df.loc[0, "first_monomer_precision"] = \
                            ec_comp_1.iloc[L_1]["segmentwise_precision"]

                        # precision of monomer 2
                        stat_df.loc[0, "second_monomer_precision"] = \
                            ec_comp_2.iloc[L_2]["segmentwise_precision"]

                    # precision of top NUM_INTER inter-protein ECs
                    stat_df.loc[0, "inter_precision"] = \
                        ec_comp_inter.iloc[NUM_INTER]["segmentwise_precision"]

                # finally, append to global table
                ali_table = pd.concat([ali_table, stat_df])

    # save alignment statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
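# Minimal sketch of the rank-based precision lookup performed above: given a
# compared-ECs table with "segment_i"/"segment_j" and "segmentwise_precision"
# columns, report precision among the top-ranked inter-protein ECs. The
# function name, file argument, and default cutoff are illustrative
# assumptions, not part of the pipeline API.
def _example_inter_precision(ec_comp_file, num_inter=5):
    ec_comp = pd.read_csv(ec_comp_file)

    # restrict to inter-protein pairs (rows where segments differ)
    ec_comp_inter = ec_comp.query("segment_i != segment_j")

    # precision at rank num_inter (mirrors the .iloc[NUM_INTER] lookup above)
    return ec_comp_inter.iloc[num_inter]["segmentwise_precision"]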
def protein_monomer(prefix, configs):
    """
    Create results summary for run using protein_monomer pipeline

    # TODO
    """
    MIN_PROBABILITY = 0.9

    ali_table = pd.DataFrame()
    prefix_to_cfgs = {}
    data = defaultdict(lambda: defaultdict())

    # go through all config files
    for cfg_file in configs:
        # check if the file exists and has contents,
        # since the run might not have finished yet or crashed
        if valid_file(cfg_file):
            # job input configuration
            C = read_config_file(cfg_file)
            sub_prefix = C["global"]["prefix"]
            domain_threshold = C["align"]["domain_threshold"]
            sub_index = (domain_threshold, sub_prefix)

            final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX
            if not valid_file(final_state_cfg):
                continue

            # read final output state of job
            R = read_config_file(final_state_cfg)
            data[sub_index]["identities"] = R["identities_file"]
            data[sub_index]["frequencies"] = R["frequencies_file"]
            data[sub_index]["minimum_column_coverage"] = C["align"][
                "minimum_column_coverage"
            ]

            stat_file = R["statistics_file"]
            ec_file = R.get("ec_file", "")
            ec_comp_file = R.get("ec_compared_longrange_file", "")

            prefix_to_cfgs[sub_prefix] = (C, R)

            # read and modify alignment statistics
            if valid_file(stat_file):
                # get alignment stats for current job
                stat_df = pd.read_csv(stat_file)
                n_eff = R["effective_sequences"]

                if n_eff is not None:
                    stat_df.loc[0, "N_eff"] = n_eff

                stat_df.loc[0, "domain_threshold"] = domain_threshold
                L = stat_df.loc[0, "num_cov"]

                # try to get number of significant ECs in addition
                if valid_file(ec_file):
                    ecs = pd.read_csv(ec_file)
                    min_seq_dist = C["compare"]["min_sequence_distance"]
                    num_sig = len(ecs.query(
                        "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY"
                    ))
                    stat_df.loc[0, "num_significant"] = num_sig

                # try to get EC precision in addition
                if valid_file(ec_comp_file):
                    ec_comp = pd.read_csv(ec_comp_file)
                    stat_df.loc[0, "precision"] = ec_comp.iloc[L]["precision"]

                # finally, append to global table
                ali_table = pd.concat([ali_table, stat_df])

    # sort table by sequence search threshold
    ali_table = ali_table.sort_values(by="domain_threshold")

    # when saving files, have to acquire lock to make sure
    # jobs don't start overwriting results

    # make plots and save
    fig = _protein_monomer_plot(ali_table, data)
    plot_file = prefix + "_job_statistics_summary.pdf"
    lock_plot = filelock.FileLock(plot_file)
    with lock_plot:
        fig.savefig(plot_file, bbox_inches="tight")

    # save alignment statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
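# Sketch of the file-locking pattern used by both summarizers above: take a
# filelock on the output path before writing, so concurrently finishing
# subjobs do not interleave their writes. The function name and default table
# path are assumptions for illustration; a sidecar ".lock" path is another
# common choice for the lock file.
def _example_locked_write(df, table_file="example_summary.csv"):
    lock = filelock.FileLock(table_file)
    with lock:
        df.to_csv(table_file, index=False, float_format="%.3f")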
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "lambda_J_times_Lq", "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance",
            # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it on as a
        # plmc parameter, since it is the default and has some
        # implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)
    # run plmc, or reuse pre-existing results from a previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to rerun; reusing is only possible if previous
    # results were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but file is empty or "
            "does not exist",
            *required_files
        )
    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(iter_table_file)

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments only,
    # since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"),
            (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(
                ecs_enriched, pml_file, sphere_view=sphere_view
            )
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # load parameters
            c = CouplingsModel(outcfg["model_file"])

            # create JSON output and write to file
            f.write(
                evzoom_json(c) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
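# Worked toy example (all numbers are illustrative assumptions, not from a
# real run) of the lambda_J scaling applied in standard() when
# lambda_J_times_Lq is set: the coupling penalty is multiplied by
# (q - 1) * (L - 1), compensating for the much larger number of J_ij
# parameters relative to the fields h_i. The function name is hypothetical.
def _example_lambda_j_scaling(lambda_j=0.01, num_symbols=21, length=150,
                              ignore_gaps=False):
    # ignoring gaps removes one character from the alphabet, as in standard()
    if ignore_gaps:
        num_symbols -= 1

    # e.g. with the defaults above: 0.01 * (21 - 1) * (150 - 1) = 29.8
    return lambda_j * (num_symbols - 1) * (length - 1)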