def _modify_segments(seg_list, seg_prefix):
    # extract segments from list representation into objects
    segs = [
        Segment.from_list(s) for s in seg_list
    ]

    # update segment IDs
    for i, s in enumerate(segs, start=1):
        s.segment_id = "{}_{}".format(seg_prefix, i)

    return segs
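
# Illustrative usage sketch for _modify_segments (not called by the
# pipeline itself). The list layout ["aa", <sequence_id>, <start>, <end>,
# <positions>] is an assumption based on how Segment.to_list() is used
# elsewhere in this module; all values below are hypothetical.
def _example_modify_segments():
    seg_list = [
        ["aa", "FIRST_SEQ", 1, 100, list(range(1, 101))],
        ["aa", "SECOND_SEQ", 1, 80, list(range(1, 81))],
    ]
    segs = _modify_segments(seg_list, seg_prefix="B")
    # segment IDs are renumbered with the given prefix:
    # segs[0].segment_id == "B_1", segs[1].segment_id == "B_2"
    return segs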
def jackhmmer_search(**kwargs):
    """
    Protocol:

    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "sequence_id", "sequence_file",
            "sequence_download_url", "region", "first_index",
            "use_bitscores", "domain_threshold", "sequence_threshold",
            "database", "iterations", "cpu", "nobias", "reuse_alignment",
            "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
            "extract_annotation"
        ]
    )
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"],
        kwargs["sequence_file"],
        kwargs["sequence_download_url"],
        full_sequence_file
    )

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(
        full_seq,
        kwargs["sequence_id"],
        kwargs["region"],
        kwargs["first_index"],
        target_sequence_file
    )

    # run jackhmmer... allow to reuse pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine whether to rerun, only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist",
            ali["alignment"], ali["domtblout"]
        )
    else:
        # otherwise, we have to run the alignment;
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"],
            kwargs["sequence_threshold"],
            kwargs["domain_threshold"],
            len(cut_seq)
        )

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use /dev/null option of jackhmmer function
        # to make no assumption about operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment(
            "aa", kwargs["sequence_id"],
            region_start, region_end,
            range(region_start, region_end + 1)
        ).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(
        kwargs["sequence_id"], region_start, region_end
    )

    return outcfg
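
# Minimal usage sketch for jackhmmer_search, assuming it is driven
# directly rather than through the pipeline runner. All paths, the
# "uniref90" database key, the download URL, and the threshold values
# are hypothetical. Note the database indirection: kwargs["database"]
# names *another* kwargs key that holds the actual database path
# (resolved as kwargs[kwargs["database"]] above).
def _example_jackhmmer_search():
    return jackhmmer_search(
        prefix="output/example",
        sequence_id="RASH_HUMAN",       # hypothetical target
        sequence_file=None,             # fetch via download URL instead
        sequence_download_url="http://example.org/fetch/{}.fasta",
        region=None,                    # use full sequence
        first_index=1,
        use_bitscores=True,
        domain_threshold=0.5,           # bits per residue
        sequence_threshold=0.5,
        database="uniref90",            # indirection: resolved to key below
        uniref90="/databases/uniref90.fasta",
        iterations=5,
        cpu=2,
        nobias=False,
        reuse_alignment=True,
        checkpoints_hmm=False,
        checkpoints_ali=False,
        jackhmmer="jackhmmer",          # binary on PATH
        extract_annotation=True,
    )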
def modify_alignment(focus_ali, target_seq_index, target_seq_id,
                     region_start, **kwargs):
    """
    Apply pairwise identity filtering, fragment filtering, and exclusion
    of columns with too many gaps to a sequence alignment. Also generates
    files describing properties of the alignment such as frequency
    distributions, conservation, and "old-style" alignment statistics
    files.

    .. note::
        assumes focus alignment (otherwise unprocessed) as input.

    .. todo::
        come up with something more clever to filter fragments than
        fixed width (e.g. use 95% quantile of length distribution as
        reference point)

    Parameters
    ----------
    focus_ali : Alignment
        Focus-mode input alignment
    target_seq_index : int
        Index of target sequence in alignment
    target_seq_id : str
        Identifier of target sequence (without range)
    region_start : int
        Index of first sequence position in target sequence
    kwargs : See required arguments in source code

    Returns
    -------
    outcfg : Dict
        File products generated by the function:

        * alignment_file
        * statistics_file
        * frequencies_file
        * identities_file
        * raw_focus_alignment_file
    ali : Alignment
        Final processed alignment
    """
    check_required(
        kwargs,
        [
            "prefix", "seqid_filter", "hhfilter",
            "minimum_sequence_coverage", "minimum_column_coverage",
            "compute_num_effective_seqs", "theta",
        ]
    )

    prefix = kwargs["prefix"]

    create_prefix_folders(prefix)

    focus_fasta_file = prefix + "_raw_focus.fasta"

    outcfg = {
        "alignment_file": prefix + ".a2m",
        "statistics_file": prefix + "_alignment_statistics.csv",
        "frequencies_file": prefix + "_frequencies.csv",
        "identities_file": prefix + "_identities.csv",
        "raw_focus_alignment_file": focus_fasta_file,
    }

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if target_seq_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = target_seq_index
        indices[target_seq_index] = 0
        target_seq_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    # apply pairwise identity filter (using hhfilter)
    if kwargs["seqid_filter"] is not None:
        filtered_file = prefix + "_filtered.a3m"

        at.run_hhfilter(
            focus_fasta_file, filtered_file,
            threshold=kwargs["seqid_filter"],
            columns="first", binary=kwargs["hhfilter"]
        )

        with open(filtered_file) as f:
            focus_ali = Alignment.from_file(f, "a3m")

        # final FASTA alignment before applying A2M format modifications
        filtered_fasta_file = prefix + "_raw_focus_filtered.fasta"
        with open(filtered_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

    ali = focus_ali

    # filter fragments
    # come up with something more clever here than fixed width
    # (e.g. use 95% quantile of length distribution as reference point)
    min_cov = kwargs["minimum_sequence_coverage"]
    if min_cov is not None:
        # integer thresholds are percentages, floats are fractions
        if isinstance(min_cov, int):
            min_cov /= 100

        keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov
        ali = ali.select(sequences=keep_seqs)

    # Calculate frequencies, conservation and identity to query
    # on final alignment (except for lowercase modification)
    # Note: running hhfilter might cause a loss of the target sequence
    # if it is not the first sequence in the file! To be sure that
    # nothing goes wrong, target_seq_index should always be 0.
    describe_seq_identities(
        ali, target_seq_index=target_seq_index
    ).to_csv(
        outcfg["identities_file"], float_format="%.3f", index=False
    )

    describe_frequencies(
        ali, region_start, target_seq_index=target_seq_index
    ).to_csv(
        outcfg["frequencies_file"], float_format="%.3f", index=False
    )

    coverage_stats = describe_coverage(
        ali, prefix, region_start, kwargs["minimum_column_coverage"]
    )

    # keep list of uppercase sequence positions in alignment
    pos_list = np.arange(region_start, region_start + ali.L, dtype="int32")

    # make columns with too many gaps lowercase
    min_col_cov = kwargs["minimum_column_coverage"]
    if min_col_cov is not None:
        if isinstance(min_col_cov, int):
            min_col_cov /= 100

        lc_cols = ali.count(ali._match_gap, axis="pos") > 1 - min_col_cov
        ali = ali.lowercase_columns(lc_cols)

        # if we remove columns, we have to update list of positions
        pos_list = pos_list[~lc_cols]
    else:
        lc_cols = None

    # compute effective number of sequences
    # (this is intended for cases where coupling stage is
    # not run, but this number is wanted nonetheless)
    if kwargs["compute_num_effective_seqs"]:
        # make sure we only compute N_eff on the columns
        # that would be used for model inference, dispose
        # the rest
        if lc_cols is None:
            cut_ali = ali
        else:
            cut_ali = ali.select(columns=~lc_cols)

        # compute sequence weights
        cut_ali.set_weights(kwargs["theta"])

        # N_eff := sum of all sequence weights
        n_eff = float(cut_ali.weights.sum())

        # patch into coverage statistics (N_eff column)
        coverage_stats.loc[:, "N_eff"] = n_eff
    else:
        n_eff = None

    # save coverage statistics to file
    coverage_stats.to_csv(
        outcfg["statistics_file"], float_format="%.3f", index=False
    )

    # store description of final sequence alignment in outcfg
    # (note these parameters will be updated by couplings protocol)
    outcfg.update(
        {
            "num_sites": len(pos_list),
            "num_sequences": len(ali),
            "effective_sequences": n_eff,
            "region_start": region_start,
        }
    )

    # create segment in outcfg
    outcfg["segments"] = [
        Segment(
            "aa", target_seq_id,
            region_start, region_start + ali.L - 1,
            pos_list
        ).to_list()
    ]

    with open(outcfg["alignment_file"], "w") as f:
        ali.write(f, "fasta")

    return outcfg, ali
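
# Small self-contained sketch of the coverage threshold convention used
# in modify_alignment above: integer values are interpreted as
# percentages and divided by 100, while floats are taken as fractions
# directly. The helper name and values are hypothetical.
def _example_coverage_threshold(min_cov):
    # mirrors the normalization applied to minimum_sequence_coverage
    # and minimum_column_coverage in modify_alignment
    if isinstance(min_cov, int):
        min_cov /= 100
    return min_cov

# _example_coverage_threshold(50) == 0.5 == _example_coverage_threshold(0.5)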
def standard(**kwargs):
    """
    Protocol:

    Standard buildali4 workflow (run iterative jackhmmer search
    against sequence database, then determine which sequences and
    columns to include in the calculation based on coverage and
    maximum gap thresholds).

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * alignment_file
        * raw_alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * target_sequence_file
        * sequence_file
        * annotation_file
        * frequencies_file
        * identities_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "extract_annotation",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # first step of protocol is to get alignment using
    # jackhmmer; initialize output configuration with
    # results of this search
    jackhmmer_outcfg = jackhmmer_search(**kwargs)
    stockholm_file = jackhmmer_outcfg["raw_alignment_file"]

    segment = Segment.from_list(jackhmmer_outcfg["segments"][0])
    target_seq_id = segment.sequence_id
    region_start = segment.region_start
    region_end = segment.region_end

    # read in stockholm format (with full annotation)
    with open(stockholm_file) as a:
        ali_raw = Alignment.from_file(a, "stockholm")

    # and store as FASTA file first (disabled for now
    # since equivalent information can easily be obtained
    # from Stockholm file)
    """
    ali_raw_fasta_file = prefix + "_raw.fasta"
    with open(ali_raw_fasta_file, "w") as f:
        ali_raw.write(f, "fasta")
    """

    # save annotation in sequence headers (species etc.);
    # initialize file name to None so the outcfg assembly below
    # does not fail if annotation extraction is turned off
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        annotation = extract_header_annotation(ali_raw)
        annotation.to_csv(annotation_file, index=False)

    # center alignment around focus/search sequence
    focus_cols = np.array([c != "-" for c in ali_raw[0]])
    focus_ali = ali_raw.select(columns=focus_cols)

    target_seq_index = 0
    mod_outcfg, ali = modify_alignment(
        focus_ali, target_seq_index, target_seq_id, region_start, **kwargs
    )

    # merge results of jackhmmer_search and modify_alignment stage
    outcfg = {
        **jackhmmer_outcfg,
        **mod_outcfg,
        "annotation_file": annotation_file
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_standard.outcfg", outcfg)

    # return results of protocol
    return outcfg
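
# Sketch of the outcfg merging behavior used in standard() above: with
# {**jackhmmer_outcfg, **mod_outcfg, ...}, keys produced by the later
# modify_alignment stage overwrite identically named keys from the
# jackhmmer search. The keys and values below are hypothetical stand-ins.
def _example_outcfg_merge():
    jackhmmer_outcfg = {"raw_alignment_file": "x.sto", "focus_mode": True}
    mod_outcfg = {"alignment_file": "x.a2m", "focus_mode": True}
    # later dicts win on duplicate keys ("focus_mode" comes from mod_outcfg)
    merged = {**jackhmmer_outcfg, **mod_outcfg, "annotation_file": None}
    return merged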
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and
    search against a sequence database using hmmsearch.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists
        verify_resources(
            "Input alignment does not exist",
            input_alignment_file
        )

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)

        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment_file)
            )

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined"
            )

        # first, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id
                )
            )

        # identify what columns (non-gap) to keep for focus;
        # this should be all columns in the raw_focus_alignment_file,
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given."
            )

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)
        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the gap threshold for inclusion in HMMs built by hmmbuild
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(
        kwargs,
        [
            "prefix", "sequence_id", "alignment_file",
            "use_bitscores", "domain_threshold", "sequence_threshold",
            "database", "cpu", "nobias", "reuse_alignment",
            "hmmbuild", "hmmsearch"
        ]
    )
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow to reuse pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine whether to rerun, only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist",
            ali["alignment"], ali["domtblout"]
        )
    else:
        # otherwise, we have to run the alignment;
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"],
            kwargs["sequence_threshold"],
            kwargs["domain_threshold"],
            sequence_length
        )

        # create the HMM
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the HMM
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # only item from hmmbuild result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment(
            "aa", kwargs["sequence_id"],
            region_start, region_end,
            range(region_start, region_end + 1)
        ).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(
        kwargs["sequence_id"], region_start, region_end
    )

    return outcfg
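
# Sketch of the region inference performed in
# _format_alignment_for_hmmbuild, assuming parse_header() maps
# "ID/start-end" to (ID, start, end) and a plain "ID" to
# (ID, None, None), consistent with how its return values are checked
# above. The function name and inputs are hypothetical.
def _example_region_inference(focus_id="RASH_HUMAN/1-166",
                              first_index=None, seq_len=166):
    id_, region_start, region_end = parse_header(focus_id)
    if first_index is not None:
        # an explicit first_index always wins over the header range
        region_start = first_index
        region_end = region_start + seq_len - 1
    return id_, region_start, region_end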
def secondary_structure(**kwargs):
    """
    Predict or load secondary structure for an input sequence

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    residues : pandas.DataFrame
        Table with sequence and secondary structure in columns
        i, A_i and sec_struct_3state
    """
    check_required(
        kwargs,
        [
            "prefix", "target_sequence_file",
            "segments", "sec_struct_method",
            "sec_struct_file", "psipred",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    secstruct_file = kwargs["sec_struct_file"]
    if secstruct_file is not None:
        verify_resources(
            "Secondary structure prediction file does not exist/is empty",
            secstruct_file
        )
        residues = pd.read_csv(secstruct_file)
    else:
        # make sure target sequence file is there so we can
        # predict secondary structure
        target_seq_file = kwargs["target_sequence_file"]
        verify_resources(
            "Sequence file does not exist/is empty", target_seq_file
        )

        # we need to figure out what the index of the first residue
        # in the target sequence is; obtain first index from segment
        # information if possible
        if kwargs["segments"] is not None:
            s = Segment.from_list(kwargs["segments"][0])
            first_index = s.region_start
        else:
            # otherwise try to get it from sequence file
            first_index = None

            with open(target_seq_file) as f:
                header, _ = next(read_fasta(f))
                if header is not None:
                    _, first_index, _ = parse_header(header)

            # if we cannot identify first index from header,
            # do not make guesses but fail
            if first_index is None:
                raise InvalidParameterError(
                    "Could not unambiguously identify sequence range from "
                    "FASTA header, needs to be specified as id/start-end: {}".format(
                        header
                    )
                )

        # finally, run secondary structure prediction
        if kwargs["sec_struct_method"] == "psipred":
            # store psipred output in a separate directory
            output_dir = path.join(path.dirname(prefix), "psipred")

            # run psipred
            ss2_file, horiz_file = run_psipred(
                target_seq_file, output_dir, binary=kwargs["psipred"]
            )

            # parse output, renumber to first index
            residues = read_psipred_prediction(
                horiz_file, first_index=first_index
            )
        else:
            raise InvalidParameterError(
                "Secondary structure prediction method not implemented: "
                "{}. Valid choices: psipred".format(kwargs["sec_struct_method"])
            )

    # return predicted table
    return residues
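
# Minimal usage sketch for secondary_structure(), assuming a precomputed
# prediction is loaded from file (so no psipred run is needed). The
# paths are hypothetical; the CSV is expected to carry columns i, A_i
# and sec_struct_3state as documented above.
def _example_secondary_structure():
    return secondary_structure(
        prefix="output/example",
        target_sequence_file="output/example.fa",
        segments=None,
        sec_struct_method="psipred",
        sec_struct_file="output/example_secondary_structure.csv",
        psipred="psipred",  # binary on PATH; unused when file is given
    )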
def standard(**kwargs):
    """
    Protocol:

    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction;
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query("abs(i - j) > {}".format(
        kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get from EC values (could be misleading if
            # EC list is truncated, so only second option)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs, "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of EC counts to fold with
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory; after folding,
    # final models will be moved to main folding dir. Depending
    # on cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map back to original key from remapped_pdb_files
            # as a key for this list
            ind_comp_files[ind_file] = experimental_files[filename]

            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
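
# Self-contained sketch of how the EC-count folding schedule in
# standard() above is derived: float parameters are interpreted as
# fractions of the number of sites, integers as absolute EC counts.
# The helper name and parameter values are hypothetical.
def _example_fold_schedule(num_sites=150, flc=0.5, fhc=1.3, fi=0.05):
    from math import ceil

    # mirrors _discrete_count in standard()
    def _discrete_count(x):
        if isinstance(x, float):
            x = ceil(x * num_sites)
        return int(x)

    lowest, highest, step = map(_discrete_count, (flc, fhc, fi))
    # here: 75, 195, 8 -> fold with the top 75, 83, ..., 195 ECs
    return list(range(lowest, highest + 1, step))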
def complex(**kwargs):
    """
    Protocol:

    Mutation effect prediction and visualization for protein complexes

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * [mutation_dataset_predicted_file]
    """
    check_required(
        kwargs,
        ["prefix", "model_file", "mutation_dataset_file", "segments"]
    )

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources(
        "Model parameter file does not exist",
        kwargs["model_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load segments to create couplings object
    segment_objects = []
    for segment_list in kwargs["segments"]:
        segment_objects.append(Segment.from_list(segment_list))

    first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id
    second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id

    first_chain_name = Segment.from_list(
        kwargs["segments"][0]
    ).default_chain_name()
    second_chain_name = Segment.from_list(
        kwargs["segments"][1]
    ).default_chain_name()

    # load couplings object
    c = MultiSegmentCouplingsModel(kwargs["model_file"], *segment_objects)

    # create the independent model
    c0 = c.to_independent_model()

    # create the inter-protein only Jij model
    ci = c.to_inter_segment_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent"),
                         (ci, "Inter_segment")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower())
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh"
        )
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add predictions by independent and inter-segment models and
    # save to file
    singles = single_mutant_matrix(c, output_column="prediction_epistatic")
    singles = predict_mutation_table(c0, singles, "prediction_independent")
    singles = predict_mutation_table(ci, singles, "prediction_inter_segment")
    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent", "inter_segment"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename,
            effect_column="prediction_" + model,
            segment_to_chain_mapping={
                first_segment_name: first_chain_name,
                second_segment_name: second_chain_name
            }
        )
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#", sep=",")

        if "segment" not in data.columns:
            raise ValueError(
                "Input mutation dataset file does not contain "
                "a column called 'segment' to specify the "
                "protein of origin for each mutation"
            )

        # add epistatic model prediction
        data_pred = predict_mutation_table(c, data, "prediction_epistatic")

        # add independent model prediction
        data_pred = predict_mutation_table(
            c0, data_pred, "prediction_independent"
        )

        # add inter-segment model prediction (column name made
        # consistent with the "prediction_" prefix used above)
        data_pred = predict_mutation_table(
            ci, data_pred, "prediction_inter_segment"
        )

        outcfg["mutation_dataset_predicted_file"] = \
            prefix + "_dataset_predicted.csv"
        data_pred.to_csv(
            outcfg["mutation_dataset_predicted_file"], index=False
        )

    return outcfg