def complex_probability(ecs, scoring_model, use_all_ecs=False,
                        score="cn"):
    """
    Adds confidence measure for complex evolutionary couplings

    Parameters
    ----------
    ecs : pandas.DataFrame
        Table with evolutionary couplings; must contain the columns
        "segment_i" and "segment_j" (used to split intra/inter ECs)
        and the column named by `score`
    scoring_model : {"skewnormal", "normal", "evcomplex"}
        Use this scoring model to assign EC confidence measure
    use_all_ecs : bool, optional (default: False)
        If true, fits one scoring model to all ECs together;
        if false, fits the model separately to the intra-segment
        ECs and to the inter-segment ECs and merges both tables.
    score : str, optional (default: "cn")
        Use this score column for confidence assignment

    Returns
    -------
    ecs : pandas.DataFrame
        EC table with additional column "probability"
        containing confidence measure. When use_all_ecs is False,
        the merged table is sorted by `score` in descending order.
    """
    if use_all_ecs:
        # single model over the complete table; pass the score column
        # through so this branch honours `score` like the split branch
        return pairs.add_mixture_probability(
            ecs, model=scoring_model, score=score
        )

    # split table by whether both positions are in the same segment
    inter_ecs = ecs.query("segment_i != segment_j")
    intra_ecs = ecs.query("segment_i == segment_j")

    # fit an independent model to each subset
    intra_ecs = pairs.add_mixture_probability(
        intra_ecs, model=scoring_model, score=score
    )

    inter_ecs = pairs.add_mixture_probability(
        inter_ecs, model=scoring_model, score=score
    )

    # merge again and restore ranking by score
    return pd.concat(
        [intra_ecs, inter_ecs]
    ).sort_values(
        score, ascending=False
    )
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc. Use complex protocol
    for heteromultimeric complexes instead.

    Parameters
    ----------
    Mandatory kwargs arguments:
        "prefix" and "min_sequence_distance" are checked here;
        all kwargs are additionally forwarded to infer_plmc(),
        see there for further required parameters.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline: the configuration
        returned by infer_plmc() merged with the entries returned
        by _postprocess_inference(). According to the original
        documentation this includes the following fields:

        raw_ec_file
        model_file
        num_sites
        num_sequences
        effective_sequences

        focus_mode (passed through)
        focus_sequence (passed through)
        segments (passed through)
    """
    # parameters needed by this function itself; everything else
    # is validated inside infer_plmc()
    check_required(kwargs, ["prefix", "min_sequence_distance"])
    prefix = kwargs["prefix"]

    # run inference, then load the resulting model parameters
    inferred_cfg, raw_ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(inferred_cfg["model_file"])

    # attach mixture model probability to EC table
    scored_ecs = pairs.add_mixture_probability(raw_ecs)

    # enrichment and line plot are only generated if there is
    # no segment information or exactly one segment
    single_segment = segments is None or len(segments) == 1

    # copy inference config first, then overlay postprocessing outputs
    # (postprocessing keys win on collision)
    final_cfg = dict(inferred_cfg)
    final_cfg.update(
        _postprocess_inference(
            scored_ecs, kwargs, model, inferred_cfg, prefix,
            generate_enrichment=single_segment,
            generate_line_plot=single_segment
        )
    )

    # dump output config to file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", final_cfg)

    return final_cfg
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.
        "lambda_J_times_Lq" is part of that list because it is
        read unconditionally to decide if lambda_J is rescaled.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * ec_file
        * num_sites
        * num_sequences
        * effective_sequences
        * region_start

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)

        Depending on parameters, additionally:

        * ec_longrange_file (if min_sequence_distance is not None)
        * ec_lines_pml_file (as above, and at most one segment)
        * enrichment_file, enrichment_pml_files (at most one segment)
        * evzoom_file (if model_file is set)
    """
    # validate all parameters up front, before any files or
    # folders are created
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_J_times_Lq",
            "lambda_group", "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance",
        ]
    )

    prefix = kwargs["prefix"]

    # For now the option to not save the model ("save_model") is
    # disabled, since otherwise mutate stage will crash. To remove
    # model file at end, use delete option in management section.
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    # turn list representation of segments into Segment objects
    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have protein alphabet, do not set
        # as plmc parameter since default parameter,
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation (based on the
        # first sequence in the alignment file)
        with open(kwargs["alignment_file"]) as f:
            _, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single
        # segments); kept inside this branch because it needs the
        # long-range table computed above
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"

        # load parameters and create JSON output *before* opening the
        # output file, so a failure here does not leave an empty
        # file behind
        c = CouplingsModel(outcfg["model_file"])
        evzoom_output = evzoom_json(c) + "\n"

        with open(outcfg["evzoom_file"], "w") as f:
            f.write(evzoom_output)

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
def evzoom_data(model, ec_threshold=0.9, freq_threshold=0.01,
                Jij_threshold=10, score="cn",
                reorder="KRHEDNQTSCGAVLIMPYFW"):
    """
    Generate data for EVzoom visualization. Use evzoom_json()
    to get final JSON string to use with EVzoom.

    Parameters
    ----------
    model : evcouplings.couplings.model.CouplingsModel
        Parameters of pairwise graphical model
    ec_threshold : float or int, optional (default: 0.9)
        Only display evolutionary couplings above this
        threshold. If an integer (Python int or numpy integer),
        this is always interpreted as the absolute number of
        couplings to display. Otherwise, if the value is in the
        interval (0, 1], it is interpreted as probability cutoff
        for the mixture model; any other value is converted with
        int() and used as absolute number of couplings.
    freq_threshold : float, optional (default: 0.01)
        Only display coupling parameters for amino acids
        with at least this frequency in the underlying
        sequence alignment
    Jij_threshold : int or float, optional (default: 10)
        Only display coupling parameters above this threshold.
        If float, this number will be interpreted as an actual
        score threshold; if int, this will be interpreted as
        a percentage of the maximum absolute score.
    score : str, optional (default: "cn")
        Use this score to determine which couplings to display.
        Valid choices are the score columns contained in the
        CouplingsModel.ecs dataframe
    reorder : str, optional (default: "KRHEDNQTSCGAVLIMPYFW")
        Order of amino acids in displayed coupling matrices.
        If None, symbols are used in the order of the model
        alphabet indices.

    Returns
    -------
    map_ : dict
        Map containing sequence indices ("indices") and
        characters ("letters")
    logo : list
        One list per model position, each containing
        {"code", "bits"} entries for the sequence logos on
        the axes of the visualization
    matrix : list of dict
        Couplings that will be visualized; every selected EC
        contributes two entries (pair i/j and the transposed
        pair j/i)
    """
    # number of decimal digits kept in output
    DIGITS = 1
    DIGITS_LOGO = 2

    ecs = model.ecs

    # integer thresholds always mean "number of couplings", so that
    # ec_threshold=1 selects the top coupling instead of being
    # treated as probability cutoff 1.0
    threshold_is_count = isinstance(ec_threshold, (int, np.integer))

    if not threshold_is_count and 0 < ec_threshold <= 1.0:
        ecs = add_mixture_probability(ecs, score=score)
        ecs_sel = ecs.loc[ecs.probability >= ec_threshold]
    else:
        ecs_sel = ecs.iloc[:int(ec_threshold)]

    # if cutoff for couplings is given as int, interpret
    # as percentage of biggest absolute coupling value
    if isinstance(Jij_threshold, int):
        max_val = np.max(np.abs(model.Jij()))
        Jij_threshold = max_val * Jij_threshold / 100

    # symbols (and their indices in the model) in display order
    if reorder is not None:
        alphabet = np.array(list(reorder))
        alphabet_order = [
            model.alphabet_map[c] for c in reorder
        ]
    else:
        alphabet = model.alphabet
        alphabet_order = sorted(
            model.alphabet_map.values()
        )

    # Map containing sequence and indices
    map_ = {
        "letters": "".join(model.seq()),
        "indices": list(map(int, model.sn())),
    }

    # assemble coupling matrix
    matrix = []
    for _, r in ecs_sel.iterrows():
        i, j, score_ij = r["i"], r["j"], r[score]

        # coupling submatrix for this pair, rows/columns
        # rearranged into display order
        Jij = model.Jij(i, j)[alphabet_order, :][:, alphabet_order]

        # only keep rows/columns that contain at least one
        # coupling above the threshold
        ai_set = np.where(
            np.max(np.abs(Jij), axis=1) > Jij_threshold
        )[0]
        aj_set = np.where(
            np.max(np.abs(Jij), axis=0) > Jij_threshold
        )[0]

        cur_matrix = [
            [round(Jij[ai, aj], DIGITS) for aj in aj_set]
            for ai in ai_set
        ]
        cur_matrix_T = [
            [round(Jij[ai, aj], DIGITS) for ai in ai_set]
            for aj in aj_set
        ]

        # NOTE(review): presumably mn() maps sequence numbering to a
        # 0-based model index and EVzoom expects 1-based positions;
        # confirm against CouplingsModel
        cur_row = {
            "i": model.mn(i) + 1,
            "j": model.mn(j) + 1,
            "score": round(score_ij, DIGITS),
            "iC": "".join(alphabet[ai_set]),
            "jC": "".join(alphabet[aj_set]),
            "matrix": cur_matrix,
        }

        # same coupling with roles of i and j swapped
        cur_row_T = {
            "i": cur_row["j"],
            "j": cur_row["i"],
            "score": cur_row["score"],
            "iC": cur_row["jC"],
            "jC": cur_row["iC"],
            "matrix": cur_matrix_T,
        }

        matrix.append(cur_row)
        matrix.append(cur_row_T)

    # assemble sequence logo
    fi = model.fi()
    q = model.num_symbols

    # zero frequencies give log2(0) = -inf and 0 * -inf = nan; those
    # entries are overwritten right below, so only silence the warnings
    with np.errstate(divide="ignore", invalid="ignore"):
        B = -fi * np.log2(fi)
    B[fi <= 0] = 0

    # per-position value used to scale the symbol heights
    R = np.log2(q) - B.sum(axis=1)

    logo = []
    for i in range(model.L):
        # symbols sorted by increasing frequency, dropping those
        # below the frequency threshold
        order = np.argsort(fi[i, :])
        frequent = order[fi[i, order] >= freq_threshold]
        symbols = model.alphabet[frequent]
        fi_row = fi[i, frequent] * R[i]
        logo.append([
            {"code": s, "bits": round(float(h), DIGITS_LOGO)}
            for s, h in zip(symbols, fi_row)
        ])

    return map_, logo, matrix
def rescore_cn_score_ecs(ecs, segments, outcfg, kwargs, score="cn"):
    """
    Probabilistic rescoring of CN-score based ECS

    Parameters
    ----------
    ecs : pd.DataFrame
        EC table
    segments : list(evcouplings.couplings.mapping.Segment)
        Input segment list (may be None)
    outcfg : dict
        Current output configuration state of couplings protocol;
        "num_sites" and "effective_sequences" are read from it
        when using the logistic regression model
    kwargs : dict
        Input parameters of couplings protocol
    score : str, optional (default: "cn")
        Target score column to use

    Returns
    -------
    ecs : pd.DataFrame
        Enhanced EC table with probabilities and new score (if applicable),
        sorted by column "score" in descending order
    outcfg_update : dict
        Additional outputs for stage output configuration, need to be merged
        into outcfg in main protocol. Empty unless the logistic regression
        model is used with no segment information or a single segment.
    """
    check_required(
        kwargs,
        [
            "scoring_model", "min_sequence_distance", "theta", "frequencies_file",
        ]
    )

    # NOTE(review): the original comment states that a value of None
    # triggers the default behaviour of add_mixture_probability
    # ("skewnormal"); the fallback below only applies if the key is
    # absent, an explicit None is passed on unchanged - confirm
    scoring_model = kwargs.get("scoring_model", "skewnormal")

    # by default there is nothing to add to the output config; this
    # also covers logistic regression with more than one segment,
    # for which no quality scoring is performed
    outcfg_update = {}

    # currently we need to distinguish between full rescoring (score and
    # probability) like with logistic regression model, or just putting
    # probabilities on top of default CN score using add_mixture_probability
    if scoring_model == "logistic_regression":
        scorer = pairs.LogisticRegressionScorer()

        # load amino acid/gap frequencies and conservation info
        freqs = pd.read_csv(kwargs["frequencies_file"])

        # NOTE: both names are referenced via "@" from the query
        # strings below, do not rename
        num_sites = outcfg["num_sites"]
        min_seq_dist = kwargs["min_sequence_distance"]

        # rescore EC table
        ecs = scorer.score(
            ecs, freqs, kwargs["theta"], outcfg["effective_sequences"],
            num_sites, score=score
        )

        # currently only perform quality scoring for single segments
        if segments is None or len(segments) == 1:
            # running count of long-range pairs along the table
            is_longrange = ((ecs.i - ecs.j).abs() >= min_seq_dist).astype(int)

            ecs_lr = ecs.assign(
                longrange_count=is_longrange.cumsum()
            )

            # compute expectation for true positives on all contacts
            # (all rows up to the num_sites-th long-range pair)
            expected_positives_all = ecs_lr.query(
                "longrange_count <= @num_sites"
            ).probability.sum()

            # same selection, restricted to the long-range pairs
            expected_positives_longrange = ecs_lr.query(
                "longrange_count <= @num_sites and abs(i - j) >= @min_seq_dist"
            ).probability.sum()

            # store in config
            outcfg_update = {
                "expected_true_ecs_all": float(expected_positives_all),
                "expected_true_ecs_longrange": float(expected_positives_longrange)
            }
    else:
        # add mixture model probability, fitted on the requested
        # score column
        ecs = pairs.add_mixture_probability(
            ecs, model=scoring_model, score=score
        )

        # put CN score into default score column for more generic
        # downstream score handling
        ecs = ecs.assign(
            score=ecs[score]
        )

    # sort ECs
    ecs = ecs.sort_values(
        by="score", ascending=False
    )

    return ecs, outcfg_update