Ejemplo n.º 1
0
def complex_probability(ecs, scoring_model, use_all_ecs=False,
                        score="cn"):
    """
    Adds confidence measure for complex evolutionary couplings

    Parameters
    ----------
    ecs : pandas.DataFrame
        Table with evolutionary couplings. Unless use_all_ecs is
        true, the table must contain the columns "segment_i" and
        "segment_j" as well as the column named by `score`.
    scoring_model : {"skewnormal", "normal", "evcomplex"}
        Use this scoring model to assign EC confidence measure
    use_all_ecs : bool, optional (default: False)
        If true, fits one scoring model to all ECs;
        if false, fits the model separately to the intra-segment
        ECs and to the inter-segment ECs, then recombines both
        subsets and sorts the result by `score` (descending).
    score : str, optional (default: "cn")
        Use this score column for confidence assignment

    Returns
    -------
    ecs : pandas.DataFrame
        EC table with additional column "probability"
        containing confidence measure
    """
    if use_all_ecs:
        # forward the score column here as well, so both branches
        # honour the `score` argument
        # NOTE(review): assumes add_mixture_probability defaults to
        # score="cn", making this backward-compatible - confirm
        return pairs.add_mixture_probability(
            ecs, model=scoring_model, score=score
        )

    # split table by whether both positions lie in the same segment
    inter_ecs = ecs.query("segment_i != segment_j")
    intra_ecs = ecs.query("segment_i == segment_j")

    # fit the scoring model to each subset independently
    intra_ecs = pairs.add_mixture_probability(
        intra_ecs, model=scoring_model, score=score
    )

    inter_ecs = pairs.add_mixture_probability(
        inter_ecs, model=scoring_model, score=score
    )

    # recombine and restore ordering by score
    ecs = pd.concat(
        [intra_ecs, inter_ecs]
    ).sort_values(
        score, ascending=False
    )

    return ecs
Ejemplo n.º 2
0
def standard(**kwargs):
    """
    Protocol:

    Run EC inference on an alignment with plmc and postprocess
    the resulting couplings. For heteromultimeric complexes,
    use the complex protocol instead.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below
        and infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline: the configuration
        returned by infer_plmc(), overlaid with the entries
        returned by _postprocess_inference(). Includes
        the following fields:

        raw_ec_file
        model_file
        num_sites
        num_sequences
        effective_sequences

        focus_mode (passed through)
        focus_sequence (passed through)
        segments (passed through)
    """
    # parameters needed directly by this function; everything
    # else that is required gets checked inside infer_plmc()
    required_here = ["prefix", "min_sequence_distance"]
    check_required(kwargs, required_here)

    prefix = kwargs["prefix"]

    # run inference, then load the inferred model parameters
    inference_cfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(inference_cfg["model_file"])

    # attach mixture model probability to EC table
    ecs = pairs.add_mixture_probability(ecs)

    # enrichment and line plots are only generated for the
    # single-segment (monomer-like) case
    if segments is None:
        single_segment = True
    else:
        single_segment = len(segments) == 1

    # copy the inference configuration first, then overlay
    # whatever postprocessing reports
    outcfg = dict(inference_cfg)
    outcfg.update(
        _postprocess_inference(
            ecs, kwargs, model, inference_cfg, prefix,
            generate_enrichment=single_segment,
            generate_line_plot=single_segment
        )
    )

    # write final output configuration to file for debugging/logging
    outcfg_file = prefix + ".couplings_standard.outcfg"
    write_config_file(outcfg_file, outcfg)

    return outcfg
Ejemplo n.º 3
0
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * ec_file
        * num_sites
        * num_sequences
        * effective_sequences
        * region_start
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
        * ec_longrange_file (only if min_sequence_distance
          is not None)
        * ec_lines_pml_file (only if min_sequence_distance
          is not None and there is at most one segment)
        * enrichment_file, enrichment_pml_files (only if there
          is at most one segment)
        * evzoom_file
    """
    # "lambda_J_times_Lq" is read further below, so validate it
    # up front together with all other parameters instead of
    # hitting a bare KeyError after output folders were created
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_J_times_Lq",
            "lambda_group", "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance",  # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    # Disabled logic, kept for reference:
    #
    #     if kwargs["save_model"]:
    #         model = prefix + ".model"
    #     else:
    #         model = None
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have protein alphabet, do not set
        # as plmc parameter since default parameter,
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation (only the first
        # sequence of the alignment is inspected)
        with open(kwargs["alignment_file"]) as f:
            _, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]

        # NOTE(review): c == c.upper() is true for every character
        # that is not a lowercase letter (including "-" and "."),
        # so "." columns are counted too - confirm this is intended
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"

            # draw at most as many longrange ECs as there are sites
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"

        # load parameters and create the JSON output before opening
        # the target file, so that a failure in either step does
        # not leave an empty output file behind
        c = CouplingsModel(outcfg["model_file"])
        evzoom_output = evzoom_json(c) + "\n"

        with open(outcfg["evzoom_file"], "w") as f:
            f.write(evzoom_output)

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
Ejemplo n.º 4
0
def evzoom_data(model, ec_threshold=0.9, freq_threshold=0.01,
                Jij_threshold=10, score="cn",
                reorder="KRHEDNQTSCGAVLIMPYFW"):
    """
    Generate data for EVzoom visualization. Use evzoom_json()
    to get final JSON string to use with EVzoom.

    Parameters
    ----------
    model : evcouplings.couplings.model.CouplingsModel
        Parameters of pairwise graphical model
    ec_threshold : float or int, optional (default: 0.9)
        Only display evolutionary couplings above this
        threshold. If the value is in the interval (0, 1], this
        will be interpreted as probability cutoff for mixture model.
        Otherwise, will be interpreted as absolute number of
        couplings (taken from the top of the model's EC table).
    freq_threshold : float, optional (default: 0.01)
        Only display coupling parameters for amino acids with
        at least this frequency in the underlying sequence
        alignment
    Jij_threshold : int or float, optional (default: 10)
        Only display coupling parameters above this
        threshold. If float, this number will be interpreted
        as an actual score threshold; if int, this will
        be interpreted as a percentage of the maximum
        absolute score.
    score : str, optional (default: "cn")
        Use this score to determine which couplings to display.
        Valid choices are the score columns contained in the
        CouplingsModel.ecs dataframe
    reorder : str, optional (default: "KRHEDNQTSCGAVLIMPYFW")
        Order of amino acids in displayed coupling matrices.
        If None, the symbols of the model alphabet are used
        in the order of their alphabet indices.

    Returns
    -------
    map_ : dict
        Map containing sequence indices and characters
    logo : list
        List containing information about sequence logos
        for axes of visualization (one list of
        {"code", "bits"} entries per model position)
    matrix : list
        List containing couplings that will be visualized;
        every selected coupling contributes two entries
        (i, j) and its transpose (j, i)
    """
    # number of decimals kept for couplings/scores and logo heights
    DIGITS = 1
    DIGITS_LOGO = 2
    ecs = model.ecs

    if 0 < ec_threshold <= 1.0:
        ecs = add_mixture_probability(ecs, score=score)
        ecs_sel = ecs.loc[ecs.probability >= ec_threshold]
    else:
        ecs_sel = ecs.iloc[:int(ec_threshold)]

    # if cutoff for couplings is given as int, interpret
    # as percentage of biggest absolute coupling value
    if isinstance(Jij_threshold, int):
        max_val = np.max(np.abs(model.Jij()))
        Jij_threshold = max_val * Jij_threshold / 100

    if reorder is not None:
        alphabet = np.array(list(reorder))
        alphabet_order = [
            model.alphabet_map[c] for c in reorder
        ]
    else:
        alphabet = model.alphabet
        alphabet_order = sorted(
            model.alphabet_map.values()
        )

    # Map containing sequence and indices
    map_ = {
        "letters": "".join(model.seq()),
        "indices": list(map(int, model.sn())),
    }

    # assemble coupling matrix
    matrix = []

    for _, r in ecs_sel.iterrows():
        i, j, score_ij = r["i"], r["j"], r[score]

        # coupling matrix of pair, rows/columns permuted into
        # display order
        Jij = model.Jij(i, j)[alphabet_order, :][:, alphabet_order]

        # keep only rows/columns that contain at least one
        # coupling with absolute value above the threshold
        ai_set = np.where(
            np.max(np.abs(Jij), axis=1) > Jij_threshold
        )[0]
        aj_set = np.where(
            np.max(np.abs(Jij), axis=0) > Jij_threshold
        )[0]

        cur_matrix = [
            [round(Jij[ai, aj], DIGITS) for aj in aj_set]
            for ai in ai_set
        ]

        # explicit transpose (not zip-based) so that an empty
        # ai_set still yields one empty row per entry of aj_set
        cur_matrix_T = [
            [round(Jij[ai, aj], DIGITS) for ai in ai_set]
            for aj in aj_set
        ]

        cur_row = {
            "i": model.mn(i) + 1,
            "j": model.mn(j) + 1,
            "score": round(score_ij, DIGITS),
            "iC": "".join(alphabet[ai_set]),
            "jC": "".join(alphabet[aj_set]),
            "matrix": cur_matrix,
        }

        cur_row_T = {
            "i": cur_row["j"],
            "j": cur_row["i"],
            "score": cur_row["score"],
            "iC": cur_row["jC"],
            "jC": cur_row["iC"],
            "matrix": cur_matrix_T,
        }

        matrix.append(cur_row)
        matrix.append(cur_row_T)

    # assemble sequence logo
    fi = model.fi()
    q = model.num_symbols

    # entropy terms -f * log2(f); zero frequencies give log2(0),
    # which is overwritten with 0 right below, so silence the
    # floating point warnings/errors numpy would otherwise raise
    with np.errstate(divide="ignore", invalid="ignore"):
        B = -fi * np.log2(fi)
    B[fi <= 0] = 0

    # per-position value used to scale the logo: log2(q) minus entropy
    R = np.log2(q) - B.sum(axis=1)

    logo = []
    for i in range(model.L):
        # symbols sorted by ascending frequency, dropping rare ones
        order = np.argsort(fi[i, :])
        frequent = order[fi[i, order] >= freq_threshold]

        symbols = model.alphabet[frequent]
        fi_row = fi[i, frequent] * R[i]

        logo.append([
            {"code": s, "bits": round(float(h), DIGITS_LOGO)}
            for s, h in zip(symbols, fi_row)
        ])

    return map_, logo, matrix
Ejemplo n.º 5
0
def rescore_cn_score_ecs(ecs, segments, outcfg, kwargs, score="cn"):
    """
    Probabilistic rescoring of CN-score based ECS

    Parameters
    ----------
    ecs : pd.DataFrame
        EC table
    segments : list(evcouplings.couplings.mapping.Segment)
        Input segment list (may be None)
    outcfg : dict
        Current output configuration state of couplings protocol;
        the keys "num_sites" and "effective_sequences" are read
        when using the logistic regression scoring model
    kwargs : dict
        Input parameters of couplings protocol
    score : str, optional (default: "cn")
        Target score column to use

    Returns
    -------
    ecs : pd.DataFrame
        Enhanced EC table with probabilities and new score (if applicable),
        sorted by column "score" in descending order
    outcfg_update : dict
        Additional outputs for stage output configuration, need to be
        merged into outcfg in main protocol. Empty unless the logistic
        regression model is applied to a single segment.
    """
    check_required(
        kwargs,
        [
            "scoring_model", "min_sequence_distance", "theta", "frequencies_file",
        ]
    )

    # None will trigger default behaviour of add_mixture_probability
    # (which currently is "skewnormal")
    scoring_model = kwargs.get("scoring_model", "skewnormal")

    # by default there is nothing to add to the output config; defining
    # this up front guarantees the name is bound on every path (it used
    # to be unassigned for logistic regression with multiple segments)
    outcfg_update = {}

    # currently we need to distinguish between full rescoring (score and
    # probability) like with logistic regression model, or just putting
    # probabilities on top of default CN score using add_mixture_probability
    if scoring_model == "logistic_regression":
        scorer = pairs.LogisticRegressionScorer()

        # load amino acid/gap frequencies and conservation info
        freqs = pd.read_csv(kwargs["frequencies_file"])

        # NOTE: these two local names are referenced via "@" inside
        # the query strings below, do not rename
        num_sites = outcfg["num_sites"]
        min_seq_dist = kwargs["min_sequence_distance"]

        # rescore EC table
        ecs = scorer.score(
            ecs,
            freqs,
            kwargs["theta"],
            outcfg["effective_sequences"],
            num_sites,
            score=score
        )

        # currently only perform quality scoring for single segments
        if segments is None or len(segments) == 1:
            # running count of longrange ECs along the table
            is_longrange = ((ecs.i - ecs.j).abs() >= min_seq_dist).astype(int)
            ecs_lr = ecs.assign(
                longrange_count=is_longrange.cumsum()
            )

            # compute expectation for true positives on all contacts
            # (all rows up to and including the num_sites-th longrange EC)
            expected_positives_all = ecs_lr.query(
                "longrange_count <= @num_sites"
            ).probability.sum()

            # same, but restricted to the longrange ECs themselves
            expected_positives_longrange = ecs_lr.query(
                "longrange_count <= @num_sites and abs(i - j) >= @min_seq_dist"
            ).probability.sum()

            # store in config
            outcfg_update = {
                "expected_true_ecs_all": float(expected_positives_all),
                "expected_true_ecs_longrange": float(expected_positives_longrange)
            }

    else:
        # add mixture model probability, fitted on the same score
        # column that is exposed as "score" below
        # NOTE(review): assumes add_mixture_probability defaults to
        # score="cn", making this backward-compatible - confirm
        ecs = pairs.add_mixture_probability(
            ecs, model=scoring_model, score=score
        )

        # put CN score into default score column for more generic
        # downstream score handling
        ecs = ecs.assign(
            score=ecs[score]
        )

    # sort ECs
    ecs = ecs.sort_values(
        by="score", ascending=False
    )

    return ecs, outcfg_update