Ejemplo n.º 1
0
    def _add_gaps_to_query(query_sequence_ali, ali):

        # get the index of columns that do not contain match states (indicated by an x)
        gap_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x != "x"
        ]
        # get the index of columns that contain match states (indicated by an x)
        match_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x"
        ]

        # ensure that the length of the match states
        # match the length of the sequence
        if len(match_index) != query_sequence_ali.L:
            raise ValueError("HMMsearch result {} does not have a one-to-one"
                             " mapping to the query sequence columns".format(
                                 ar["raw_alignment_file"]))

        gapped_query_sequence = ""
        seq = list(query_sequence_ali.matrix[0, :])

        # loop through every position in the HMMsearch hits
        for i in range(len(ali.annotation["GC"]["RF"])):
            # if that position should be a gap, add a gap
            if i in gap_index:
                gapped_query_sequence += "-"
            # if that position should be a letter, pop the next
            # letter in the query sequence
            else:
                gapped_query_sequence += seq.pop(0)

        new_sequence_ali = Alignment.from_dict(
            {query_sequence_ali.ids[0]: gapped_query_sequence})
        return new_sequence_ali
Ejemplo n.º 2
0
def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start,
                     **kwargs):
    """
    Apply pairwise identity filtering, fragment filtering, and exclusion
    of columns with too many gaps to a sequence alignment. Also generates
    files describing properties of the alignment such as frequency distributions,
    conservation, and "old-style" alignment statistics files.

    .. note::

        assumes focus alignment (otherwise unprocessed) as input.

    .. todo::

        come up with something more clever  to filter fragments than fixed width
        (e.g. use 95% quantile of length distribution as reference point)

    Parameters
    ----------
    focus_ali : Alignment
        Focus-mode input alignment
    target_seq_index : int
        Index of target sequence in alignment
    target_seq_id : str
        Identifier of target sequence (without range)
    region_start : int
        Index of first sequence position in target sequence
    kwargs : See required arguments in source code

    Returns
    -------
    outcfg : Dict
        File products generated by the function:

        * alignment_file
        * statistics_file
        * frequencies_file
        * identities_file
        * raw_focus_alignment_file
    ali : Alignment
        Final processed alignment
    """
    check_required(kwargs, [
        "prefix",
        "seqid_filter",
        "hhfilter",
        "minimum_sequence_coverage",
        "minimum_column_coverage",
        "compute_num_effective_seqs",
        "theta",
    ])

    prefix = kwargs["prefix"]

    create_prefix_folders(prefix)

    focus_fasta_file = prefix + "_raw_focus.fasta"

    outcfg = {
        "alignment_file": prefix + ".a2m",
        "statistics_file": prefix + "_alignment_statistics.csv",
        "frequencies_file": prefix + "_frequencies.csv",
        "identities_file": prefix + "_identities.csv",
        "raw_focus_alignment_file": focus_fasta_file,
    }

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if target_seq_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = target_seq_index
        indices[target_seq_index] = 0
        target_seq_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    # apply pairwise identity filter (using hhfilter)
    if kwargs["seqid_filter"] is not None:
        filtered_file = prefix + "_filtered.a3m"

        at.run_hhfilter(focus_fasta_file,
                        filtered_file,
                        threshold=kwargs["seqid_filter"],
                        columns="first",
                        binary=kwargs["hhfilter"])

        with open(filtered_file) as f:
            focus_ali = Alignment.from_file(f, "a3m")

        # final FASTA alignment before applying A2M format modifications
        filtered_fasta_file = prefix + "_raw_focus_filtered.fasta"
        with open(filtered_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

    ali = focus_ali

    # filter fragments
    # come up with something more clever here than fixed width
    # (e.g. use 95% quantile of length distribution as reference point)
    min_cov = kwargs["minimum_sequence_coverage"]
    if min_cov is not None:
        if isinstance(min_cov, int):
            min_cov /= 100

        keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov
        ali = ali.select(sequences=keep_seqs)

    # Calculate frequencies, conservation and identity to query
    # on final alignment (except for lowercase modification)
    # Note: running hhfilter might cause a loss of the target seque
    # if it is not the first sequence in the file! To be sure that
    # nothing goes wrong, target_seq_index should always be 0.
    describe_seq_identities(ali, target_seq_index=target_seq_index).to_csv(
        outcfg["identities_file"], float_format="%.3f", index=False)

    describe_frequencies(ali, region_start,
                         target_seq_index=target_seq_index).to_csv(
                             outcfg["frequencies_file"],
                             float_format="%.3f",
                             index=False)

    coverage_stats = describe_coverage(ali, prefix, region_start,
                                       kwargs["minimum_column_coverage"])

    # keep list of uppercase sequence positions in alignment
    pos_list = np.arange(region_start, region_start + ali.L, dtype="int32")

    # Make columns with too many gaps lowercase
    min_col_cov = kwargs["minimum_column_coverage"]
    if min_col_cov is not None:
        if isinstance(min_col_cov, int):
            min_col_cov /= 100

        lc_cols = ali.count(ali._match_gap, axis="pos") > 1 - min_col_cov
        ali = ali.lowercase_columns(lc_cols)

        # if we remove columns, we have to update list of positions
        pos_list = pos_list[~lc_cols]
    else:
        lc_cols = None

    # compute effective number of sequences
    # (this is intended for cases where coupling stage is
    # not run, but this number is wanted nonetheless)
    if kwargs["compute_num_effective_seqs"]:
        # make sure we only compute N_eff on the columns
        # that would be used for model inference, dispose
        # the rest
        if lc_cols is None:
            cut_ali = ali
        else:
            cut_ali = ali.select(columns=~lc_cols)

        # compute sequence weights
        cut_ali.set_weights(kwargs["theta"])

        # N_eff := sum of all sequence weights
        n_eff = float(cut_ali.weights.sum())

        # patch into coverage statistics (N_eff column)
        coverage_stats.loc[:, "N_eff"] = n_eff
    else:
        n_eff = None

    # save coverage statistics to file
    coverage_stats.to_csv(outcfg["statistics_file"],
                          float_format="%.3f",
                          index=False)

    # store description of final sequence alignment in outcfg
    # (note these parameters will be updated by couplings protocol)
    outcfg.update({
        "num_sites": len(pos_list),
        "num_sequences": len(ali),
        "effective_sequences": n_eff,
        "region_start": region_start,
    })

    # create segment in outcfg
    outcfg["segments"] = [
        Segment("aa", target_seq_id, region_start, region_start + ali.L - 1,
                pos_list).to_list()
    ]

    with open(outcfg["alignment_file"], "w") as f:
        ali.write(f, "fasta")

    return outcfg, ali
Ejemplo n.º 3
0
def _make_hmmsearch_raw_fasta(alignment_result, prefix):
    """
    HMMsearch results do not contain the query sequence
    so we must construct a raw_fasta file with the query 
    sequence as the first hit, to ensure proper numbering. 
    The search result is filtered to only contain the columns with
    match states to the HMM, which has a one to one mapping to the
    query sequence.

    Paramters
    ---------
    alignment_result : dict
        Alignment result dictionary, output by run_hmmsearch
    prefix : str
        Prefix for file creation

    Returns
    -------
    str
        path to raw focus alignment file

    """
    def _add_gaps_to_query(query_sequence_ali, ali):

        # get the index of columns that do not contain match states (indicated by an x)
        gap_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x != "x"
        ]
        # get the index of columns that contain match states (indicated by an x)
        match_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x"
        ]

        # ensure that the length of the match states
        # match the length of the sequence
        if len(match_index) != query_sequence_ali.L:
            raise ValueError("HMMsearch result {} does not have a one-to-one"
                             " mapping to the query sequence columns".format(
                                 ar["raw_alignment_file"]))

        gapped_query_sequence = ""
        seq = list(query_sequence_ali.matrix[0, :])

        # loop through every position in the HMMsearch hits
        for i in range(len(ali.annotation["GC"]["RF"])):
            # if that position should be a gap, add a gap
            if i in gap_index:
                gapped_query_sequence += "-"
            # if that position should be a letter, pop the next
            # letter in the query sequence
            else:
                gapped_query_sequence += seq.pop(0)

        new_sequence_ali = Alignment.from_dict(
            {query_sequence_ali.ids[0]: gapped_query_sequence})
        return new_sequence_ali

    # open the sequence file
    with open(alignment_result["target_sequence_file"]) as a:
        query_sequence_ali = Alignment.from_file(a, format="fasta")

    # if the provided alignment is empty, just return the target sequence
    raw_focus_alignment_file = prefix + "_raw.fasta"
    if not valid_file(alignment_result["raw_alignment_file"]):
        # write the query sequence to a fasta file
        with open(raw_focus_alignment_file, "w") as of:
            query_sequence_ali.write(of)

        # return as an alignment object
        return raw_focus_alignment_file

    # else, open the HMM search result
    with open(alignment_result["raw_alignment_file"]) as a:
        ali = Alignment.from_file(a, format="stockholm")

    # make sure that the stockholm alignment contains the match annotation
    if not ("GC" in ali.annotation and "RF" in ali.annotation["GC"]):
        raise ValueError("Stockholm alignment {} missing RF"
                         " annotation of match states".format(
                             alignment_result["raw_alignment_file"]))

    # add insertions to the query sequence in order to preserve correct
    # numbering of match sequences
    gapped_sequence_ali = _add_gaps_to_query(query_sequence_ali, ali)

    # write a new alignment file with the query sequence as
    # the first entry

    with open(raw_focus_alignment_file, "w") as of:
        gapped_sequence_ali.write(of)
        ali.write(of)

    return raw_focus_alignment_file
Ejemplo n.º 4
0
def existing(**kwargs):
    """
    Protocol:

    Use external sequence alignment and extract all relevant
    information from there (e.g. sequence, region, etc.),
    then apply gap & fragment filtering as usual

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * sequence_file
        * first_index
        * target_sequence_file
        * annotation_file (None)
        * frequencies_file
        * identities_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "input_alignment", "sequence_id", "first_index",
        "extract_annotation"
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this file is starting point of pipeline;
    # check if input alignment actually exists
    input_alignment = kwargs["input_alignment"]
    verify_resources("Input alignment does not exist", input_alignment)

    # first try to autodetect format of alignment
    with open(input_alignment) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment))

    with open(input_alignment) as f:
        ali_raw = Alignment.from_file(f, format)

    # save annotation in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        from_anno_line = (format == "stockholm")
        annotation = extract_header_annotation(ali_raw,
                                               from_annotation=from_anno_line)
        annotation.to_csv(annotation_file, index=False)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id))

    # identify what columns (non-gap) to keep for focus
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq
    ])

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given.")

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)

    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # apply sequence identity and fragment filters,
    # and gap threshold
    mod_outcfg, ali = modify_alignment(focus_ali, focus_index, id_,
                                       region_start, **kwargs)

    # generate output configuration of protocol
    outcfg = {
        **mod_outcfg,
        "sequence_id": sequence_id,
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "target_sequence_file": target_sequence_file,
        "focus_sequence": header,
        "focus_mode": True,
    }

    if annotation_file is not None:
        outcfg["annotation_file"] = annotation_file

    # dump config to YAML file for debugging/logging
    write_config_file(prefix + ".align_existing.outcfg", outcfg)

    # return results of protocol
    return outcfg
Ejemplo n.º 5
0
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end
Ejemplo n.º 6
0
def standard(**kwargs):
    """
    Protocol:

    Standard buildali4 workflow (run iterative jackhmmer
    search against sequence database, than determine which
    sequences and columns to include in the calculation based
    on coverage and maximum gap thresholds).

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * alignment_file
        * raw_alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * target_sequence_file
        * sequence_file
        * annotation_file
        * frequencies_file
        * identities_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments

    ali : Alignment
        Final sequence alignment

    """
    check_required(kwargs, [
        "prefix",
        "extract_annotation",
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # first step of protocol is to get alignment using
    # jackhmmer; initialize output configuration with
    # results of this search
    jackhmmer_outcfg = jackhmmer_search(**kwargs)
    stockholm_file = jackhmmer_outcfg["raw_alignment_file"]

    segment = Segment.from_list(jackhmmer_outcfg["segments"][0])
    target_seq_id = segment.sequence_id
    region_start = segment.region_start
    region_end = segment.region_end

    # read in stockholm format (with full annotation)
    with open(stockholm_file) as a:
        ali_raw = Alignment.from_file(a, "stockholm")

    # and store as FASTA file first (disabled for now
    # since equivalent information easily be obtained
    # from Stockholm file
    """
    ali_raw_fasta_file = prefix + "_raw.fasta"
    with open(ali_raw_fasta_file, "w") as f:
        ali_raw.write(f, "fasta")
    """

    # save annotation in sequence headers (species etc.)
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        annotation = extract_header_annotation(ali_raw)
        annotation.to_csv(annotation_file, index=False)

    # center alignment around focus/search sequence
    focus_cols = np.array([c != "-" for c in ali_raw[0]])
    focus_ali = ali_raw.select(columns=focus_cols)

    target_seq_index = 0
    mod_outcfg, ali = modify_alignment(focus_ali, target_seq_index,
                                       target_seq_id, region_start, **kwargs)

    #  merge results of jackhmmer_search and modify_alignment stage
    outcfg = {
        **jackhmmer_outcfg,
        **mod_outcfg, "annotation_file": annotation_file
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_standard.outcfg", outcfg)

    # return results of protocol
    return outcfg
Ejemplo n.º 7
0
def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
    """
    Identify homologs using jackhmmer or hmmbuild/hmmsearch

    Parameters
    ----------
    pdb_alignment_method : {"jackhmmer", "hmmsearch"}, 
             optional (default: "jackhmmer")
        Sequence alignment method used for searching the PDB
    **kwargs
        Passed into jackhmmer / hmmbuild_and_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits
    """

    # load default configuration
    config = parse_config(HMMER_CONFIG)

    # update with overrides from kwargs
    config = {
        **config,
        **kwargs,
    }

    # create temporary output if no prefix is given
    if config["prefix"] is None:
        config["prefix"] = path.join(tempdir(), "compare")

    check_required(config, ["prefix"])

    # run hmmsearch (possibly preceded by hmmbuild)
    if pdb_alignment_method == "hmmsearch":
        # set up config to run hmmbuild_and_search on the unfiltered alignment file
        updated_config = deepcopy(config)
        updated_config["alignment_file"] = config.get(
            "raw_focus_alignment_file")
        ar = hmmbuild_and_search(**updated_config)

        # For hmmbuild and search, we have to read the raw focus alignment file
        # to guarantee that the query sequence is present
        with open(ar["raw_focus_alignment_file"]) as a:
            ali = Alignment.from_file(a, "fasta")

    # run jackhmmer against sequence database
    # at this point we have already checked to ensure
    # that the input is either jackhmmer or hmmsearch
    elif pdb_alignment_method == "jackhmmer":
        ar = jackhmmer_search(**config)

        with open(ar["raw_alignment_file"]) as a:
            ali = Alignment.from_file(a, "stockholm")

        # write alignment as FASTA file for easier checking by hand,
        # if necessary
        with open(config["prefix"] + "_raw.fasta", "w") as f:
            ali.write(f)
    else:
        raise InvalidParameterError(
            "Invalid pdb_alignment_method selected. Valid options are: " +
            ", ".join(["jackhmmer", "hmmsearch"]))

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(ar["hittable_file"])

    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[1])
    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[2])

    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
            "hmm_from": "hmm_start",
            "hmm_to": "hmm_end",
        })

    hits.loc[:, "alignment_start"] = pd.to_numeric(
        hits.alignment_start).astype(int)
    hits.loc[:,
             "alignment_end"] = pd.to_numeric(hits.alignment_end).astype(int)

    hits.loc[:, "alignment_id"] = (hits.target_name + "/" +
                                   hits.alignment_start.astype(str) + "-" +
                                   hits.alignment_end.astype(str))

    hits = hits.loc[:, [
        "alignment_id", "uniprot_ac", "uniprot_id", "alignment_start",
        "alignment_end", "bitscore", "e_value"
    ]]

    return ali, hits
Ejemplo n.º 8
0
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
Ejemplo n.º 9
0
def find_homologs_jackhmmer(**kwargs):
    """
    Identify homologs using jackhmmer

    Parameters
    ----------
    **kwargs
        Passed into jackhmmer_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits
    """
    # load default configuration
    config = parse_config(JACKHMMER_CONFIG)

    # update with overrides from kwargs
    config = {
        **config,
        **kwargs,
    }

    # create temporary output if no prefix is given
    if config["prefix"] is None:
        config["prefix"] = path.join(tempdir(), "compare")

    # run jackhmmer against sequence database
    ar = jackhmmer_search(**config)

    with open(ar["raw_alignment_file"]) as a:
        ali = Alignment.from_file(a, "stockholm")

    # write alignment as FASTA file for easier checking by hand,
    # if necessary
    with open(config["prefix"] + "_raw.fasta", "w") as f:
        ali.write(f)

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(ar["hittable_file"])

    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[1])
    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[2])

    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
        })

    hits.loc[:, "alignment_start"] = pd.to_numeric(
        hits.alignment_start).astype(int)
    hits.loc[:,
             "alignment_end"] = pd.to_numeric(hits.alignment_end).astype(int)

    hits.loc[:, "alignment_id"] = (hits.target_name + "/" +
                                   hits.alignment_start.astype(str) + "-" +
                                   hits.alignment_end.astype(str))

    hits = hits.loc[:, [
        "alignment_id", "uniprot_ac", "uniprot_id", "alignment_start",
        "alignment_end", "bitscore", "e_value"
    ]]

    return ali, hits
Ejemplo n.º 10
0
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    # write the sorted ECs table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"],
                score_column="cn"  # "di
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs, score="cn")  # "di"
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # create JSON output and write to file
            f.write(
                evzoom_json(model) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
Ejemplo n.º 11
0
def filter_best_reciprocal(alignment,
                           paralogs,
                           most_similar_in_species,
                           allowed_error=0.02):
    """
    Takes in a dictionary of the best hit to each genome
    Removes sequences that are not the best reciprocal hit to the query sequence

    Parameters
    ----------
    alignment : str
        Path to sequence alignment file
    paralogs : pd.DataFrame
        Rows correspond to paralogs to the query sequence
        Created by find_paralogs() function
    most_similar_in_species : pd.DataFrame
        Contains the id, species name, and percent identity to query
        for each sequence that was the best hit to the query in its 
        respective species
    allowed_error : float
        In order for a sequence to be filtered out of the alignment, 
        it must be more identitical to a paralog sequence than the 
        target sequence by at least this amount

    Returns
    -------
    pd.DataFrame
        Contains the id, species name, and percenty identity to query
        for each sequence that was the best reciprocal hit to the query sequence
    """
    with open(alignment, "r") as inf:
        ali = Alignment.from_file(inf)

    # Create an n_paralogs x n_sequences ndarray
    # where entry i,j is percent identity of paralog i to sequence j
    # note the identity here will be different than for the unfiltered alignment

    # initialize the matrix
    identity_mat = np.zeros((len(paralogs), len(ali.ids)), dtype=float)

    for idx, paralog_id in enumerate(paralogs.id):
        # calculate the % identity of every seq in the alignment to current paralog
        identities = ali.identities_to(ali[ali.id_to_index[paralog_id]])

        # save the results
        identity_mat[idx, :] = identities

    indices_to_keep = []
    # for every sequence in the alignment that is the most similar to the query
    # in its respective species...
    for index, row in most_similar_in_species.iterrows():

        # get the index of that sequence in the alignment.
        alignment_index = ali.id_to_index[row.id]

        # Keep sequences if they are the best reciprocal hit -
        # i.e., that sequence is not more similar to any paralog
        # than it is to the query sequence
        if np.all(identity_mat[:, alignment_index] < row.identity_to_query +
                  allowed_error):

            indices_to_keep.append(index)

    return most_similar_in_species.loc[indices_to_keep, :]
Ejemplo n.º 12
0
def alignment_index_mapping(alignment_file,
                            format="stockholm",
                            target_seq=None):
    """

    Create index mapping table between sequence positions
    based on a sequence alignment.

    Parameters
    ----------
    alignment_file : str
        Path of alignment file containing sequences for
        which indices shoul dbe mapped
    format : {"stockholm", "fasta"}
        Format of alignment file
    target_seq : str, optional (default: None)
        Identifier of sequence around which the index
        mapping will be centered. If None, first sequence
        in alignment will be used.

    Returns
    -------
    pandas.DataFrame
        Mapping table containing assignment of

        1. index in target sequence (i)
        2. symbol in target sequence (A_i)

        For all other sequences in alignment, the following
        two columns:

        3. index in second sequence (j_<sequence id>)
        4. symbol in second sequence (A_j_<sequence_id>)
    """
    # read alignment that is basis of mapping
    with open(alignment_file) as a:
        ali = Alignment.from_file(a, format)

    # determine index of target sequence if necessary
    # (default: first sequence in alignment)
    if target_seq is None:
        target_seq_index = 0
    else:
        for i, full_id in enumerate(ali.ids):
            if full_id.startswith(target_seq):
                target_seq_index = i

    # get range and sequence of target
    id_, target_start, target_end = parse_header(ali.ids[target_seq_index])
    target_seq = ali.matrix[target_seq_index]

    # now map from target numbering to hit numbering
    full_map = None

    for i, full_id in enumerate(ali.ids):
        if i == target_seq_index:
            continue

        # extract information about sequence we are comparing to
        id_, region_start, region_end = parse_header(full_id)
        other_seq = ali.matrix[i]

        # compute mapping table
        map_df = map_indices(target_seq, target_start, target_end, other_seq,
                             region_start, region_end,
                             [ali._match_gap, ali._insert_gap])

        # adjust column names for non-target sequence
        map_df = map_df.rename(columns={
            "j": "i_" + full_id,
            "A_j": "A_i_" + full_id,
        })

        # add to full mapping table, left outer join
        # so all positions in target sequence are kept
        if full_map is None:
            full_map = map_df
        else:
            full_map = full_map.merge(map_df, on=("i", "A_i"), how="left")

    return full_map
Ejemplo n.º 13
0
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
            "ec_score_type",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    # Note: this now deviates from the original EC format
    # file because it has 4 score columns to accomodate
    # MI (raw), MI (APC-corrected), DI, CN;
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        names=["i", "A_i", "j", "A_j", "mi_raw", "mi_apc", "di", "cn"]
    )

    # select target score;
    # by default select CN score, since it allows to compute probabilities etc.
    ec_score_type = kwargs.get("ec_score_type", "cn")
    valid_ec_type_choices = ["cn", "di", "mi_raw", "mi_apc"]

    if ec_score_type not in valid_ec_type_choices:
        raise InvalidParameterError(
            "Invalid choice for valid_ec_type: {}, valid options are: {}".format(
                ec_score_type, ", ".join(valid_ec_type_choices)
            )
        )

    # perform rescoring if CN score is selected, otherwise cannot rescore
    # since all models are based on distribution shapes generated by CN score
    if ec_score_type == "cn":
        # perform EC rescoring starting from CN score output by plmc;
        # outconfig update will be merged further down in final outcfg merge

        # returned list is already sorted
        ecs, rescorer_outcfg_update = rescore_cn_score_ecs(
            ecs, segments, outcfg, kwargs, score="cn"
        )
    else:
        # If MI or DI, cannot apply distribution-based rescoring approaches,
        # so just set score column and add dummy probability value for compatibility
        # with downstream code
        ecs = ecs.assign(
            score=ecs[ec_score_type],
            probability=np.nan
        ).sort_values(
            by="score",
            ascending=False
        )

        # no additional values to be updated in outcfg in this case
        rescorer_outcfg_update = {}

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **rescorer_outcfg_update,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment,
            score="score"
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg