Example #1
    def _modify_segments(seg_list, seg_prefix):
        # extract segments from list representation into objects
        segs = [Segment.from_list(s) for s in seg_list]
        # update segment IDs
        for i, s in enumerate(segs, start=1):
            s.segment_id = "{}_{}".format(seg_prefix, i)

        return segs
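A minimal usage sketch for the helper above (hypothetical segment data; Segment and its to_list()/from_list() round trip are assumed to be available as in the other examples):

# hypothetical input: two segments serialized with Segment.to_list()
seg_lists = [
    Segment("aa", "PROT_A", 1, 50, list(range(1, 51))).to_list(),
    Segment("aa", "PROT_A", 60, 90, list(range(60, 91))).to_list(),
]

# renumber them under a common prefix; the segment IDs become "B_1" and "B_2"
segs = _modify_segments(seg_lists, "B")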
Example #2
def jackhmmer_search(**kwargs):
    """
    Protocol:

    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list of arguments passed to check_required in the code below.

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "sequence_id", "sequence_file", "sequence_download_url",
        "region", "first_index", "use_bitscores", "domain_threshold",
        "sequence_threshold", "database", "iterations", "cpu", "nobias",
        "reuse_alignment", "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
        "extract_annotation"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"], kwargs["sequence_file"],
        kwargs["sequence_download_url"], full_sequence_file)

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(full_seq,
                                                       kwargs["sequence_id"],
                                                       kwargs["region"],
                                                       kwargs["first_index"],
                                                       target_sequence_file)

    # run jackhmmer... allow reusing a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], len(cut_seq))

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use the /dev/null option of the jackhmmer function,
        # to avoid making assumptions about the operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
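A sketch of how this protocol could be invoked directly; all values are illustrative placeholders, and the database entry follows the kwargs[kwargs["database"]] lookup used in the code above:

# hypothetical configuration for a direct call to jackhmmer_search
outcfg = jackhmmer_search(
    prefix="output/EXAMPLE",
    sequence_id="EXAMPLE_HUMAN",
    sequence_file=None,                      # fetch sequence by ID instead
    sequence_download_url="https://rest.uniprot.org/uniprotkb/{}.fasta",
    region=(1, 100),
    first_index=1,
    use_bitscores=True,
    domain_threshold=0.5,
    sequence_threshold=0.5,
    database="uniref90",
    uniref90="/databases/uniref90.fasta",    # resolved via kwargs[kwargs["database"]]
    iterations=5,
    cpu=2,
    nobias=False,
    reuse_alignment=True,
    checkpoints_hmm=False,
    checkpoints_ali=False,
    jackhmmer="jackhmmer",                   # path to the jackhmmer binary
    extract_annotation=True,
)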
Example #3
def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start,
                     **kwargs):
    """
    Apply pairwise identity filtering, fragment filtering, and exclusion
    of columns with too many gaps to a sequence alignment. Also generate
    files describing properties of the alignment, such as frequency
    distributions, conservation, and an "old-style" alignment statistics file.

    .. note::

        Assumes a focus-mode (but otherwise unprocessed) alignment as input.

    .. todo::

        come up with something more clever than a fixed width to filter fragments
        (e.g. use the 95% quantile of the length distribution as a reference point)

    Parameters
    ----------
    focus_ali : Alignment
        Focus-mode input alignment
    target_seq_index : int
        Index of target sequence in alignment
    target_seq_id : str
        Identifier of target sequence (without range)
    region_start : int
        Index of first sequence position in target sequence
    kwargs
        See the required arguments in the check_required call below

    Returns
    -------
    outcfg : Dict
        File products generated by the function:

        * alignment_file
        * statistics_file
        * frequencies_file
        * identities_file
        * raw_focus_alignment_file
    ali : Alignment
        Final processed alignment
    """
    check_required(kwargs, [
        "prefix",
        "seqid_filter",
        "hhfilter",
        "minimum_sequence_coverage",
        "minimum_column_coverage",
        "compute_num_effective_seqs",
        "theta",
    ])

    prefix = kwargs["prefix"]

    create_prefix_folders(prefix)

    focus_fasta_file = prefix + "_raw_focus.fasta"

    outcfg = {
        "alignment_file": prefix + ".a2m",
        "statistics_file": prefix + "_alignment_statistics.csv",
        "frequencies_file": prefix + "_frequencies.csv",
        "identities_file": prefix + "_identities.csv",
        "raw_focus_alignment_file": focus_fasta_file,
    }

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if target_seq_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = target_seq_index
        indices[target_seq_index] = 0
        target_seq_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    # apply pairwise identity filter (using hhfilter)
    if kwargs["seqid_filter"] is not None:
        filtered_file = prefix + "_filtered.a3m"

        at.run_hhfilter(focus_fasta_file,
                        filtered_file,
                        threshold=kwargs["seqid_filter"],
                        columns="first",
                        binary=kwargs["hhfilter"])

        with open(filtered_file) as f:
            focus_ali = Alignment.from_file(f, "a3m")

        # final FASTA alignment before applying A2M format modifications
        filtered_fasta_file = prefix + "_raw_focus_filtered.fasta"
        with open(filtered_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

    ali = focus_ali

    # filter fragments
    # come up with something more clever here than a fixed width
    # (e.g. use the 95% quantile of the length distribution as a reference point)
    min_cov = kwargs["minimum_sequence_coverage"]
    if min_cov is not None:
        if isinstance(min_cov, int):
            min_cov /= 100

        keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov
        ali = ali.select(sequences=keep_seqs)

    # Calculate frequencies, conservation and identity to query
    # on the final alignment (except for lowercase modification).
    # Note: running hhfilter might cause a loss of the target sequence
    # if it is not the first sequence in the file! To be sure that
    # nothing goes wrong, target_seq_index should always be 0.
    describe_seq_identities(ali, target_seq_index=target_seq_index).to_csv(
        outcfg["identities_file"], float_format="%.3f", index=False)

    describe_frequencies(ali, region_start,
                         target_seq_index=target_seq_index).to_csv(
                             outcfg["frequencies_file"],
                             float_format="%.3f",
                             index=False)

    coverage_stats = describe_coverage(ali, prefix, region_start,
                                       kwargs["minimum_column_coverage"])

    # keep list of uppercase sequence positions in alignment
    pos_list = np.arange(region_start, region_start + ali.L, dtype="int32")

    # Make columns with too many gaps lowercase
    min_col_cov = kwargs["minimum_column_coverage"]
    if min_col_cov is not None:
        if isinstance(min_col_cov, int):
            min_col_cov /= 100

        lc_cols = ali.count(ali._match_gap, axis="pos") > 1 - min_col_cov
        ali = ali.lowercase_columns(lc_cols)

        # if we remove columns, we have to update list of positions
        pos_list = pos_list[~lc_cols]
    else:
        lc_cols = None

    # compute effective number of sequences
    # (this is intended for cases where coupling stage is
    # not run, but this number is wanted nonetheless)
    if kwargs["compute_num_effective_seqs"]:
        # make sure we only compute N_eff on the columns
        # that would be used for model inference, and
        # discard the rest
        if lc_cols is None:
            cut_ali = ali
        else:
            cut_ali = ali.select(columns=~lc_cols)

        # compute sequence weights
        cut_ali.set_weights(kwargs["theta"])

        # N_eff := sum of all sequence weights
        n_eff = float(cut_ali.weights.sum())

        # patch into coverage statistics (N_eff column)
        coverage_stats.loc[:, "N_eff"] = n_eff
    else:
        n_eff = None

    # save coverage statistics to file
    coverage_stats.to_csv(outcfg["statistics_file"],
                          float_format="%.3f",
                          index=False)

    # store description of final sequence alignment in outcfg
    # (note these parameters will be updated by couplings protocol)
    outcfg.update({
        "num_sites": len(pos_list),
        "num_sequences": len(ali),
        "effective_sequences": n_eff,
        "region_start": region_start,
    })

    # create segment in outcfg
    outcfg["segments"] = [
        Segment("aa", target_seq_id, region_start, region_start + ali.L - 1,
                pos_list).to_list()
    ]

    with open(outcfg["alignment_file"], "w") as f:
        ali.write(f, "fasta")

    return outcfg, ali
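A sketch of a stand-alone call with illustrative arguments; note that integer coverage thresholds are divided by 100 inside the function:

# hypothetical: load a focus alignment written by an earlier stage
with open("output/EXAMPLE_raw_focus.fasta") as f:
    focus_ali = Alignment.from_file(f, "fasta")

mod_outcfg, ali = modify_alignment(
    focus_ali,
    target_seq_index=0,
    target_seq_id="EXAMPLE_HUMAN",
    region_start=1,
    prefix="output/EXAMPLE",
    seqid_filter=None,              # skip the hhfilter step
    hhfilter="hhfilter",            # placeholder binary path (unused if seqid_filter is None)
    minimum_sequence_coverage=50,   # int -> interpreted as 50%
    minimum_column_coverage=70,     # int -> interpreted as 70%
    compute_num_effective_seqs=False,
    theta=0.8,
)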
Example #4
def standard(**kwargs):
    """
    Protocol:

    Standard buildali4 workflow (run iterative jackhmmer
    search against sequence database, then determine which
    sequences and columns to include in the calculation based
    on coverage and maximum gap thresholds).

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list of arguments passed to check_required in the code below.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * alignment_file
        * raw_alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * target_sequence_file
        * sequence_file
        * annotation_file
        * frequencies_file
        * identities_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments

    """
    check_required(kwargs, [
        "prefix",
        "extract_annotation",
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # first step of protocol is to get alignment using
    # jackhmmer; initialize output configuration with
    # results of this search
    jackhmmer_outcfg = jackhmmer_search(**kwargs)
    stockholm_file = jackhmmer_outcfg["raw_alignment_file"]

    segment = Segment.from_list(jackhmmer_outcfg["segments"][0])
    target_seq_id = segment.sequence_id
    region_start = segment.region_start
    region_end = segment.region_end

    # read in stockholm format (with full annotation)
    with open(stockholm_file) as a:
        ali_raw = Alignment.from_file(a, "stockholm")

    # and store as FASTA file first (disabled for now,
    # since equivalent information can easily be obtained
    # from the Stockholm file)
    """
    ali_raw_fasta_file = prefix + "_raw.fasta"
    with open(ali_raw_fasta_file, "w") as f:
        ali_raw.write(f, "fasta")
    """

    # save annotation in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        annotation = extract_header_annotation(ali_raw)
        annotation.to_csv(annotation_file, index=False)

    # center alignment around focus/search sequence
    focus_cols = np.array([c != "-" for c in ali_raw[0]])
    focus_ali = ali_raw.select(columns=focus_cols)

    target_seq_index = 0
    mod_outcfg, ali = modify_alignment(focus_ali, target_seq_index,
                                       target_seq_id, region_start, **kwargs)

    # merge results of jackhmmer_search and modify_alignment stages
    outcfg = {
        **jackhmmer_outcfg,
        **mod_outcfg,
        "annotation_file": annotation_file
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_standard.outcfg", outcfg)

    # return results of protocol
    return outcfg
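In practice this protocol is usually driven from a parsed job configuration rather than literal keyword arguments; a rough sketch (the file name and the "global"/"align" config layout are illustrative assumptions):

# hypothetical: merge global and alignment-stage settings, then run the protocol
config = read_config_file("example_job.yml")
align_kwargs = {**config["global"], **config["align"]}
outcfg = standard(**align_kwargs)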
Example #5
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and
    search against a sequence database using hmmsearch.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list of arguments passed to check_required in the code below.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is the starting point of the pipeline;
        # check that the input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the gap threshold for column inclusion in the HMM built by hmmbuild
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(kwargs, [
        "prefix", "sequence_id", "alignment_file", "use_bitscores",
        "domain_threshold", "sequence_threshold", "database", "cpu", "nobias",
        "reuse_alignment", "hmmbuild", "hmmsearch"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow reusing a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], sequence_length)

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())
        # only item from hmmsearch_result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to FASTA format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
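A sketch of a direct call with placeholder paths; first_index is included because it is read inside _format_alignment_for_hmmbuild even though it is not part of the check_required list:

# hypothetical configuration for a direct call to hmmbuild_and_search
outcfg = hmmbuild_and_search(
    prefix="output/EXAMPLE_hmm",
    sequence_id="EXAMPLE_HUMAN",
    alignment_file="input/EXAMPLE_seed.sto",
    first_index=1,
    use_bitscores=True,
    domain_threshold=0.5,
    sequence_threshold=0.5,
    database="uniref90",
    uniref90="/databases/uniref90.fasta",
    cpu=2,
    nobias=False,
    reuse_alignment=False,
    hmmbuild="hmmbuild",      # path to the hmmbuild binary
    hmmsearch="hmmsearch",    # path to the hmmsearch binary
)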
Example #6
def secondary_structure(**kwargs):
    """
    Predict or load secondary structure for an
    input sequence

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list of arguments passed to check_required in the code below.

    Returns
    -------
    residues : pandas.DataFrame
        Table with sequence and secondary structure
        in columns i, A_i and sec_struct_3state
    """
    check_required(
        kwargs,
        [
            "prefix", "target_sequence_file",
            "segments", "sec_struct_method",
            "sec_struct_file", "psipred",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    secstruct_file = kwargs["sec_struct_file"]
    if secstruct_file is not None:
        verify_resources(
            "Secondary structure prediction file does not exist/is empty",
            secstruct_file
        )
        residues = pd.read_csv(secstruct_file)
    else:
        # make sure target sequence file is there so we can
        # predict secondary structure
        target_seq_file = kwargs["target_sequence_file"]
        verify_resources(
            "Sequence file does not exist/is empty", target_seq_file
        )

        # we need to figure out what the index of the first residue
        # in the target sequence is; obtain first index from segment
        # information if possible
        if kwargs["segments"] is not None:
            s = Segment.from_list(kwargs["segments"][0])
            first_index = s.region_start
        else:
            # otherwise try to get it from sequence file
            first_index = None

            with open(target_seq_file) as f:
                header, _ = next(read_fasta(f))
                if header is not None:
                    _, first_index, _ = parse_header(header)

                # if we cannot identify first index from header,
                # do not make guesses but fail
                if first_index is None:
                    raise InvalidParameterError(
                        "Could not unambiguously identify sequence range from "
                        "FASTA header, needs to specified as id/start-end: {}".format(
                            header
                        )
                    )

        # finally, run secondary structure prediction
        if kwargs["sec_struct_method"] == "psipred":
            # store psipred output in a separate directory
            output_dir = path.join(path.dirname(prefix), "psipred")

            # run psipred
            ss2_file, horiz_file = run_psipred(
                target_seq_file, output_dir, binary=kwargs["psipred"]
            )

            # parse output, renumber to first index
            residues = read_psipred_prediction(
                horiz_file, first_index=first_index
            )
        else:
            raise InvalidParameterError(
                "Secondary structure prediction method not implemented: "
                "{}. Valid choices: psipred".format(kwargs["sec_struct_method"])
            )

    # return predicted table
    return residues
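A sketch of running the secondary structure step on its own, reusing a segment definition like the ones produced by the alignment protocols above (paths and the psipred binary name are placeholders):

residues = secondary_structure(
    prefix="output/EXAMPLE_fold",
    target_sequence_file="output/EXAMPLE.fa",
    segments=[
        Segment("aa", "EXAMPLE_HUMAN", 1, 100, list(range(1, 101))).to_list()
    ],
    sec_struct_method="psipred",
    sec_struct_file=None,    # None triggers prediction instead of loading a file
    psipred="runpsipred",    # placeholder for the psipred executable
)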
Example #7
def standard(**kwargs):
    """
    Protocol:
    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list of arguments passed to check_required in the code below.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query("abs(i - j) > {}".format(
        kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get it from EC values (could be misleading if
            # the EC list is truncated, so this is only the fallback)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs,
                     "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of EC counts to use for the folding sub-runs
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory; after folding,
    # final models will be moved to the main folding dir. Depending
    # on the cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map back to original key from remapped_pdb_files as a key for this list
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
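A worked example (with hypothetical numbers) of how the fractional fold_lowest_count / fold_highest_count / fold_increase settings are converted into discrete EC counts by _discrete_count above:

from math import ceil

num_sites = 150   # hypothetical number of positions covered by ECs
flc, fhc, fi = 0.5, 1.3, 0.05

lowest = ceil(flc * num_sites)    # 75
highest = ceil(fhc * num_sites)   # 195
step = ceil(fi * num_sites)       # 8

ec_counts = list(range(lowest, highest + 1, step))
# -> folding sub-runs using the top 75, 83, 91, ..., 195 ECs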
Example #8
def complex(**kwargs):
    """
    Protocol:
    Mutation effect prediction and visualization for protein complexes

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list of arguments passed to check_required in the code below.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * [mutation_dataset_predicted_file]
    """
    check_required(
        kwargs, ["prefix", "model_file", "mutation_dataset_file", "segments"])

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources("Model parameter file does not exist",
                     kwargs["model_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load segments to create couplings object
    segment_objects = []
    for segment_list in kwargs["segments"]:
        segment_objects.append(Segment.from_list(segment_list))

    first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id
    second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id

    first_chain_name = Segment.from_list(
        kwargs["segments"][0]).default_chain_name()
    second_chain_name = Segment.from_list(
        kwargs["segments"][1]).default_chain_name()

    # load couplings object
    c = MultiSegmentCouplingsModel(kwargs["model_file"], *segment_objects)

    # create the independent model
    c0 = c.to_independent_model()

    # create the inter-protein only Jij model
    ci = c.to_inter_segment_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent"),
                         (ci, "Inter_segment")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower())
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add predictions by the independent and inter-segment
    # models, and save to file
    singles = single_mutant_matrix(c, output_column="prediction_epistatic")

    singles = predict_mutation_table(c0, singles, "prediction_independent")

    singles = predict_mutation_table(ci, singles, "prediction_inter_segment")

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent", "inter_segment"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles,
            pml_filename,
            effect_column="prediction_" + model,
            segment_to_chain_mapping={
                first_segment_name: first_chain_name,
                second_segment_name: second_chain_name
            })
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#", sep=",")

        if "segment" not in data.columns:
            raise ValueError("Input mutation dataset file does not contain "
                             "a column called 'segment' to specify the "
                             "protein of origin for each mutation")

        # add epistatic model prediction
        data_pred = predict_mutation_table(c, data, "prediction_epistatic")

        # add independent model prediction
        data_pred = predict_mutation_table(c0, data_pred,
                                           "prediction_independent")

        data_pred = predict_mutation_table(ci, data_pred, "inter_segment")

        outcfg["mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv"
        data_pred.to_csv(outcfg["mutation_dataset_predicted_file"],
                         index=False)

    return outcfg
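A sketch of invoking the complex mutation protocol directly; the two segments, their IDs and all paths are placeholders, with segment IDs set via the attribute shown in Example #1:

# hypothetical two-segment setup for a heterodimer model
seg_a = Segment("aa", "PROT_A", 1, 120, list(range(1, 121)))
seg_a.segment_id = "A_1"
seg_b = Segment("aa", "PROT_B", 1, 95, list(range(1, 96)))
seg_b.segment_id = "B_1"

outcfg = complex(
    prefix="output/complex_mut",
    model_file="output/complex.model",
    mutation_dataset_file=None,    # no experimental dataset to score
    segments=[seg_a.to_list(), seg_b.to_list()],
)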