Example 1
def _docking_config(config_file=None):
    """
    Load docking configuration

    Parameters
    ----------
    config_file: str, optional (default: None)
        Path to configuration file. If None,
        loads default configuration included
        with package.

    Returns
    -------
    dict
        Loaded configuration
    """
    if config_file is None:
        # get path of config within package
        config_file = resource_filename(
            __name__, "cns_templates/haddock_restraints.yml")

    # check if config file exists and read
    verify_resources("Folding config file does not exist or is empty",
                     config_file)

    return read_config_file(config_file)
Example 2
def run_hhfilter(input_file,
                 output_file,
                 threshold=95,
                 columns="a2m",
                 binary="hhfilter"):
    """
    Redundancy-reduce a sequence alignment using hhfilter
    from the HHsuite alignment suite.

    Parameters
    ----------
    input_file : str
        Path to input alignment in A2M/FASTA format
    output_file : str
        Path to output alignment (will be in A3M format)
    threshold : int, optional (default: 95)
        Sequence identity threshold for maximum pairwise
        identity (between 0 and 100)
    columns : {"first", "a2m"}, optional (default: "a2m")
        Definition of match columns (based on first sequence
        or upper-case columns (a2m))
    binary : str
        Path to hhfilter binary

    Returns
    -------
    str
        output_file

    Raises
    ------
    ResourceError
        If output alignment is non-existent/empty
    ValueError
        Upon invalid value of columns parameter
    """
    if columns not in ["first", "a2m"]:
        raise ValueError("Invalid column selection: {}".format(columns))

    verify_resources("Alignment file does not exist or is empty", input_file)

    create_prefix_folders(output_file)

    cmd = [
        binary, "-i", input_file, "-o", output_file, "-id",
        str(threshold), "-M", columns, "-v",
        str(2)
    ]

    return_code, stdout, stderr = run(cmd)

    verify_resources(
        "hhfilter returned empty alignment: "
        "stdout={} stderr={} file={}".format(stdout, stderr, output_file),
        output_file)

    return output_file
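A minimal usage sketch for run_hhfilter(); the file names are illustrative and the hhfilter binary is assumed to be on PATH:

# hypothetical call
filtered = run_hhfilter(
    "alignment.a2m",            # input alignment (A2M/FASTA)
    "alignment_filtered.a3m",   # output, written in A3M format
    threshold=90,               # reduce to at most 90% pairwise identity
    columns="a2m",              # match columns defined by upper-case letters
)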
Example 3
def fetch_sequence(sequence_id, sequence_file, sequence_download_url,
                   out_file):
    """
    Fetch sequence either from database based on identifier, or from
    input sequence file.

    Parameters
    ----------
    sequence_id : str
        Identifier of sequence that should be retrieved
    sequence_file : str
        File containing sequence. If None, sequence will
        be downloaded from sequence_download_url
    sequence_download_url : str
        URL from which to download missing sequence. Must
        contain "{}" at the position where sequence ID will
        be inserted into download URL (using str.format).
    out_file : str
        Output file in which sequence will be stored, if
        sequence_file does not exist.

    Returns
    -------
    str
        Path of file with stored sequence (can be sequence_file
        or out_file)
    tuple (str, str)
        Identifier of sequence as stored in file, and sequence
    """
    if sequence_file is None:
        get(sequence_download_url.format(sequence_id),
            out_file,
            allow_redirects=True)
    else:
        # if we have sequence file, try to copy it
        try:
            copy(sequence_file, out_file)
        except FileNotFoundError:
            raise ResourceError(
                "sequence_file does not exist: {}".format(sequence_file))

    # also make sure input file has something in it
    verify_resources("Input sequence missing", out_file)

    with open(out_file) as f:
        seq = next(read_fasta(f))

    return out_file, seq
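A hedged usage sketch for fetch_sequence(); the identifier and URL pattern are illustrative, the only requirement from the code above being that sequence_download_url contains "{}" where the identifier is inserted:

# hypothetical call: no local sequence file, so the sequence is downloaded
out_path, (seq_id, seq) = fetch_sequence(
    "P0A7Y4",                                       # illustrative identifier
    None,                                           # no sequence_file given
    "https://rest.uniprot.org/uniprotkb/{}.fasta",  # "{}" filled via str.format
    "P0A7Y4.fasta",                                 # out_file for the download
)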
Example 4
def run(**kwargs):
    """
    EVcouplings pipeline execution from a
    configuration file (single thread, no
    batch or environment configuration)
    
    Parameters
    ----------
    kwargs
        See click.option decorators for app()
    """
    config_file = kwargs["config"]
    verify_resources("Config file does not exist or is empty.", config_file)

    # read configuration and execute
    config = read_config_file(config_file)

    # execute configuration in "wrapped" mode
    # that handles exceptions and internal interrupts
    return execute_wrapped(**config)
Example 5
def _cns_render_template(template_name, mapping):
    """
    Render an included CNS template .inp

    Parameters
    ----------
    template_name : str
        Name of CNS template (e.g. dg_sa)
    mapping : dict
        Values to be substituted into template

    Returns
    -------
    str
        Rendered template
    """
    # get path of template within package
    template_file = resource_filename(
        __name__, "cns_templates/{}.inp".format(template_name))

    verify_resources("CNS template does not exist: {}".format(template_file),
                     template_file)

    return render_template(template_file, mapping)
Example 6
def execute(**config):
    """
    Execute a pipeline configuration

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for
        examples of what this should look like)

    Returns
    -------
    global_state : dict
        Global output state of pipeline
    """
    check_required(config, ["pipeline", "stages", "global"])

    # check if valid pipeline was selected
    if config["pipeline"] not in PIPELINES:
        raise InvalidParameterError("Not a valid pipeline selection. "
                                    "Valid choices are:\n{}".format(", ".join(
                                        PIPELINES.keys())))

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # get definition of selected pipeline
    pipeline = PIPELINES[config["pipeline"]]
    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this is the global state of results as
    # we move through different stages of
    # the pipeline
    global_state = config["global"]

    # keep track of how many stages are still
    # to be run, so we can leave out stages at
    # the end of workflow below
    num_stages_to_run = len(stages)

    # get job tracker
    tracker = get_result_tracker(config)

    # set job status to running and also initialize global state
    tracker.update(status=EStatus.RUN, results=global_state)

    # iterate through individual stages
    for (stage, runner, key_prefix) in pipeline:
        # check if anything else is left to
        # run, otherwise skip
        if num_stages_to_run == 0:
            break

        # check if config for stage is there
        check_required(config, [stage])

        # output files for stage into an individual folder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # config files for input and output of stage
        stage_incfg = "{}_{}.incfg".format(stage_prefix, stage)
        stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage)

        # update current stage of job
        tracker.update(stage=stage)

        # check if stage should be executed
        if stage in stages:
            # global state inserted at end, overrides any
            # stage-specific settings (except for custom prefix)
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state, "prefix": stage_prefix
            }
            # save input of stage in config file
            write_config_file(stage_incfg, incfg)

            # run stage
            outcfg = runner(**incfg)

            # prefix output keys if this parameter is
            # given in stage configuration, to avoid
            # name clashes if same protocol run multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            # save output of stage in config file
            write_config_file(stage_outcfg, outcfg)

            # one less stage left to run after this one
            num_stages_to_run -= 1
        else:
            # skip stage by injecting output state from previous run
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage), stage_outcfg)

            # read output configuration
            outcfg = read_config_file(stage_outcfg)

            # verify all the output files are there
            outfiles = [
                filepath for f, filepath in outcfg.items()
                if f.endswith("_file") and filepath is not None
            ]

            verify_resources(
                "Output files from stage '{}' "
                "missing".format(stage), *outfiles)

        # update global state with outputs of stage
        global_state = {**global_state, **outcfg}

        # update state in tracker accordingly
        tracker.update(results=outcfg)

    # create results archive
    archive_file = create_archive(config, global_state, prefix)

    # only store results archive if a result file was created
    if archive_file is not None:
        global_state["archive_file"] = archive_file

        # prepare update for tracker, but only store in last
        # go when job is set to done
        tracker_archive_update = {"archive_file": archive_file}
    else:
        tracker_archive_update = None

    # set job status to done and transfer archive if selected for syncing
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested;
    # tracker does not need to update here since it won't
    # sync entries of delete list in the first place
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
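A minimal sketch of the configuration structure that execute() inspects; the pipeline and stage names are illustrative, and real configuration files carry additional sections (for example management/tracking settings) plus many per-stage options:

config = {
    "pipeline": "protein_monomer",            # must be a key of PIPELINES
    "stages": ["align"],                      # stages that should actually be run
    "global": {"prefix": "output/example"},   # all stage prefixes derive from this
    "tools": {},                              # paths to external binaries
    "databases": {},                          # paths to sequence databases
    "align": {},                              # parameters for the "align" stage
}
global_state = execute(**config)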
Example 7
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
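A hedged sketch of the mandatory keyword arguments checked at the top of mean_field(); all values are illustrative, not recommendations:

outcfg = mean_field(
    prefix="output/couplings/example",
    alignment_file="example.a2m",     # focus-mode input alignment
    segments=None,
    focus_mode=True,                  # currently required to be True
    focus_sequence="EXAMPLE/1-100",   # hypothetical focus sequence header
    theta=0.8,                        # clustering identity cutoff
    pseudo_count=0.5,                 # illustrative pseudo-count
    alphabet=None,                    # None selects the protein alphabet
    min_sequence_distance=6,
)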
Example 8
def best_hit(**kwargs):
    """
    Protocol:

    Concatenate alignments based on the best hit 
    to the focus sequence in each species

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file
    """
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_segments", "second_segments",
            "first_identities_file", "second_identities_file",
            "first_annotation_file", "second_annotation_file",
            "use_best_reciprocal", "paralog_identity_threshold"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    def _load_monomer_info(annotations_file, identities_file,
                           target_sequence, alignment_file,
                           use_best_reciprocal, identity_threshold):

        # read in the species annotation table and rename the appropriate column
        annotation_table = read_species_annotation_table(annotations_file)

        # read identity file
        similarities = pd.read_csv(identities_file)

        # create a pd.DataFrame containing the best hit in each organism
        most_similar_in_species = most_similar_by_organism(similarities, annotation_table)

        if use_best_reciprocal:
            paralogs = find_paralogs(
                target_sequence, annotation_table, similarities,
                identity_threshold
            )

            most_similar_in_species = filter_best_reciprocal(
                alignment_file, paralogs, most_similar_in_species
            )

        return most_similar_in_species

    # load the information about each monomer alignment
    most_similar_in_species_1 = _load_monomer_info(
        kwargs["first_annotation_file"],
        kwargs["first_identities_file"],
        kwargs["first_focus_sequence"],
        kwargs["first_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    most_similar_in_species_2 = _load_monomer_info(
        kwargs["second_annotation_file"],
        kwargs["second_identities_file"],
        kwargs["second_focus_sequence"],
        kwargs["second_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    # merge the two dataframes to get all species found in 
    # both alignments
    species_intersection = most_similar_in_species_1.merge(
        most_similar_in_species_2,
        how="inner",  # takes the intersection
        on="species",  # merges on species identifiers
        suffixes=("_1", "_2")
    )

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            species_intersection,
            kwargs["first_alignment_file"],
            kwargs["second_alignment_file"],
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    return outcfg
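A hedged sketch of the keyword arguments checked by best_hit() above; all file names are illustrative, and note that additional keys consumed further down (for example first_region_start, used when calling modify_alignment) are needed in practice:

outcfg = best_hit(
    prefix="output/concatenate/example",
    first_alignment_file="monomer_1.a2m",
    second_alignment_file="monomer_2.a2m",
    first_focus_sequence="FIRST/1-100",
    second_focus_sequence="SECOND/1-120",
    first_focus_mode=True,
    second_focus_mode=True,
    first_segments=None,
    second_segments=None,
    first_identities_file="monomer_1_identities.csv",
    second_identities_file="monomer_2_identities.csv",
    first_annotation_file="monomer_1_annotation.csv",
    second_annotation_file="monomer_2_annotation.csv",
    use_best_reciprocal=True,
    paralog_identity_threshold=0.95,
)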
Example 9
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance", # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it
        # as a plmc parameter, since using the default
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # load parameters
            c = CouplingsModel(outcfg["model_file"])

            # create JSON output and write to file
            f.write(
                evzoom_json(c) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
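The lambda_J_times_Lq scaling in the protocol above can be illustrated with a small worked sketch; all numbers are hypothetical, and the 21-character count assumes the default protein alphabet including the gap character:

num_symbols = 21                          # assumed: gap + 20 amino acids (default protein alphabet)
num_symbols -= 1                          # ignore_gaps=True drops the gap character
L = 150                                   # hypothetical number of match columns in the target
lambda_J = 0.01                           # hypothetical unscaled coupling regularization strength
lambda_J *= (num_symbols - 1) * (L - 1)   # same scaling as in the protocol above
print(lambda_J)                           # approximately 28.31 (0.01 * 19 * 149)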
Example 10
def complex(**kwargs):
    """
    Protocol:

    Run monomer alignment protocol and postprocess it for
    EVcomplex calculations

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the alignment protocol, and
        the following additional field:

        genome_location_file : path to file containing
            the genomic locations for CDSs corresponding to
            identifiers in the alignment.

    """
    check_required(kwargs, [
        "prefix", "alignment_protocol", "uniprot_to_embl_table",
        "ena_genome_location_table"
    ])

    verify_resources("Uniprot to EMBL mapping table does not exist",
                     kwargs["uniprot_to_embl_table"])

    verify_resources("ENA genome location table does not exist",
                     kwargs["ena_genome_location_table"])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # run the regular alignment protocol
    # (standard, existing, ...)
    alignment_protocol = kwargs["alignment_protocol"]

    if alignment_protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid choice for alignment protocol: {}".format(
                alignment_protocol))

    outcfg = PROTOCOLS[kwargs["alignment_protocol"]](**kwargs)

    # if the user selected the existing alignment protocol
    # they can supply an input annotation file
    # which overwrites the annotation file generated by the existing protocol
    if alignment_protocol == "existing":
        check_required(kwargs, ["override_annotation_file"])

        if kwargs["override_annotation_file"] is not None:
            verify_resources("Override annotation file does not exist",
                             kwargs["override_annotation_file"])

            outcfg["annotation_file"] = prefix + "_annotation.csv"
            annotation_data = pd.read_csv(kwargs["override_annotation_file"])
            annotation_data.to_csv(outcfg["annotation_file"])

    # extract cds identifiers for alignment uniprot IDs
    cds_ids = extract_cds_ids(outcfg["alignment_file"],
                              kwargs["uniprot_to_embl_table"])

    # extract genome location information from ENA
    genome_location_filename = prefix + "_genome_location.csv"

    genome_location_table = extract_embl_annotation(
        cds_ids, kwargs["ena_genome_location_table"], genome_location_filename)

    genome_location_table = add_full_header(genome_location_table,
                                            outcfg["alignment_file"])

    genome_location_table.to_csv(genome_location_filename)
    outcfg["genome_location_file"] = genome_location_filename

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_complex.outcfg", outcfg)

    return outcfg
Example 11
def run_jackhmmer(query,
                  database,
                  prefix,
                  use_bitscores,
                  domain_threshold,
                  seq_threshold,
                  iterations=5,
                  nobias=False,
                  cpu=None,
                  stdout_redirect=None,
                  checkpoints_hmm=False,
                  checkpoints_ali=False,
                  binary="jackhmmer"):
    """
    Run jackhmmer sequence search against target database.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    iterations : int, optional (default: 5)
        Number of jackhmmer search iterations
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    checkpoints_hmm : bool, optional (default: False)
        Store checkpoint HMMs to prefix.<iter>.hmm
    checkpoints_ali : bool, optional (default: False)
        Store checkpoint alignments to prefix.<iter>.sto
    binary : str (default: "jackhmmer")
        Path to jackhmmer binary (put in PATH for
        default to work)

    Returns
    -------
    JackhmmerResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", query, database)

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = JackhmmerResult(
        prefix, prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout", prefix + ".domtblout")

    cmd = [
        binary, "-N",
        str(iterations), "-o", result.output, "-A", result.alignment,
        "--tblout", result.tblout, "--domtblout", result.domtblout, "--noali",
        "--notextw"
    ]

    # reporting thresholds are set according to the
    # inclusion thresholds to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T",
            str(seq_threshold), "--domT",
            str(domain_threshold), "--incT",
            str(seq_threshold), "--incdomT",
            str(domain_threshold)
        ]
    else:
        cmd += [
            "-E",
            str(seq_threshold), "--domE",
            str(domain_threshold), "--incE",
            str(seq_threshold), "--incdomE",
            str(domain_threshold)
        ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # save checkpoints for alignments and HMMs?
    if checkpoints_ali:
        cmd += ["--chkali", prefix]
    if checkpoints_hmm:
        cmd += ["--chkhmm", prefix]

    cmd += [query, database]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created some sort of alignment
    verify_resources(
        "jackhmmer returned empty alignment: "
        "stdout={} stderr={} file={}".format(stdout, stderr, result.alignment),
        result.alignment)

    return result
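A minimal usage sketch for run_jackhmmer(); paths and thresholds are illustrative, and the jackhmmer binary is assumed to be on PATH:

result = run_jackhmmer(
    query="query.fasta",
    database="uniref100.fasta",    # hypothetical sequence database
    prefix="output/align/example",
    use_bitscores=True,            # interpret thresholds as bit scores
    domain_threshold=50,
    seq_threshold=50,
    iterations=5,
    cpu=4,
)
# result.alignment points to the Stockholm alignment (prefix + ".sto")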
Example 12
def run_plmc(alignment, couplings_file, param_file=None,
             focus_seq=None, alphabet=None, theta=None,
             scale=None, ignore_gaps=False, iterations=None,
             lambda_h=None, lambda_J=None, lambda_g=None,
             cpu=None, binary="plmc"):
    """
    Run plmc on sequence alignment and store
    files with model parameters and pair couplings.

    Parameters
    ----------
    alignment : str
        Path to input sequence alignment
    couplings_file : str
        Output path for file with evolutionary couplings
        (folder will be created)
    param_file : str
        Output path for binary file containing model
        parameters (folder will be created)
    focus_seq : str, optional (default: None)
        Name of focus sequence, if None, non-focus mode
        will be used
    alphabet : str, optional (default: None)
        Alphabet for model inference. If None, standard
        amino acid alphabet including gap will be used.
        First character in string corresponds to gap
        character (relevant for ignore_gaps).
    theta : float, optional (default: None)
        Sequences with pairwise identity >= theta
        will be clustered and their sequence weights
        downweighted as 1 / num_cluster_members.
        Important: Note that plmc will be parametrized using
        1 - theta. If None, default value in plmc will be used,
        which corresponds to theta=0.8 (plmc setting 0.2).
    scale : float, optional (default: None)
        Scale weights of clusters by this value.
        If None, default value in plmc (1.0) will be used
    ignore_gaps : bool, optional (default: False)
        Exclude gaps from parameter inference. Gap
        character is first character of alphabet
        parameter.
    iterations : int, optional (default: None)
        Maximum iterations for optimization.
    lambda_h : float, optional (default: None)
        l2 regularization strength on fields.
        If None, plmc default will be used.
    lambda_J : float, optional (default: None)
        l2-regularization strength on couplings.
        If None, plmc default will be used
    lambda_g : float, optional (default: None)
        Group l1-regularization strength on couplings.
        If None, plmc default will be used.
    cpu : int or str, optional (default: None)
        Number of cores to use for running plmc.
        Note that plmc has to be compiled in OpenMP
        mode to be runnable with multiple cores.
        Can also be set to "max".
    binary : str, optional (default: "plmc")
        Path to plmc binary

    Returns
    -------
    PlmcResult
        namedtuple containing output files and
        parsed fields from console output of plmc

    Raises
    ------
    ExternalToolError
    """
    create_prefix_folders(couplings_file)

    # Make sure input alignment exists
    verify_resources(
        "Alignment file does not exist", alignment
    )

    cmd = [
        binary,
        "-c", couplings_file,
    ]

    # store eij file if explicitly requested
    if param_file is not None:
        create_prefix_folders(param_file)
        cmd += ["-o", param_file]

    # focus sequence mode and ID
    if focus_seq is not None:
        # TODO: for now split exclude sequence
        # region from focus seq name, otherwise
        # plmc does not remap names. If this
        # behaviour changes in plmc, remove the
        # following line.
        focus_seq = focus_seq.split("/")[0]
        cmd += ["-f", focus_seq]

    # exclude gaps from calculation?
    if ignore_gaps:
        cmd += ["-g"]

    # maximum number of iterations, can also be "max"
    if iterations is not None:
        cmd += ["-m", str(iterations)]

    # set custom alphabet
    # (first character is gap by default in nogap mode)
    if alphabet is not None:
        cmd += ["-a", alphabet]

    # sequence reweighting
    if theta is not None:
        # transform into plmc convention (1-theta)
        theta = 1.0 - theta
        cmd += ["-t", str(theta)]

    # cluster weight
    if scale is not None:
        cmd += ["-s", str(scale)]

    # L2 regularization weight for fields
    if lambda_h is not None:
        cmd += ["-lh", str(lambda_h)]

    # L2 regularization weight for pair couplings
    if lambda_J is not None:
        cmd += ["-le", str(lambda_J)]

    # Group L1 regularization weight for pair couplings
    if lambda_g is not None:
        cmd += ["-lg", str(lambda_g)]

    # Number of cores to use for calculation
    if cpu is not None:
        cmd += ["-n", str(cpu)]

    # finally also add input alignment (main parameter)
    cmd += [alignment]

    # TODO: for now do not check returncode because sometimes
    # returncode == -11 (segfault) despite successful calculation
    return_code, stdout, stderr = run(cmd, check_returncode=False)

    # TODO: remove this segfault-hunting output once fixed
    if return_code != 0:
        # if not a segfault, still raise exception
        if return_code != -11:
            from evcouplings.utils.system import ExternalToolError
            raise ExternalToolError(
                "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                    cmd, return_code, stdout, stderr
                )
            )

        print("PLMC NON-ZERO RETURNCODE:", return_code)
        print(cmd)
        print(" ".join(cmd))
        print("stdout:", stdout)
        print("stderr:", stderr)

    iter_df, out_fields = parse_plmc_log(stderr)

    # also check we actually calculated couplings...
    if not valid_file(couplings_file):
        raise ResourceError(
            "plmc returned no couplings: stdout={} stderr={} file={}".format(
                stdout, stderr, couplings_file
            )
        )

    # ... and parameter file, if requested
    if param_file and not valid_file(param_file):
        raise ResourceError(
            "plmc returned no parameter file: stdout={} stderr={} file={}".format(
                stdout, stderr, param_file
            )
        )

    return PlmcResult(
        couplings_file, param_file,
        iter_df, *out_fields
    )
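A hedged usage sketch for run_plmc(); paths and regularization values are illustrative, and note that theta here is the identity cutoff, which the function converts to plmc's 1 - theta convention:

result = run_plmc(
    "alignment.a2m",              # input alignment
    "example_ECs.txt",            # raw coupling scores output
    param_file="example.model",   # optional binary model parameters
    focus_seq="EXAMPLE/1-100",    # hypothetical focus sequence header
    theta=0.8,                    # converted to plmc's convention (1 - theta)
    lambda_h=0.01,                # illustrative field regularization
    lambda_J=16.0,                # illustrative coupling regularization
    cpu=2,
)
# result bundles the output file paths and the statistics parsed from plmc's log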
Example 13
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end
Example 14
def secondary_structure(**kwargs):
    """
    Predict or load secondary structure for an
    input sequence

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    residues : pandas.DataFrame
        Table with sequence and secondary structure
        in columns i, A_i and sec_struct_3state
    """
    check_required(
        kwargs,
        [
            "prefix", "target_sequence_file",
            "segments", "sec_struct_method",
            "sec_struct_file", "psipred",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    secstruct_file = kwargs["sec_struct_file"]
    if secstruct_file is not None:
        verify_resources(
            "Secondary structure prediction file does not exist/is empty",
            secstruct_file
        )
        residues = pd.read_csv(secstruct_file)
    else:
        # make sure target sequence file is there so we can
        # predict secondary structure
        target_seq_file = kwargs["target_sequence_file"]
        verify_resources(
            "Sequence file does not exist/is empty", target_seq_file
        )

        # we need to figure out what the index of the first residue
        # in the target sequence is; obtain first index from segment
        # information if possible
        if kwargs["segments"] is not None:
            s = Segment.from_list(kwargs["segments"][0])
            first_index = s.region_start
        else:
            # otherwise try to get it from sequence file
            first_index = None

            with open(target_seq_file) as f:
                header, _ = next(read_fasta(f))
                if header is not None:
                    _, first_index, _ = parse_header(header)

                # if we cannot identify first index from header,
                # do not make guesses but fail
                if first_index is None:
                    raise InvalidParameterError(
                        "Could not unambiguously identify sequence range from "
                        "FASTA header, needs to specified as id/start-end: {}".format(
                            header
                        )
                    )

        # finally, run secondary structure prediction
        if kwargs["sec_struct_method"] == "psipred":
            # store psipred output in a separate directory
            output_dir = path.join(path.dirname(prefix), "psipred")

            # run psipred
            ss2_file, horiz_file = run_psipred(
                target_seq_file, output_dir, binary=kwargs["psipred"]
            )

            # parse output, renumber to first index
            residues = read_psipred_prediction(
                horiz_file, first_index=first_index
            )
        else:
            raise InvalidParameterError(
                "Secondary structure prediction method not implemented: "
                "{}. Valid choices: psipred".format(kwargs["sec_struct_method"])
            )

    # return predicted table
    return residues
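A hedged sketch of the keyword arguments secondary_structure() checks; file names and the psipred path are hypothetical. With sec_struct_file set, the existing prediction is simply loaded; with sec_struct_file=None, psipred is run on the target sequence:

residues = secondary_structure(
    prefix="output/fold/example",
    target_sequence_file="example.fa",
    segments=None,                        # first index then comes from the FASTA header
    sec_struct_method="psipred",          # the only method implemented above
    sec_struct_file=None,                 # set to an existing CSV to skip prediction
    psipred="/opt/psipred/runpsipred",    # hypothetical path to the psipred executable
)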
Example 15
def genome_distance(**kwargs):
    """
    Protocol:

    Concatenate alignments based on genomic distance

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

        .. todo::

            Explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        .. todo::

            this is the full list normally returned by alignment protocol, decide which ones to keep.
            Mandatory:

            * alignment_file
            * focus_sequence
            * focus_mode
            * segments

        * alignment_file
        * [raw_alignment_file]
        * statistics_file
        * target_sequence_file
        * sequence_file
        * [annotation_file]
        * frequencies_file
        * identities_file
        * [hittable_file]
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix",
        "first_raw_focus_alignment_file",
        "second_raw_focus_alignment_file",
        "first_focus_sequence",
        "second_focus_sequence",
        "first_focus_mode",
        "second_focus_mode",
        "first_segments",
        "second_segments",
    ])

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources("Input alignment does not exist",
                     kwargs["first_alignment_file"],
                     kwargs["second_alignment_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # -------------------------------------------------
    # TODO: implement concatenation functionality and
    # postprocessing functionality here
    # -------------------------------------------------

    def _modify_segments(seg_list, seg_prefix):
        # extract segments from list representation into objects
        segs = [Segment.from_list(s) for s in seg_list]
        # update segment IDs
        for i, s in enumerate(segs, start=1):
            s.segment_id = "{}_{}".format(seg_prefix, i)

        return segs

    # merge segments - this allows having more than one segment per
    # "monomer" alignment
    segments_1 = _modify_segments(kwargs["first_segments"], "A")
    segments_2 = _modify_segments(kwargs["second_segments"], "B")
    segments_complex = segments_1 + segments_2

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f

    outcfg = {
        "alignment_file": None,  # TODO: specify
        "focus_mode": True,
        "focus_sequence": None,  # TODO: specify
        "segments": [s.to_list() for s in segments_complex],
        # optional but good to have:
        "num_sites": None,
        "num_sequences": None,
        # "effective_sequences": n_eff # TODO: could compute this like in align stage
        # TODO: there are more outputs that we could add here (not mandatory),
        # e.g. single column frequencies in concatenated alignment
    }

    return outcfg
Example 16
def standard(**kwargs):
    """
    Protocol:
    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query("abs(i - j) > {}".format(
        kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get from EC values (could be misleading if
            # EC list is truncated, so only second option)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs,
                     "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of EC counts to use for folding runs
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory; after folding,
    # final models will be moved to the main folding dir. Depending
    # on the cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map the individual comparison file back to the original key from remapped_pdb_files
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
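
The enumeration of folding sub-runs above can be exercised in isolation. The following sketch uses a small, made-up EC table (the columns i, j and probability are assumptions for illustration) and mirrors the probability-cutoff and count-based binning logic of the protocol.

import pandas as pd
from math import ceil

# toy EC table (hypothetical values)
ecs_fold = pd.DataFrame({
    "i": [1, 2, 3, 4, 5],
    "j": [10, 20, 30, 40, 50],
    "probability": [0.99, 0.95, 0.80, 0.60, 0.40],
})

folding_runs = []

# sub-runs based on mixture model probability cutoffs
for c in [0.90, 0.95]:
    sig_ecs = ecs_fold.query("probability >= @c")
    if len(sig_ecs) > 0:
        folding_runs.append((sig_ecs, "_significant_ECs_{}".format(c)))

# sub-runs based on EC counts; fractional settings are interpreted
# relative to the number of sites covered by the ECs
num_sites = len(set(ecs_fold.i).union(ecs_fold.j))
lowest, highest, step = ceil(0.2 * num_sites), ceil(0.5 * num_sites), 1
folding_runs += [
    (ecs_fold.iloc[:c], "_{}".format(c))
    for c in range(lowest, highest + 1, step)
]

print([suffix for _, suffix in folding_runs])
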
Esempio n. 17
0
def standard(**kwargs):
    """
    Protocol:
    Compare ECs for single proteins (or domains)
    to 3D structure information

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * mutation_matrix_plot_files
        * mutations_epistatic_pml_files
        * [mutation_dataset_predicted_file]
    """
    check_required(kwargs, [
        "prefix",
        "model_file",
        "mutation_dataset_file",
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources("Model parameter file does not exist",
                     kwargs["model_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load couplings object, and create independent model
    c = CouplingsModel(kwargs["model_file"])
    c0 = c.to_independent_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower())
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(c, output_column="prediction_epistatic")

    singles = predict_mutation_table(c0, singles, "prediction_independent")

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename, effect_column="prediction_" + model)
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#")

        # add epistatic model prediction
        data_pred = predict_mutation_table(c, data, "prediction_epistatic")

        # add independent model prediction
        data_pred = predict_mutation_table(c0, data_pred,
                                           "prediction_independent")

        outcfg[
            "mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv"
        data_pred.to_csv(outcfg["mutation_dataset_predicted_file"],
                         index=False)

    return outcfg
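
As a usage illustration (not part of the pipeline code), the protocol above can be driven directly with keyword arguments; the paths below are hypothetical and the model file is assumed to exist.

# hypothetical paths; the plmc model file must exist beforehand
results = standard(
    prefix="output/example",
    model_file="output/example.model",
    mutation_dataset_file=None,   # or a CSV file with experimental mutations
)
print(results["mutation_matrix_file"])
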
Esempio n. 18
0
def complex(**kwargs):
    """
    Protocol:
    Mutation effect prediction and visualization for protein complexes

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * mutation_matrix_plot_files
        * mutations_epistatic_pml_files
        * [mutation_dataset_predicted_file]
    """
    check_required(
        kwargs, ["prefix", "model_file", "mutation_dataset_file", "segments"])

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources("Model parameter file does not exist",
                     kwargs["model_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load segments to create couplings object
    segment_objects = []
    for segment_list in kwargs["segments"]:
        segment_objects.append(Segment.from_list(segment_list))

    first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id
    second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id

    first_chain_name = Segment.from_list(
        kwargs["segments"][0]).default_chain_name()
    second_chain_name = Segment.from_list(
        kwargs["segments"][1]).default_chain_name()

    # load couplings object
    c = MultiSegmentCouplingsModel(kwargs["model_file"], *segment_objects)

    # create the independent model
    c0 = c.to_independent_model()

    # create the inter-protein only Jij model
    ci = c.to_inter_segment_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent"),
                         (ci, "Inter_segment")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower())
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(c, output_column="prediction_epistatic")

    singles = predict_mutation_table(c0, singles, "prediction_independent")

    singles = predict_mutation_table(ci, singles, "prediction_inter_segment")

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent", "inter_segment"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles,
            pml_filename,
            effect_column="prediction_" + model,
            segment_to_chain_mapping={
                first_segment_name: first_chain_name,
                second_segment_name: second_chain_name
            })
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#", sep=",")

        if "segment" not in data.columns:
            raise ValueError("Input mutation dataset file does not contain "
                             "a column called 'segment' to specify the "
                             "protein of origin for each mutation")

        # add epistatic model prediction
        data_pred = predict_mutation_table(c, data, "prediction_epistatic")

        # add independent model prediction
        data_pred = predict_mutation_table(c0, data_pred,
                                           "prediction_independent")

        # add inter-segment model prediction
        data_pred = predict_mutation_table(ci, data_pred,
                                           "prediction_inter_segment")

        outcfg[
            "mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv"
        data_pred.to_csv(outcfg["mutation_dataset_predicted_file"],
                         index=False)

    return outcfg
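
A hedged usage sketch for the complex mutation protocol above; the paths and segment definitions are hypothetical, and the segments are given in the list form produced by Segment(...).to_list(), as in the alignment protocols.

# hypothetical inputs, for illustration only
seg_a = Segment("aa", "PROT_A", 1, 100, range(1, 101)).to_list()
seg_b = Segment("aa", "PROT_B", 1, 80, range(1, 81)).to_list()

results = complex(
    prefix="output/complex_example",
    model_file="output/complex_example.model",   # assumed to exist
    mutation_dataset_file=None,
    segments=[seg_a, seg_b],
)
print(results["mutation_matrix_file"])
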
Esempio n. 19
0
def infer_plmc(**kwargs):
    """
    Run EC computation on alignment. This function contains
    the functionality shared between monomer and complex EC
    inference.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.
    
    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)

    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # the following are passed through stage...
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it as a
        # plmc parameter, since the default parameter has some
        # implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from a previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but the required file(s) "
            "are empty or do not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_valid_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    return outcfg, ecs, segments
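
The lambda_J scaling used above can be reproduced as a standalone calculation. The sketch below uses a toy first sequence and alphabet (both assumptions) and counts match columns the same way as the protocol.

# toy inputs for illustration
seq = "MK-aiLVmGT"                  # first alignment sequence (a2m style)
alphabet = "-ACDEFGHIKLMNPQRSTVWY"  # gap character first, by convention
ignore_gaps = False
lambda_J = 0.01                     # unscaled regularization strength

num_symbols = len(alphabet)
if ignore_gaps:
    num_symbols -= 1

# match columns are uppercase characters or the gap character
gap = alphabet[0]
L = sum(1 for c in seq if c == c.upper() or c == gap)

# scale couplings regularization by (q - 1) * (L - 1)
lambda_J *= (num_symbols - 1) * (L - 1)
print(L, lambda_J)
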
Esempio n. 20
0
def jackhmmer_search(**kwargs):
    """
    Protocol:

    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "sequence_id", "sequence_file", "sequence_download_url",
        "region", "first_index", "use_bitscores", "domain_threshold",
        "sequence_threshold", "database", "iterations", "cpu", "nobias",
        "reuse_alignment", "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
        "extract_annotation"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"], kwargs["sequence_file"],
        kwargs["sequence_download_url"], full_sequence_file)

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(full_seq,
                                                       kwargs["sequence_id"],
                                                       kwargs["region"],
                                                       kwargs["first_index"],
                                                       target_sequence_file)

    # run jackhmmer... allow reuse of a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but the required files "
            "are empty or do not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], len(cut_seq))

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use /dev/null option of jackhmmer function
        # to make no assumption about operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
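
For orientation, a configuration sketch for this protocol is shown below; all paths, thresholds and database names are hypothetical. Note that the database option names another key in the configuration that holds the actual database path.

config = dict(
    prefix="output/align/example",
    sequence_id="P12345",
    sequence_file=None,          # fetched via sequence_download_url if None
    sequence_download_url="https://www.uniprot.org/uniprot/{}.fasta",
    region=None,
    first_index=1,
    use_bitscores=True,
    domain_threshold=0.5,        # interpreted by search_thresholds
    sequence_threshold=0.5,
    database="uniref100",        # looked up as config["uniref100"]
    uniref100="/databases/uniref100.fasta",
    iterations=5,
    cpu=2,
    nobias=False,
    reuse_alignment=True,
    checkpoints_hmm=False,
    checkpoints_ali=False,
    jackhmmer="jackhmmer",
    extract_annotation=True,
)
outcfg = jackhmmer_search(**config)
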
Esempio n. 21
0
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and 
    search against a sequence database using hmmsearch.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the symbol fraction threshold (symfrac) used by hmmbuild
    # to decide which alignment columns are included in the HMM
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(kwargs, [
        "prefix", "sequence_id", "alignment_file", "use_bitscores",
        "domain_threshold", "sequence_threshold", "database", "cpu", "nobias",
        "reuse_alignment", "hmmbuild", "hmmsearch"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow reuse of a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but the required files "
            "are empty or do not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], sequence_length)

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())
        # additionally store the hmmfile from the hmmbuild result
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
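
The region handling in _format_alignment_for_hmmbuild (a header of the form ID/start-end, with an optional first_index override) can be illustrated with a small standalone parser; the regular expression below is an assumption for illustration, not the package's parse_header implementation.

import re

def _parse_region(header, seq_length, first_index=None):
    # try to extract "ID/start-end" style region information
    m = re.match(r"^(\S+?)/(\d+)-(\d+)$", header)
    if m:
        id_, start, end = m.group(1), int(m.group(2)), int(m.group(3))
    else:
        id_, start, end = header, None, None

    # first_index, if given, overrides the header-derived region
    if first_index is not None:
        start = first_index
        end = start + seq_length - 1

    return id_, start, end

print(_parse_region("P12345/20-150", seq_length=131))
print(_parse_region("P12345", seq_length=131, first_index=5))
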
Esempio n. 22
0
def complex(**kwargs):
    """
    Protocol:
    Compare ECs for a complex to
    3D structure

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * ec_file_compared_all
        * ec_file_compared_all_longrange
        * pdb_structure_hits
        * distmap_monomer
        * distmap_multimer
        * contact_map_files
        * remapped_pdb_files
    """
    check_required(kwargs, [
        "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir",
        "atom_filter", "first_compare_multimer", "second_compare_multimer",
        "distance_cutoff", "first_sequence_id", "second_sequence_id",
        "first_sequence_file", "second_sequence_file", "first_segments",
        "second_segments", "first_target_sequence_file",
        "second_target_sequence_file", "scale_sizes"
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        # initialize output EC files
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file":
        prefix + "_CouplingScoresCompared_longrange.csv",
        "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv",

        # initialize output inter distancemap files
        "distmap_inter": prefix + "_distmap_inter",
        "inter_contacts_file": prefix + "_inter_contacts_file"
    }

    # Add PDB comparison files for first and second monomer
    for monomer_prefix in ["first", "second"]:
        outcfg = {
            **outcfg,
            monomer_prefix + "_pdb_structure_hits_file":
            "{}_{}_structure_hits.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_pdb_structure_hits_unfiltered_file":
            "{}_{}_structure_hits_unfitered.csv".format(
                prefix, monomer_prefix),
            monomer_prefix + "_distmap_monomer":
            "{}_{}_distance_map_monomer".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_multimer":
            "{}_{}_distance_map_multimer".format(prefix, monomer_prefix),
        }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    # store auxiliary files for the first monomer here
    first_aux_prefix = insert_dir(aux_prefix,
                                  "first_monomer",
                                  rootname_subdir=False)
    create_prefix_folders(first_aux_prefix)

    # store auxiliary files for the second monomer here
    second_aux_prefix = insert_dir(aux_prefix,
                                   "second_monomer",
                                   rootname_subdir=False)
    create_prefix_folders(second_aux_prefix)

    # Step 1: Identify 3D structures for comparison
    def _identify_monomer_structures(name_prefix, outcfg, aux_prefix):
        # create a dictionary with kwargs for just the current monomer
        # remove the "prefix" kwargs so that we can replace with the
        # aux prefix when calling _identify_structures
        # only replace first occurrence of name_prefix
        monomer_kwargs = {
            k.replace(name_prefix + "_", "", 1): v
            for k, v in kwargs.items() if "prefix" not in k
        }

        # these fields need to be set explicitly, otherwise they
        # are overwritten by the concatenated alignment file
        monomer_kwargs["alignment_file"] = kwargs[name_prefix +
                                                  "_alignment_file"]
        monomer_kwargs["raw_focus_alignment_file"] = kwargs[
            name_prefix + "_raw_focus_alignment_file"]

        # identify structures for that monomer
        sifts_map, sifts_map_full = _identify_structures(**monomer_kwargs,
                                                         prefix=aux_prefix)

        # save selected PDB hits
        sifts_map.hits.to_csv(outcfg[name_prefix + "_pdb_structure_hits_file"],
                              index=False)

        # also save full list of hits
        sifts_map_full.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"],
            index=False)
        return outcfg, sifts_map

    outcfg, first_sifts_map = _identify_monomer_structures(
        "first", outcfg, first_aux_prefix)
    outcfg, second_sifts_map = _identify_monomer_structures(
        "second", outcfg, second_aux_prefix)

    # get the segment names from the kwargs
    segment_list = kwargs["segments"]

    # Make sure user provided exactly two segments
    if len(segment_list) != 2:
        raise InvalidParameterError(
            "Compare stage for protein complexes requires exactly two segments"
        )

    first_segment_name = kwargs["segments"][0][0]
    second_segment_name = kwargs["segments"][1][0]

    # Step 2: Compute distance maps
    def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):

        # prepare a sequence map to remap the structures we have found
        verify_resources("Target sequence file does not exist",
                         kwargs[name_prefix + "_target_sequence_file"])

        # read the target sequence header and sequence
        with open(kwargs[name_prefix + "_target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        # create target sequence map for remapping structure
        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # compute distance maps and save
        # (but only if we found some structure)
        if len(sifts_map.hits) > 0:
            d_intra = intra_dists(sifts_map,
                                  structures,
                                  atom_filter=kwargs["atom_filter"],
                                  output_prefix=aux_prefix + "_" +
                                  name_prefix + "_distmap_intra")
            d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"])

            # save contacts to separate file
            outcfg[
                name_prefix +
                "_monomer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_monomer.csv"
            d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg[name_prefix + "_monomer_contacts_file"], index=False)

            # compute multimer distances, if requested;
            # note that d_multimer can be None if there
            # are no structures with multiple chains
            if kwargs[name_prefix + "_compare_multimer"]:
                d_multimer = multimer_dists(sifts_map,
                                            structures,
                                            atom_filter=kwargs["atom_filter"],
                                            output_prefix=aux_prefix + "_" +
                                            name_prefix + "_distmap_multimer")
            else:
                d_multimer = None

            # if we have a multimer contact map, save it
            if d_multimer is not None:
                d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"])
                outcfg[
                    name_prefix +
                    "_multimer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_multimer.csv"

                # save contacts to separate file
                d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                    outcfg[name_prefix + "_multimer_contacts_file"],
                    index=False)
            else:
                outcfg[name_prefix + "_distmap_multimer"] = None

            # create remapped structures (e.g. for
            # later comparison of folding results)
            # remap structures, swap mapping index and filename in
            # dictionary so we have a list of files in the dict keys
            outcfg[name_prefix + "_remapped_pdb_files"] = {
                filename: mapping_index
                for mapping_index, filename in remap_chains(
                    sifts_map,
                    aux_prefix,
                    seqmap,
                    chain_name=chain_name,
                    raise_missing=kwargs["raise_missing"]).items()
            }

        else:
            # if no structures, cannot compute distance maps
            d_intra = None
            d_multimer = None
            outcfg[name_prefix + "_distmap_monomer"] = None
            outcfg[name_prefix + "_distmap_multimer"] = None
            outcfg[name_prefix + "remapped_pdb_files"] = None

        return d_intra, d_multimer, seqmap

    # load all structures for both monomers
    all_structures = set(first_sifts_map.hits.pdb_id).union(
        set(second_sifts_map.hits.pdb_id))
    structures = load_structures(all_structures,
                                 kwargs["pdb_mmtf_dir"],
                                 raise_missing=False)

    d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps(
        first_sifts_map, "first", "A")
    d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps(
        second_sifts_map, "second", "B")

    # compute inter distance map if sifts map for each monomer exists
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        d_inter = inter_dists(first_sifts_map,
                              second_sifts_map,
                              raise_missing=kwargs["raise_missing"])
        # if there were overlapping PDBs, save the results
        if d_inter is not None:
            d_inter.to_file(outcfg["distmap_inter"])

            # save contacts to separate file
            d_inter.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["inter_contacts_file"], index=False)

    else:
        outcfg["inter_contacts_file"] = None
        d_inter = None

    # Step 3: Compare ECs to distance maps
    ec_table = pd.read_csv(kwargs["ec_file"])

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:

        # compare ECs only if we have an intra distance map
        # for at least one monomer - inter can't exist unless
        # we have both monomers
        if (d_intra_i is not None) or (d_intra_j is not None):
            # compare distances individually for each segment pair
            ecs_intra_i = ec_table.query(
                "segment_i == segment_j == @first_segment_name")
            if d_intra_i is not None:
                ecs_intra_i_compared = coupling_scores_compared(
                    ecs_intra_i,
                    d_intra_i,
                    d_multimer_i,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                # If no distance map, the distance is saved as np.nan
                ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan)

            ecs_intra_j = ec_table.query(
                "segment_i == segment_j == @second_segment_name")
            if d_intra_j is not None:
                ecs_intra_j_compared = coupling_scores_compared(
                    ecs_intra_j,
                    d_intra_j,
                    d_multimer_j,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan)

            ecs_inter = ec_table.query("segment_i != segment_j")
            if d_inter is not None:
                ecs_inter_compared = coupling_scores_compared(
                    ecs_inter,
                    d_inter,
                    dist_map_multimer=None,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    # does not apply for inter-protein ECs
                    min_sequence_dist=None
                )
            else:
                ecs_inter_compared = ecs_inter.assign(dist=np.nan)

            # combine the tables
            ec_table_compared = pd.concat([
                ecs_inter_compared, ecs_intra_i_compared, ecs_intra_j_compared
            ])

            # rename the precision column to "segmentwise_precision"
            # because we calculated precision for each segment independently
            ec_table_compared = ec_table_compared.rename(
                columns={"precision": "segmentwise_precision"})
            # TODO: change "cn" to "score" eventually
            ec_table_compared = ec_table_compared.sort_values("cn",
                                                              ascending=False)

            # add the total precision
            # TODO: implement different cutoffs for intra vs inter contacts
            ec_table_compared = add_precision(
                ec_table_compared, dist_cutoff=kwargs["distance_cutoff"])

            # save all compared ECs to file
            ec_table_compared.to_csv(outcfg[out_file], index=False)

            # save the inter ECs to a separate file
            ecs_inter_compared.to_csv(
                outcfg["ec_compared_inter_file"], index=False
            )

    # create the inter-ecs line drawing script
    if outcfg["ec_compared_inter_file"] is not None and kwargs[
            "plot_highest_count"] is not None:
        inter_ecs = ec_table.query("segment_i != segment_j")

        outcfg[
            "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml"

        pairs.ec_lines_pymol_script(
            inter_ecs.iloc[:kwargs["plot_highest_count"], :],
            outcfg["ec_lines_compared_pml_file"],
            distance_cutoff=kwargs["distance_cutoff"],
            chain={
                first_segment_name: "A",
                second_segment_name: "B"
            })

    # Remap the complex crystal structures, if available
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        outcfg["complex_remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_complex_chains(
                first_sifts_map,
                second_sifts_map,
                seqmap_i,
                seqmap_j,
                output_prefix=aux_prefix,
                raise_missing=kwargs["raise_missing"]).items()
        }

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot
    outcfg["contact_map_files"] = _make_complex_contact_maps(
        ec_table, d_intra_i, d_multimer_i, d_intra_j, d_multimer_j, d_inter,
        first_segment_name, second_segment_name, **kwargs)

    return outcfg
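
The segment-wise splitting of the EC table in Step 3 can be illustrated with a toy table; the column names mirror the EC file used above, and the values are invented.

import pandas as pd

ec_table = pd.DataFrame({
    "i": [5, 7, 3, 12],
    "j": [60, 9, 80, 44],
    "segment_i": ["A_1", "A_1", "B_1", "A_1"],
    "segment_j": ["B_1", "A_1", "B_1", "B_1"],
    "cn": [1.2, 0.9, 0.8, 0.7],
})

first_segment_name, second_segment_name = "A_1", "B_1"

# intra-segment ECs for each monomer, and inter-segment ECs
ecs_intra_i = ec_table.query("segment_i == segment_j == @first_segment_name")
ecs_intra_j = ec_table.query("segment_i == segment_j == @second_segment_name")
ecs_inter = ec_table.query("segment_i != segment_j")

print(len(ecs_intra_i), len(ecs_intra_j), len(ecs_inter))
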
Esempio n. 23
0
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    # write the sorted ECs table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"],
                score_column="cn"  # "di
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs, score="cn")  # "di"
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # create JSON output and write to file
            f.write(
                evzoom_json(model) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
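
The long-range filtering and sorting applied to the mean-field ECs can be reproduced on a toy table (values invented):

import pandas as pd

ecs = pd.DataFrame({
    "i": [1, 2, 3, 10],
    "j": [4, 30, 9, 60],
    "cn": [0.2, 1.5, 0.7, 1.1],
})

min_sequence_distance = 6

# sort by coupling score, then keep only long-range pairs
ecs_sorted = ecs.sort_values(by="cn", ascending=False)
ecs_longrange = ecs_sorted.query(
    "abs(i - j) >= {}".format(min_sequence_distance)
)
print(ecs_longrange)
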
Esempio n. 24
0
def run_psipred(fasta_file, output_dir, binary="runpsipred"):
    """
    Run psipred secondary structure prediction

    psipred output file convention: run_psipred creates
    output files <rootname>.ss2 and <rootname>.horiz
    in the current working directory, where <rootname>
    is extracted from the basename of the input file
    (e.g. /home/test/<rootname>.fa)

    Parameters
    ----------
    fasta_file : str
        Input sequence file in FASTA format
    output_dir : str
        Directory in which output will be saved
    binary : str, optional (default: "runpsipred")
        Path of psipred executable (runpsipred)

    Returns
    -------
    ss2_file : str
        Absolute path to prediction output in "VFORMAT"
    horiz_file : str
        Absolute path to prediction output in "HFORMAT"

    Raises
    ------
    ExternalToolError
        If call to psipred fails
    """
    # make sure we have absolute path
    binary = path.abspath(binary)
    fasta_file = path.abspath(fasta_file)
    output_dir = path.abspath(output_dir)

    # make sure input file is valid
    verify_resources("Input FASTA file is invalid", fasta_file)

    # make sure output directory exists
    makedirs(output_dir)

    # execute psipred;
    # we have to start it from output directory so
    # result files end up there (this is hardcoded
    # in runpsipred)
    return_code, stdout, stderr = run(
        [binary, fasta_file],
        working_dir=output_dir,
    )

    # determine where psipred will store output based
    # on logic from runpsipred script
    rootname, _ = path.splitext(path.basename(fasta_file))
    output_prefix = path.join(output_dir, rootname)

    # construct paths to output files in vertical and horizontal formats
    ss2_file = output_prefix + ".ss2"
    horiz_file = output_prefix + ".horiz"

    # make sure we actually predicted something
    verify_resources("psipred output is invalid", ss2_file, horiz_file)

    return ss2_file, horiz_file
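
The derivation of the psipred output paths from the input file name is plain path manipulation and can be checked without running the tool; the paths below are hypothetical.

from os import path

fasta_file = "/home/test/example.fa"   # hypothetical input
output_dir = "/tmp/psipred_out"        # hypothetical output directory

# rootname is the basename of the input file without its extension
rootname, _ = path.splitext(path.basename(fasta_file))
output_prefix = path.join(output_dir, rootname)

ss2_file = output_prefix + ".ss2"      # VFORMAT prediction
horiz_file = output_prefix + ".horiz"  # HFORMAT prediction
print(ss2_file, horiz_file)
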
Esempio n. 25
0
def existing(**kwargs):
    """
    Protocol:

    Use external sequence alignment and extract all relevant
    information from there (e.g. sequence, region, etc.),
    then apply gap & fragment filtering as usual

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the list of required arguments.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * sequence_file
        * first_index
        * target_sequence_file
        * annotation_file (None)
        * frequencies_file
        * identities_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "input_alignment", "sequence_id", "first_index",
        "extract_annotation"
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this file is starting point of pipeline;
    # check if input alignment actually exists
    input_alignment = kwargs["input_alignment"]
    verify_resources("Input alignment does not exist", input_alignment)

    # first try to autodetect format of alignment
    with open(input_alignment) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment))

    with open(input_alignment) as f:
        ali_raw = Alignment.from_file(f, format)

    # save annotation in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        from_anno_line = (format == "stockholm")
        annotation = extract_header_annotation(ali_raw,
                                               from_annotation=from_anno_line)
        annotation.to_csv(annotation_file, index=False)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id))

    # identify what columns (non-gap) to keep for focus
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq
    ])

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given.")

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)

    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # apply sequence identity and fragment filters,
    # and gap threshold
    mod_outcfg, ali = modify_alignment(focus_ali, focus_index, id_,
                                       region_start, **kwargs)

    # generate output configuration of protocol
    outcfg = {
        **mod_outcfg,
        "sequence_id": sequence_id,
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "target_sequence_file": target_sequence_file,
        "focus_sequence": header,
        "focus_mode": True,
    }

    if annotation_file is not None:
        outcfg["annotation_file"] = annotation_file

    # dump config to YAML file for debugging/logging
    write_config_file(prefix + ".align_existing.outcfg", outcfg)

    # return results of protocol
    return outcfg
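# Hedged usage sketch for the protocol above; the paths and the sequence
# identifier are hypothetical, and further kwargs consumed downstream by
# modify_alignment (gap/fragment filter settings etc.) would also be needed.
outcfg = existing(
    prefix="output/example",
    input_alignment="input/example_alignment.sto",
    sequence_id="EXAMPLE_HUMAN",
    first_index=1,
    extract_annotation=True,
)
print(outcfg["focus_sequence"], outcfg["alignment_file"])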
Esempio n. 26
0
def run_hmmscan(query,
                database,
                prefix,
                use_model_threshold=True,
                threshold_type="cut_ga",
                use_bitscores=True,
                domain_threshold=None,
                seq_threshold=None,
                nobias=False,
                cpu=None,
                stdout_redirect=None,
                binary="hmmscan"):
    """
    Run hmmscan of HMMs in database against sequences in query
    to identify matches of these HMMs.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence(s)
    database : str
        File containing HMM database (prepared with hmmpress)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_model_threshold: bool (default: True)
        Use model-specific inclusion thresholds from
        HMM database rather than global bitscore/E-value
        thresholds (use_bitscores, domain_threshold and
        seq_threshold are overridden by this flag).
    threshold_type : {"cut_ga", "cut_nc", "cut_tc"} (default: "cut_ga")
        Use gathering (default), noise or trusted cutoff
        to define scan hits. Please refer to HMMER manual for
        details.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
        Overridden by use_model_threshold flag.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    binary : str (default: "hmmscan")
        Path to hmmscan binary (put in PATH for
        default to work)

    Returns
    -------
    HmmscanResult
        namedtuple with fields corresponding to the different
        output files (prefix, output, tblout, domtblout, pfamtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", query, database)

    create_prefix_folders(prefix)

    result = HmmscanResult(
        prefix,
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout", prefix + ".domtblout", prefix + ".pfamtblout")

    cmd = [
        binary,
        "-o",
        result.output,
        "--tblout",
        result.tblout,
        "--domtblout",
        result.domtblout,
        "--pfamtblout",
        result.pfamtblout,
        "--notextw",
        "--acc",
    ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # either use model-specific threshold, or custom
    # bitscore/E-value thresholds
    if use_model_threshold:
        THRESHOLD_CHOICES = ["cut_ga", "cut_nc", "cut_tc"]
        if threshold_type not in THRESHOLD_CHOICES:
            raise ValueError("Invalid model threshold, valid choices are: " +
                             ", ".join(THRESHOLD_CHOICES))

        cmd += ["--" + threshold_type]
    else:
        if seq_threshold is None or domain_threshold is None:
            raise ValueError("Must define sequence- and domain-level reporting"
                             "thresholds, or use gathering threshold instead.")

        if use_bitscores:
            cmd += [
                "-T",
                str(seq_threshold),
                "--domT",
                str(domain_threshold),
            ]
        else:
            cmd += [
                "-E",
                str(seq_threshold),
                "--domE",
                str(domain_threshold),
            ]

    cmd += [database, query]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created a table with hits
    verify_resources(
        "hmmscan did not return results: "
        "stdout={} stderr={} file={}".format(stdout, stderr, result.domtblout),
        result.domtblout)

    return result
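# Hedged usage sketch for run_hmmscan; the file paths are hypothetical and
# hmmscan plus an hmmpress-prepared database must be available. Here explicit
# E-value thresholds are used instead of the model-specific gathering cutoffs.
scan = run_hmmscan(
    query="input/query.fasta",
    database="databases/Pfam-A.hmm",
    prefix="output/query_vs_pfam",
    use_model_threshold=False,
    use_bitscores=False,
    domain_threshold="1E-03",
    seq_threshold="1E-03",
    cpu=4,
)
print(scan.domtblout)  # per-domain hit table written by hmmscan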
Esempio n. 27
0
def standard(**kwargs):
    """
    Protocol:
    Compare ECs for single proteins (or domains)
    to 3D structure information

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below for the full list

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * ec_compared_all_file
        * ec_compared_longrange_file
        * pdb_structure_hits_file
        * distmap_monomer
        * distmap_multimer
        * contact_map_files
        * remapped_pdb_files
    """
    check_required(kwargs, [
        "prefix",
        "ec_file",
        "min_sequence_distance",
        "pdb_mmtf_dir",
        "atom_filter",
        "compare_multimer",
        "distance_cutoff",
        "target_sequence_file",
        "scale_sizes",
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file":
        prefix + "_CouplingScoresCompared_longrange.csv",
        "pdb_structure_hits_file": prefix + "_structure_hits.csv",
        "pdb_structure_hits_unfiltered_file":
        prefix + "_structure_hits_unfiltered.csv",
        # cannot have the distmap files end with "_file" because there are
        # two files (.npy and .csv), which would cause problems with automatic
        # checking if those files exist
        "distmap_monomer": prefix + "_distance_map_monomer",
        "distmap_multimer": prefix + "_distance_map_multimer",
    }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    # Step 1: Identify 3D structures for comparison
    sifts_map, sifts_map_full = _identify_structures(
        **{
            **kwargs,
            "prefix": aux_prefix,
        })

    # save selected PDB hits
    sifts_map.hits.to_csv(outcfg["pdb_structure_hits_file"], index=False)

    # also save full list of hits
    sifts_map_full.hits.to_csv(outcfg["pdb_structure_hits_unfiltered_file"],
                               index=False)

    # Step 2: Compute distance maps

    # load all structures at once
    structures = load_structures(sifts_map.hits.pdb_id,
                                 kwargs["pdb_mmtf_dir"],
                                 raise_missing=False)

    # compute distance maps and save
    # (but only if we found some structure)
    if len(sifts_map.hits) > 0:
        d_intra = intra_dists(sifts_map,
                              structures,
                              atom_filter=kwargs["atom_filter"],
                              output_prefix=aux_prefix + "_distmap_intra")
        d_intra.to_file(outcfg["distmap_monomer"])

        # save contacts to separate file
        outcfg["monomer_contacts_file"] = prefix + "_contacts_monomer.csv"
        d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
            outcfg["monomer_contacts_file"], index=False)

        # compute multimer distances, if requested;
        # note that d_multimer can be None if there
        # are no structures with multiple chains
        if kwargs["compare_multimer"]:
            d_multimer = multimer_dists(sifts_map,
                                        structures,
                                        atom_filter=kwargs["atom_filter"],
                                        output_prefix=aux_prefix +
                                        "_distmap_multimer")
        else:
            d_multimer = None

        # if we ended up with a multimer contact map, save it
        if d_multimer is not None:
            d_multimer.to_file(outcfg["distmap_multimer"])
            outcfg[
                "multimer_contacts_file"] = prefix + "_contacts_multimer.csv"

            # save contacts to separate file
            d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["multimer_contacts_file"], index=False)
        else:
            outcfg["distmap_multimer"] = None

        # at this point, also create remapped structures (e.g. for
        # later comparison of folding results)
        verify_resources("Target sequence file does not exist",
                         kwargs["target_sequence_file"])

        # create target sequence map for remapping structure
        with open(kwargs["target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # remap structures, swap mapping index and filename in
        # dictionary so we have a list of files in the dict keys
        outcfg["remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_chains(
                sifts_map, aux_prefix, seqmap).items()
        }
    else:
        # if no structures, can not compute distance maps
        d_intra = None
        d_multimer = None
        outcfg["distmap_monomer"] = None
        outcfg["distmap_multimer"] = None
        outcfg["remapped_pdb_files"] = None

    # Step 3: Compare ECs to distance maps

    ec_table = pd.read_csv(kwargs["ec_file"])

    # identify number of sites in EC model
    num_sites = len(
        set.union(set(ec_table.i.unique()), set(ec_table.j.unique())))

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:
        # compare ECs only if we minimally have intra distance map
        if d_intra is not None:
            coupling_scores_compared(ec_table,
                                     d_intra,
                                     d_multimer,
                                     dist_cutoff=kwargs["distance_cutoff"],
                                     output_file=outcfg[out_file],
                                     min_sequence_dist=min_seq_dist)
        else:
            outcfg[out_file] = None

    # also create line-drawing script if we made the csv
    if outcfg["ec_compared_longrange_file"] is not None:
        ecs_longrange = pd.read_csv(outcfg["ec_compared_longrange_file"])

        outcfg[
            "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml"
        pairs.ec_lines_pymol_script(ecs_longrange.iloc[:num_sites, :],
                                    outcfg["ec_lines_compared_pml_file"],
                                    distance_cutoff=kwargs["distance_cutoff"])

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot

    outcfg["contact_map_files"] = _make_contact_maps(ec_table, d_intra,
                                                     d_multimer, **kwargs)

    return outcfg
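# Hedged sketch (not part of the original protocol) of how the long-range
# compared EC table written above could be summarized; this assumes the CSV
# contains a "dist" column with the minimum structural distance per pair
# (NaN where there is no structural coverage) and is sorted by EC strength.
import pandas as pd

def top_ec_precision(ec_compared_file, num_ecs, distance_cutoff=5.0):
    ecs = pd.read_csv(ec_compared_file).head(num_ecs)
    covered = ecs.dropna(subset=["dist"])
    if len(covered) == 0:
        return None
    # fraction of structurally covered top ECs that are true contacts
    return float((covered["dist"] <= distance_cutoff).mean())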
Esempio n. 28
0
def genome_distance(**kwargs):
    """
    Protocol:

    Concatenate alignments based on genomic distance

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below for the full list

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file

    """

    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_region_start", "second_region_start",
            "first_segments", "second_segments",
            "genome_distance_threshold",
            "first_genome_location_file",
            "second_genome_location_file",
            "first_annotation_file",
            "second_annotation_file"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    verify_resources(
        "Genome location file does not exist",
        kwargs["first_genome_location_file"],
        kwargs["second_genome_location_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load the information for each monomer alignment
    alignment_1 = kwargs["first_alignment_file"]
    alignment_2 = kwargs["second_alignment_file"]

    genome_location_filename_1 = kwargs["first_genome_location_file"]
    genome_location_filename_2 = kwargs["second_genome_location_file"]

    gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0)
    gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0)

    # find all possible matches
    possible_partners = find_possible_partners(
        gene_location_table_1, gene_location_table_2
    )

    # find the best reciprocal matches
    id_pairing_unfiltered = best_reciprocal_matching(possible_partners)

    # filter best reciprocal matches by genome distance threshold
    if kwargs["genome_distance_threshold"]:
        distance_threshold = kwargs["genome_distance_threshold"]
        id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold")
    else:
        id_pairing = id_pairing_unfiltered

    id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"]
    id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"]

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            id_pairing,
            alignment_1,
            alignment_2,
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)   

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)   

    # filter the alignment
    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    # plot the genome distance distribution
    outcfg["distance_plot_file"] = prefix + "_distplot.pdf"
    plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"])

    return outcfg
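# Toy illustration (synthetic data, not from the pipeline) of the genome
# distance filter applied above: pandas' query() can reference local
# variables via the @ prefix.
import pandas as pd

pairs = pd.DataFrame({
    "uniprot_id_1": ["A1", "A2", "A3"],
    "uniprot_id_2": ["B1", "B2", "B3"],
    "distance": [120, 45000, 800],
})
distance_threshold = 10000
print(pairs.query("distance < @distance_threshold"))  # keeps A1/B1 and A3/B3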
Esempio n. 29
0
    def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):
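        # note: nested helper from the enclosing complex comparison protocol;
        # kwargs, outcfg, prefix, aux_prefix and structures are taken from
        # the enclosing scope rather than passed in as arguments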

        # prepare a sequence map to remap the structures we have found
        verify_resources("Target sequence file does not exist",
                         kwargs[name_prefix + "_target_sequence_file"])

        # create target sequence map for remapping structure
        with open(kwargs[name_prefix + "_target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        # determine the numbering range of the target sequence from its header
        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # compute distance maps and save
        # (but only if we found some structure)
        if len(sifts_map.hits) > 0:
            d_intra = intra_dists(sifts_map,
                                  structures,
                                  atom_filter=kwargs["atom_filter"],
                                  output_prefix=aux_prefix + "_" +
                                  name_prefix + "_distmap_intra")
            d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"])

            # save contacts to separate file
            outcfg[
                name_prefix +
                "_monomer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_monomer.csv"
            d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg[name_prefix + "_monomer_contacts_file"], index=False)

            # compute multimer distances, if requested;
            # note that d_multimer can be None if there
            # are no structures with multiple chains
            if kwargs[name_prefix + "_compare_multimer"]:
                d_multimer = multimer_dists(sifts_map,
                                            structures,
                                            atom_filter=kwargs["atom_filter"],
                                            output_prefix=aux_prefix + "_" +
                                            name_prefix + "_distmap_multimer")
            else:
                d_multimer = None

            # if we have a multimer contact map, save it
            if d_multimer is not None:
                d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"])
                outcfg[
                    name_prefix +
                    "_multimer_contacts_file"] = prefix + name_prefix + "_contacts_multimer.csv"

                # save contacts to separate file
                d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                    outcfg[name_prefix + "_multimer_contacts_file"],
                    index=False)
            else:
                outcfg[name_prefix + "_distmap_multimer"] = None

            # create remapped structures (e.g. for
            # later comparison of folding results)
            # remap structures, swap mapping index and filename in
            # dictionary so we have a list of files in the dict keys
            outcfg[name_prefix + "_remapped_pdb_files"] = {
                filename: mapping_index
                for mapping_index, filename in remap_chains(
                    sifts_map,
                    aux_prefix,
                    seqmap,
                    chain_name=chain_name,
                    raise_missing=kwargs["raise_missing"]).items()
            }

        else:
            # if no structures, cannot compute distance maps
            d_intra = None
            d_multimer = None
            outcfg[name_prefix + "_distmap_monomer"] = None
            outcfg[name_prefix + "_distmap_multimer"] = None
            outcfg[name_prefix + "remapped_pdb_files"] = None

        return d_intra, d_multimer, seqmap
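# Small illustration (made-up values) of the seqmap built above: header
# coordinates are zipped onto the ungapped target sequence, giving a
# {position: residue} mapping used to renumber structure chains.
seq_start, seq_end = 5, 9
seq = "MKTAY"
seqmap = dict(zip(range(seq_start, seq_end + 1), seq))
print(seqmap)  # {5: 'M', 6: 'K', 7: 'T', 8: 'A', 9: 'Y'}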
Esempio n. 30
0
def complex_dock(**kwargs):
    """
    Protocol:
    Generate docking restraints from inter-protein evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below for the full list

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * docking_restraint_files
    """
    check_required(kwargs, [
        "prefix",
        "ec_file",
        "segments",
        "dock_probability_cutoffs",
        "dock_lowest_count",
        "dock_highest_count",
        "dock_increase",
    ])

    prefix = kwargs["prefix"]
    outcfg = {}

    # make sure output directory exists
    create_prefix_folders(prefix)

    verify_resources("EC file does not exist and/or is empty",
                     kwargs["ec_file"])

    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs_dock = ecs_all.query("segment_i != segment_j")

    # define the sub-runs ...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["dock_probability_cutoffs"]

    if cutoffs is not None and "probability" in ecs_dock.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_dock.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs, "_significant_ECs_{}_restraints.tbl".format(c)))

    # ... and on simple EC counts/bins
    flc = kwargs["dock_lowest_count"]
    fhc = kwargs["dock_highest_count"]
    fi = kwargs["dock_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(set(ecs_dock.i.unique())) + len(
            set(ecs_dock.j.unique()))

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of restraint sets to generate
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [(ecs_dock.iloc[:c], "_{}_restraints.tbl".format(c))
                         for c in range(lowest, highest + 1, step)]

    outcfg["docking_restraint_files"] = []
    for job_ecs, job_suffix in folding_runs:
        job_filename = prefix + job_suffix
        docking_restraints(job_ecs, job_filename, haddock_dist_restraint)
        outcfg["docking_restraint_files"].append(job_filename)

    return outcfg
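# Toy illustration of the fraction-to-count conversion used above: float
# settings are read as fractions of the number of sites, integers as
# absolute EC counts (all values here are made up).
from math import ceil

num_sites = 200

def _discrete_count(x):
    if isinstance(x, float):
        x = ceil(x * num_sites)
    return int(x)

print(_discrete_count(0.05))  # 10 ECs (5% of 200 sites)
print(_discrete_count(50))    # 50 ECs (already an absolute count)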