Example #1
def load_structures(pdb_ids, structure_dir=None, raise_missing=True):
    """
    Load PDB structures from files / web

    Parameters
    ----------
    pdb_ids : Iterable
        List / iterable containing PDB identifiers
        to be loaded.
    structure_dir : str, optional (default: None)
        Path to directory with structures. Structure
        filenames must be in the format 5p21.mmtf.
        If a file cannot be found, it will be fetched
        from the web instead.
    raise_missing : bool, optional (default: True)
        Raise a ResourceError exception if any of the
        PDB IDs cannot be loaded. If False, missing
        entries will be ignored.

    Returns
    -------
    structures : dict(str -> PDB)
        Dictionary containing loaded structures.
        Keys (PDB identifiers) will be lower-case.

    Raises
    ------
    ResourceError
        Raised if raise_missing is True and any of the given
        PDB IDs cannot be loaded.
    """
    # collect loaded structures in dict(id -> PDB)
    structures = {}

    # load structure by structure
    for pdb_id in set(pdb_ids):
        pdb_id = pdb_id.lower()

        has_file = False
        if structure_dir is not None:
            structure_file = path.join(structure_dir, pdb_id + ".mmtf")
            has_file = valid_file(structure_file)

        try:
            # see if we can load locally from disk
            if has_file:
                structures[pdb_id] = PDB.from_file(structure_file)
            else:
                # otherwise fetch from web
                structures[pdb_id] = PDB.from_id(pdb_id)
        except (ResourceError, UnicodeDecodeError):
            # ResourceError: invalid PDB ID
            # UnicodeDecodeError: some random problem with mmtf library
            if raise_missing:
                raise

    return structures
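
# Usage sketch (not part of the original source): a minimal, hypothetical call
# to load_structures defined above; the directory and PDB IDs are placeholders.
structures = load_structures(
    ["5P21", "1AKE"],                    # identifiers are lower-cased internally
    structure_dir="/tmp/structures",     # looks for 5p21.mmtf etc., else fetches from web
    raise_missing=False,                 # skip, rather than raise on, IDs that fail to load
)
ras_structure = structures.get("5p21")   # keys of the result dict are lower-case IDs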
Example #2
def create_archive(config, outcfg, output_file):
    """
    Create archive of files generated by pipeline

    Parameters
    ----------
    config : dict-like
        Input configuration of job. Uses
        config["management"]["archive"] (list of keys
        used to index outcfg) to determine
        which files should be added to the archive
    outcfg : dict-like
        Output configuration of job
    output_file : str
        Store archive file to this path
    """
    # determine keys (corresponding to files) in
    # outcfg that should be stored
    outkeys = config.get("management", {}).get("archive", None)

    # if no output keys are requested, nothing to do
    if outkeys is None or len(outkeys) == 0:
        return

    # create archive
    with tarfile.open(output_file, "w:gz") as tar:
        # add files based on keys one by one
        for k in outkeys:
            # skip missing keys or ones not defined
            if k not in outcfg or outcfg[k] is None:
                continue

            # distinguish between files and lists of files
            if k.endswith("files"):
                for f in outcfg[k]:
                    if valid_file(f):
                        tar.add(f)
            else:
                if valid_file(outcfg[k]):
                    tar.add(outcfg[k])
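
# Usage sketch (not part of the original source): a hypothetical call to the
# create_archive variant above. Config keys and file names are placeholders.
job_config = {
    "management": {
        # keys of outcfg whose files should go into the archive
        "archive": ["alignment_file", "ec_files", "statistics_file"]
    }
}
job_outcfg = {
    "alignment_file": "output/test_alignment.fasta",    # single file
    "ec_files": ["output/test_ECs.txt"],                 # key ending in "files" -> list
    "statistics_file": None,                             # None entries are skipped
}
create_archive(job_config, job_outcfg, "output/test_results.tar.gz")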
Example #3
    def __init__(self, sifts_table_file, sequence_file=None):
        """
        Create new SIFTS mapper from mapping table.

        Note that creating the mapping files, if they do not
        exist yet, takes a while.

        Parameters
        ----------
        sifts_table_file : str
            Path to *corrected* SIFTS pdb_chain_uniprot.csv.
            To generate this file automatically, point to a
            file path that does not yet exist.
        sequence_file : str, optional (default: None)
            Path to file containing all UniProt sequences
            in SIFTS (used for homology-based identification
            of structures).
            Note: This file can be created using the
            create_sequence_file() method.
        """
        # test if table exists, if not, download and modify
        if not valid_file(sifts_table_file):
            self._create_mapping_table(sifts_table_file)

        self.table = pd.read_csv(sifts_table_file, comment="#")

        # final table has still some entries where lengths do not match,
        # remove these
        self.table = self.table.query(
            "(resseq_end - resseq_start) == (uniprot_end - uniprot_start)")

        self.sequence_file = sequence_file

        # if path for sequence file given, but not there, create
        if sequence_file is not None and not valid_file(sequence_file):
            self.create_sequence_file(sequence_file)

        # add UniProt ID column if we have sequence mapping
        # from FASTA file
        if self.sequence_file is not None:
            self._add_uniprot_ids()
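
# Usage sketch (not part of the original source): constructing the SIFTS mapper
# defined by the __init__ above. The class name "SIFTS" and the file paths are
# assumptions / placeholders; missing files are created on first use.
sifts_mapper = SIFTS(
    "data/pdb_chain_uniprot_corrected.csv",            # created if it does not exist yet
    sequence_file="data/sifts_uniprot_sequences.fa",   # enables homology-based lookup
)
print(len(sifts_mapper.table))   # pandas DataFrame with the cleaned mapping table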
Example #4
def _protein_monomer_plot(ali_table, data):
    """
    # TODO
    """
    import seaborn as sns
    sns.set_palette("Paired", len(ali_table), None)

    FONTSIZE = 16
    # set up plot and grid
    fig = plt.figure(figsize=(15, 15))
    gridsize = (3, 2)
    ax_cov = plt.subplot2grid(gridsize, (0, 0), colspan=1)
    ax_distr = plt.subplot2grid(gridsize, (0, 1), colspan=1)
    ax_gaps = plt.subplot2grid(gridsize, (1, 0), colspan=2)
    ax_sig = plt.subplot2grid(gridsize, (2, 0), colspan=1)
    ax_comp = plt.subplot2grid(gridsize, (2, 1), colspan=1)

    # 1) Number of sequences, coverage
    l_seqs = ax_cov.plot(ali_table.domain_threshold,
                         ali_table.N_eff / ali_table.num_cov,
                         "ok-",
                         label="# Sequences")
    ax_cov.set_xlabel("Domain inclusion threshold")
    ax_cov.set_ylabel("# effective sequences / L")
    ax_cov.set_title("Sequences and coverage", fontsize=FONTSIZE)
    ax_cov.legend(loc="lower left")

    ax_cov2 = ax_cov.twinx()
    l_cov = ax_cov2.plot(ali_table.domain_threshold,
                         ali_table.num_cov / ali_table.seqlen,
                         "o-",
                         label="Coverage",
                         color="#2079b4")
    ax_cov2.set_ylabel("Coverage (% of region)")
    ax_cov2.legend(loc="lower right")
    ax_cov2.set_ylim(0, 1)

    # 2) sequence identity & coverage distributions
    for (domain_threshold, subjob), subdata in sorted(data.items()):
        # sequence identities to query
        if valid_file(subdata["identities"]):
            ids = pd.read_csv(subdata["identities"]).identity_to_query.dropna()
            ax_distr.hist(ids,
                          histtype="step",
                          range=(0, 1.0),
                          bins=100,
                          density=True,
                          cumulative=True,
                          linewidth=3,
                          label=str(domain_threshold))

            ali_table.loc[ali_table.prefix == subjob,
                          "average_identity"] = ids.mean()

        # coverage distribution
        if valid_file(subdata["frequencies"]):
            freqs = pd.read_csv(subdata["frequencies"])
            # print(freqs.head())
            ax_gaps.plot(freqs.i,
                         1 - freqs.loc[:, "-"],
                         "o",
                         linewidth=3,
                         label=str(domain_threshold))
            mincov = subdata["minimum_column_coverage"]
            if mincov > 1:
                mincov /= 100

            ax_gaps.axhline(mincov, ls="--", color="k")

    ax_distr.set_xlabel("% sequence identity to query")
    ax_distr.set_title("Sequence identity distribution", fontsize=FONTSIZE)
    ax_distr.set_xlim(0, 1)
    ax_distr.set_ylim(0, 1)
    ax_distr.legend()

    ax_gaps.set_title("Gap statistics", fontsize=FONTSIZE)
    ax_gaps.set_xlabel("Sequence index")
    ax_gaps.set_ylabel("Column coverage (1 - % gaps)")
    ax_gaps.autoscale(enable=True, axis='x', tight=True)
    ax_gaps.set_ylim(0, 1)
    ax_gaps.legend(loc="best")

    # number of significant ECs, EC precision
    if "num_significant" in ali_table.columns:
        ax_sig.plot(ali_table.domain_threshold,
                    ali_table.num_significant / ali_table.num_cov, "ok-")

    ax_sig.set_title("Significant ECs", fontsize=FONTSIZE)
    ax_sig.set_xlabel("Domain inclusion threshold")
    ax_sig.set_ylabel("Fraction of significant ECs (% of L)")

    if "precision" in ali_table.columns:
        ax_comp.plot(ali_table.domain_threshold, ali_table.precision, "ok-")

    ax_comp.set_title("Comparison to 3D (top L ECs)", fontsize=FONTSIZE)
    ax_comp.set_xlabel("Domain inclusion threshold")
    ax_comp.set_ylabel("EC precision")
    ax_comp.set_ylim(0, 1)

    return fig
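
# Usage sketch (not part of the original source): the expected inputs of
# _protein_monomer_plot, inferred from how the function accesses them.
# All values below are hypothetical placeholders.
import pandas as pd

ali_table = pd.DataFrame({
    "prefix": ["job_0.3", "job_0.5"],       # one row per alignment sub-job
    "domain_threshold": [0.3, 0.5],
    "N_eff": [1200.0, 800.0],               # effective number of sequences
    "num_cov": [150, 140],                  # number of covered positions
    "seqlen": [180, 180],                   # length of target region
})
data = {
    (0.3, "job_0.3"): {
        "identities": "job_0.3_identities.csv",     # per-sequence identity to query
        "frequencies": "job_0.3_frequencies.csv",   # per-column frequencies incl. "-"
        "minimum_column_coverage": 70,              # values > 1 are treated as percent
    },
}
fig = _protein_monomer_plot(ali_table, data)
fig.savefig("monomer_summary.pdf")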
def jackhmmer_search(**kwargs):
    """
    Protocol:

    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where check_required is called.

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "sequence_id", "sequence_file", "sequence_download_url",
        "region", "first_index", "use_bitscores", "domain_threshold",
        "sequence_threshold", "database", "iterations", "cpu", "nobias",
        "reuse_alignment", "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
        "extract_annotation"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"], kwargs["sequence_file"],
        kwargs["sequence_download_url"], full_sequence_file)

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(full_seq,
                                                       kwargs["sequence_id"],
                                                       kwargs["region"],
                                                       kwargs["first_index"],
                                                       target_sequence_file)

    # run jackhmmer... allow reuse of a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], len(cut_seq))

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use /dev/null option of jackhmmer function
        # to make no assumption about operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
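
# Usage sketch (not part of the original source): a hypothetical kwargs dict for
# jackhmmer_search, covering the keys verified by check_required above. Paths,
# thresholds, URLs and database names are placeholders.
jackhmmer_kwargs = {
    "prefix": "output/example/example",
    "sequence_id": "RASH_HUMAN",
    "sequence_file": None,                     # fetch sequence if no file is given
    "sequence_download_url": "https://www.uniprot.org/uniprot/{}.fasta",  # placeholder pattern
    "region": None,                            # None = use full sequence
    "first_index": 1,
    "use_bitscores": True,
    "domain_threshold": 0.5,                   # bits per residue if use_bitscores
    "sequence_threshold": 0.5,
    "database": "uniref90",                    # name of the key holding the database path
    "uniref90": "/databases/uniref90.fasta",   # resolved via kwargs[kwargs["database"]]
    "iterations": 5,
    "cpu": 4,
    "nobias": False,
    "reuse_alignment": True,
    "checkpoints_hmm": False,
    "checkpoints_ali": False,
    "jackhmmer": "jackhmmer",                  # path to jackhmmer binary
    "extract_annotation": True,
}
outcfg = jackhmmer_search(**jackhmmer_kwargs)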
def _make_hmmsearch_raw_fasta(alignment_result, prefix):
    """
    HMMsearch results do not contain the query sequence,
    so we must construct a raw_fasta file with the query
    sequence as the first hit, to ensure proper numbering.
    The search result is filtered to only contain the columns
    with match states to the HMM, which have a one-to-one
    mapping to the query sequence.

    Parameters
    ----------
    alignment_result : dict
        Alignment result dictionary, output by run_hmmsearch
    prefix : str
        Prefix for file creation

    Returns
    -------
    str
        path to raw focus alignment file

    """
    def _add_gaps_to_query(query_sequence_ali, ali):

        # get the indices of columns that do not contain match states
        # (match states are marked "x" in the RF annotation)
        gap_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x != "x"
        ]
        # get the index of columns that contain match states (indicated by an x)
        match_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x"
        ]

        # ensure that the length of the match states
        # match the length of the sequence
        if len(match_index) != query_sequence_ali.L:
            raise ValueError("HMMsearch result {} does not have a one-to-one"
                             " mapping to the query sequence columns".format(
                                 ar["raw_alignment_file"]))

        gapped_query_sequence = ""
        seq = list(query_sequence_ali.matrix[0, :])

        # loop through every position in the HMMsearch hits
        for i in range(len(ali.annotation["GC"]["RF"])):
            # if that position should be a gap, add a gap
            if i in gap_index:
                gapped_query_sequence += "-"
            # if that position should be a letter, pop the next
            # letter in the query sequence
            else:
                gapped_query_sequence += seq.pop(0)

        new_sequence_ali = Alignment.from_dict(
            {query_sequence_ali.ids[0]: gapped_query_sequence})
        return new_sequence_ali

    # open the sequence file
    with open(alignment_result["target_sequence_file"]) as a:
        query_sequence_ali = Alignment.from_file(a, format="fasta")

    # if the provided alignment is empty, just return the target sequence
    raw_focus_alignment_file = prefix + "_raw.fasta"
    if not valid_file(alignment_result["raw_alignment_file"]):
        # write the query sequence to a fasta file
        with open(raw_focus_alignment_file, "w") as of:
            query_sequence_ali.write(of)

        # return as an alignment object
        return raw_focus_alignment_file

    # else, open the HMM search result
    with open(alignment_result["raw_alignment_file"]) as a:
        ali = Alignment.from_file(a, format="stockholm")

    # make sure that the stockholm alignment contains the match annotation
    if not ("GC" in ali.annotation and "RF" in ali.annotation["GC"]):
        raise ValueError("Stockholm alignment {} missing RF"
                         " annotation of match states".format(
                             alignment_result["raw_alignment_file"]))

    # add insertions to the query sequence in order to preserve correct
    # numbering of match sequences
    gapped_sequence_ali = _add_gaps_to_query(query_sequence_ali, ali)

    # write a new alignment file with the query sequence as
    # the first entry

    with open(raw_focus_alignment_file, "w") as of:
        gapped_sequence_ali.write(of)
        ali.write(of)

    return raw_focus_alignment_file
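
# Usage sketch (not part of the original source): a hypothetical call to
# _make_hmmsearch_raw_fasta. The dictionary keys mirror the ones accessed in the
# function body; file names are placeholders.
alignment_result = {
    "target_sequence_file": "output/example.fa",   # FASTA file with the query sequence
    "raw_alignment_file": "output/example.sto",    # Stockholm output of hmmsearch
}
raw_fasta = _make_hmmsearch_raw_fasta(alignment_result, "output/example")
# raw_fasta == "output/example_raw.fasta", with the gapped query as the first entry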
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and 
    search against a sequence database using hmmsearch.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where check_required is called.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the gap threshold for column inclusion in HMMs built by hmmbuild
    # (symfrac=0.0 means all alignment columns become match states)
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(kwargs, [
        "prefix", "sequence_id", "alignment_file", "use_bitscores",
        "domain_threshold", "sequence_threshold", "database", "cpu", "nobias",
        "reuse_alignment", "hmmbuild", "hmmsearch"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow reuse of a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], sequence_length)

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())
        # only item from hmmsearch_result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
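
# Usage sketch (not part of the original source): a hypothetical kwargs dict for
# hmmbuild_and_search, matching the check_required list above (plus first_index,
# which the nested formatting helper reads). All values are placeholders.
hmm_kwargs = {
    "prefix": "output/example/example",
    "sequence_id": "RASH_HUMAN",
    "alignment_file": "input/example_alignment.fasta",   # seed alignment for hmmbuild
    "first_index": 1,
    "use_bitscores": True,
    "domain_threshold": 0.5,
    "sequence_threshold": 0.5,
    "database": "uniref90",
    "uniref90": "/databases/uniref90.fasta",   # resolved via kwargs[kwargs["database"]]
    "cpu": 4,
    "nobias": False,
    "reuse_alignment": False,
    "hmmbuild": "hmmbuild",                    # paths to HMMER binaries
    "hmmsearch": "hmmsearch",
}
outcfg = hmmbuild_and_search(**hmm_kwargs)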
Example #8
def infer_plmc(**kwargs):
    """
    Run EC computation on alignment. This function contains
    the functionality shared between monomer and complex EC
    inference.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where check_required is called.
    
    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)

    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # the following are passed through stage...
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have protein alphabet, do not set
        # as plmc parameter since default parameter,
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from a previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_valid_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    return outcfg, ecs, segments
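
# Usage sketch (not part of the original source): hypothetical kwargs for
# infer_plmc, covering the keys from check_required plus lambda_J_times_Lq,
# which the function also reads. All values are placeholders.
plmc_kwargs = {
    "prefix": "output/example/example",
    "alignment_file": "output/example_raw_focus.fasta",
    "focus_mode": True,
    "focus_sequence": "RASH_HUMAN/1-166",
    "theta": 0.8,
    "alphabet": None,                  # None = default protein alphabet
    "segments": None,
    "ignore_gaps": False,
    "iterations": 100,
    "lambda_h": 0.01,
    "lambda_J": 0.01,
    "lambda_J_times_Lq": True,         # scale lambda_J by (num_symbols - 1) * (L - 1)
    "lambda_group": None,
    "scale_clusters": None,
    "cpu": 4,
    "plmc": "plmc",                    # path to plmc binary
    "reuse_ecs": False,
}
outcfg, ecs, segments = infer_plmc(**plmc_kwargs)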
Example #9
def create_archive(config, outcfg, prefix):
    """
    Create archive of files generated by pipeline

    Parameters
    ----------
    config : dict-like
        Input configuration of job. Uses
        config["management"]["archive"] (list of keys
        used to index outcfg) to determine
        which files should be added to the archive
    outcfg : dict-like
        Output configuration of job
    prefix : str
        Prefix of job, will be used to define archive
        file path (prefix + archive type-specific extension)
    """
    # allowed output archive formats
    ALLOWED_FORMATS = ["targz", "zip"]

    # determine selected output format, default is .tar.gz
    archive_format = config.get("management", {}).get("archive_format",
                                                      "targz")

    # determine keys (corresponding to files) in
    # outcfg that should be stored
    archive_keys = config.get("management", {}).get("archive", None)

    # if no files selected for archiving, return immediately
    if archive_keys is None:
        return

    # check if selected format is valid
    if archive_format not in ALLOWED_FORMATS:
        raise InvalidParameterError(
            "Invalid format for output archive: {}. ".format(archive_format) +
            "Valid options are: " + ", ".join(ALLOWED_FORMATS))

    # create explicit list of files that would go into archive and are valid files
    archive_files = [(file_path, file_key, idx)
                     for (file_path, file_key,
                          idx) in iterate_files(outcfg, subset=archive_keys)
                     if valid_file(file_path)]

    # if there are no files, don't create an archive
    if len(archive_files) == 0:
        return

    if archive_format == "targz":
        final_archive_file = prefix + ".tar.gz"
        with tarfile.open(final_archive_file, "w:gz") as tar:
            for (file_path, file_key, idx) in archive_files:
                tar.add(file_path)
    elif archive_format == "zip":
        final_archive_file = prefix + ".zip"
        with zipfile.ZipFile(final_archive_file, "w",
                             zipfile.ZIP_DEFLATED) as zip_:
            for (file_path, file_key, idx) in archive_files:
                zip_.write(file_path)

    return final_archive_file
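
# Usage sketch (not part of the original source): the newer create_archive
# variant above selects the format via config["management"]["archive_format"].
# Keys and paths are placeholders.
job_config = {
    "management": {
        "archive": ["ec_file", "model_file"],
        "archive_format": "zip",              # "targz" (default) or "zip"
    }
}
job_outcfg = {
    "ec_file": "output/example_CouplingScores.csv",
    "model_file": "output/example.model",
}
archive_path = create_archive(job_config, job_outcfg, "output/example")
# archive_path == "output/example.zip"; returns None if there is nothing to store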
Example #10
def run_jobs(configs, global_config, overwrite=False, workdir=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    overwrite : bool, optional (default: False)
        If True, overwrite the configuration file of a previous
        run with the same prefix; if False, raise an exception
        to protect existing results
    workdir : str, optional (default: None)
        Working directory into which job output prefixes and
        configuration files are placed (prefixes that are
        absolute paths override this setting)
    python = executable
    pipeline_path = path.abspath(pipeline.__file__)
    summarize_path = path.abspath(summarize.__file__)

    cmd_base = "{} {}".format(python, pipeline_path)
    summ_base = "{} {}".format(python, summarize_path)

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage)."
        )

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to the working
    # directory (above, we allow submitted jobs to run in an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(
        summ_base,
        global_config["pipeline"],
        global_config["global"]["prefix"],
        " ".join(config_files)
    )

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(
        global_config["environment"]["engine"],
        db_path=out_prefix + "_job_database.txt"
    )

    # collect individual submitted jobs here
    commands = []

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # set job status in database to pending
        pipeline.update_job_status(job_cfg, status=database.EStatus.PEND)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            [
                "{} {}".format(cmd_base, job_cfg_file),
                summ_cmd
            ],
            name=job_prefix,
            environment=env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            }
        )

        # store job for later dependency creation
        commands.append(cmd)

        # finally, submit job
        submitter.submit(cmd)

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()
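
# Usage sketch (not part of the original source): the configuration structure
# run_jobs expects, inferred from the keys accessed above. Pipeline name, engine,
# queue and resource values are placeholders for the user's compute environment.
global_config = {
    "pipeline": "protein_monomer",
    "global": {"prefix": "output/example"},
    "environment": {
        "engine": "slurm",             # passed to utils.SubmitterFactory
        "queue": "short",
        "time": "24:00",
        "cores": 4,
        "memory": 16000,
        "configuration": None,         # extra environment setup for each job
    },
}
# one entry per subjob prefix; each value is a full job configuration
subjob_config = dict(global_config, **{"global": {"prefix": "output/example_b0.5"}})
subjob_configs = {"output/example_b0.5": subjob_config}
run_jobs(subjob_configs, global_config, overwrite=False, workdir=None)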
Example #11
def cns_dgsa_fold(residues,
                  ec_pairs,
                  prefix,
                  config_file=None,
                  secstruct_column="sec_struct_3state",
                  num_structures=20,
                  min_cycles=5,
                  log_level=None,
                  binary="cns"):
    """
    Predict 3D structure coordinates using distance geometry
    and simulated annealing-based folding protocol
     
    Parameters
    ----------
    residues : pandas.DataFrame
        Table containing positions (column i), residue
        type (column A_i), and secondary structure for
        each position
    ec_pairs : pandas.DataFrame
        Table with EC pairs that will be turned
        into distance restraints
        (with columns i, j, A_i, A_j)
    prefix : str
        Prefix for output files (can include directories).
        Folders will be created automatically.
    config_file : str, optional (default: None)
        Path to config file with folding settings. If None,
        will use default settings included in package
        (restraints.yml)
    secstruct_column : str, optional (default: sec_struct_3state)
        Column name in residues dataframe from which secondary
        structure will be extracted (has to be H, E, or C).
    num_structures : int, optional (default: 20)
        Number of trial structures to generate
    min_cycles : int, optional (default: 5)
        Number of minimization cycles at end of protocol
    log_level : {None, "quiet", "verbose"}, optional (default: None)
        Don't keep CNS log files, or switch to different degrees
        of verbosity ("verbose" needed to obtain violation information)
    binary : str, optional (default: "cns")
        Path of CNS binary

    Returns
    -------
    final_models : dict
        Mapping from model name to path of model
    """
    def _run_inp(inp_str, output_prefix):
        with open(output_prefix + ".inp", "w") as f:
            f.write(inp_str)

        if log_level is not None:
            log_file = output_prefix + ".log"
        else:
            log_file = None

        run_cns(inp_str, log_file=log_file, binary=binary)

    # make sure output directory exists
    create_prefix_folders(prefix)

    # CNS doesn't like paths above a certain length, so we
    # will change into working directory and keep paths short.
    # For this reason, extract path and filename prefix
    dir_, rootname = path.split(prefix)
    cwd = os.getcwd()

    if dir_ != "":
        os.chdir(dir_)

    # create restraints (EC pairs and secondary structure-based)
    ec_tbl = rootname + "_couplings.tbl"
    ss_dist_tbl = rootname + "_ss_distance.tbl"
    ss_angle_tbl = rootname + "_ss_angle.tbl"

    ec_dist_restraints(ec_pairs, ec_tbl, cns_dist_restraint, config_file)

    secstruct_dist_restraints(residues, ss_dist_tbl, cns_dist_restraint,
                              config_file, secstruct_column)

    secstruct_angle_restraints(residues, ss_angle_tbl, cns_dihedral_restraint,
                               config_file, secstruct_column)

    # create sequence file
    seq = "".join(residues.A_i)
    seq_file = rootname + ".seq"
    cns_seq_file(seq, seq_file)

    # set up input files for folding
    # make molecular topology file (will be written to mtf_file)
    mtf_file = rootname + ".mtf"
    _run_inp(
        cns_mtf_inp(seq_file,
                    mtf_file,
                    first_index=residues.i.min(),
                    disulfide_bridges=None), mtf_file)

    # make extended PDB file (will be in extended_file)
    extended_file = rootname + "_extended.pdb"
    _run_inp(cns_extended_inp(mtf_file, extended_file), extended_file)

    # fold using dg_sa protocol (filenames will have suffixes _1, _2, ...)

    # have to pass either quiet or verbose to CNS (but will not store
    # log file if log_level is None).
    if log_level is None:
        dgsa_log_level = "quiet"
    else:
        dgsa_log_level = log_level

    _run_inp(
        cns_dgsa_inp(extended_file,
                     mtf_file,
                     rootname,
                     ec_tbl,
                     ss_dist_tbl,
                     ss_angle_tbl,
                     num_structures=num_structures,
                     log_level=dgsa_log_level), rootname + "_dgsa")

    # add hydrogen atoms and minimize (for all
    # generated candidate structures from dg_sa)

    # keep track of final predicted structures
    final_models = {}

    for i in range(1, num_structures + 1):
        input_root = "{}_{}".format(rootname, i)
        input_model = input_root + ".pdb"

        # check if we actually got the model from dg_sa
        if not valid_file(input_model):
            continue

        # run generate_easy protocol to add hydrogen atoms
        easy_pdb = input_root + "_h.pdb"
        easy_mtf = input_root + "_h.mtf"
        _run_inp(cns_generate_easy_inp(input_model, easy_pdb, easy_mtf),
                 input_root + "_h")

        # then minimize
        min_pdb = input_root + "_hMIN.pdb"

        _run_inp(
            cns_minimize_inp(easy_pdb,
                             easy_mtf,
                             min_pdb,
                             num_cycles=min_cycles), input_root + "_hMIN")

        if valid_file(min_pdb):
            final_models[min_pdb] = path.join(dir_, min_pdb)

    # change back into original directory
    os.chdir(cwd)

    return final_models
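
# Usage sketch (not part of the original source): minimal, hypothetical inputs
# for cns_dgsa_fold. Column names follow the docstring above; the sequence, ECs
# and secondary structure are toy placeholders.
import pandas as pd

residues = pd.DataFrame({
    "i": [1, 2, 3, 4, 5],
    "A_i": ["M", "T", "E", "Y", "K"],
    "sec_struct_3state": ["C", "C", "E", "E", "C"],   # H / E / C
})
ec_pairs = pd.DataFrame({
    "i": [1, 2], "A_i": ["M", "T"],
    "j": [4, 5], "A_j": ["Y", "K"],
})
models = cns_dgsa_fold(
    residues, ec_pairs, "output/fold/example",
    num_structures=2, log_level=None, binary="cns",
)
print(models)   # {minimized model file name: full path of model}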
Example #12
def run_plmc(alignment, couplings_file, param_file=None,
             focus_seq=None, alphabet=None, theta=None,
             scale=None, ignore_gaps=False, iterations=None,
             lambda_h=None, lambda_J=None, lambda_g=None,
             cpu=None, binary="plmc"):
    """
    Run plmc on sequence alignment and store
    files with model parameters and pair couplings.

    Parameters
    ----------
    alignment : str
        Path to input sequence alignment
    couplings_file : str
        Output path for file with evolutionary couplings
        (folder will be created)
    param_file : str, optional (default: None)
        Output path for binary file containing model
        parameters (folder will be created)
    focus_seq : str, optional (default: None)
        Name of focus sequence, if None, non-focus mode
        will be used
    alphabet : str, optional (default: None)
        Alphabet for model inference. If None, standard
        amino acid alphabet including gap will be used.
        First character in string corresponds to gap
        character (relevant for ignore_gaps).
    theta : float, optional (default: None)
        Sequences with pairwise identity >= theta
        will be clustered and their sequence weights
        downweighted as 1 / num_cluster_members.
        Important: Note that plmc will be parametrized using
        1 - theta. If None, default value in plmc will be used,
        which corresponds to theta=0.8 (plmc setting 0.2).
    scale : float, optional (default: None)
        Scale weights of clusters by this value.
        If None, default value in plmc (1.0) will be used
    ignore_gaps : bool, optional (default: False)
        Exclude gaps from parameter inference. Gap
        character is first character of alphabet
        parameter.
    iterations : int, optional (default: None)
        Maximum iterations for optimization.
    lambda_h : float, optional (default: None)
        l2 regularization strength on fields.
        If None, plmc default will be used.
    lambda_J : float, optional (default: None)
        l2-regularization strength on couplings.
        If None, plmc default will be used
    lambda_g : float, optional (default: None)
        Group L1-regularization strength on couplings.
        If None, plmc default will be used.
    cpu : int or str, optional (default: None)
        Number of cores to use for running plmc.
        Note that plmc has to be compiled in OpenMP
        mode to be runnable with multiple cores.
        Can also be set to "max".
    binary : str, optional (default: "plmc")
        Path to plmc binary

    Returns
    -------
    PlmcResult
        namedtuple containing output files and
        parsed fields from console output of plmc

    Raises
    ------
    ExternalToolError
    """
    create_prefix_folders(couplings_file)

    # Make sure input alignment exists
    verify_resources(
        "Alignment file does not exist", alignment
    )

    cmd = [
        binary,
        "-c", couplings_file,
    ]

    # store eij file if explicitly requested
    if param_file is not None:
        create_prefix_folders(param_file)
        cmd += ["-o", param_file]

    # focus sequence mode and ID
    if focus_seq is not None:
        # TODO: for now split exclude sequence
        # region from focus seq name, otherwise
        # plmc does not remap names. If this
        # behaviour changes in plmc, remove the
        # following line.
        focus_seq = focus_seq.split("/")[0]
        cmd += ["-f", focus_seq]

    # exclude gaps from calculation?
    if ignore_gaps:
        cmd += ["-g"]

    # maximum number of iterations, can also be "max"
    if iterations is not None:
        cmd += ["-m", str(iterations)]

    # set custom alphabet
    # (first character is gap by default in nogap mode)
    if alphabet is not None:
        cmd += ["-a", alphabet]

    # sequence reweighting
    if theta is not None:
        # transform into plmc convention (1-theta)
        theta = 1.0 - theta
        cmd += ["-t", str(theta)]

    # cluster weight
    if scale is not None:
        cmd += ["-s", str(scale)]

    # L2 regularization weight for fields
    if lambda_h is not None:
        cmd += ["-lh", str(lambda_h)]

    # L2 regularization weight for pair couplings
    if lambda_J is not None:
        cmd += ["-le", str(lambda_J)]

    # Group L1 regularization weight for pair couplings
    if lambda_g is not None:
        cmd += ["-lg", str(lambda_g)]

    # Number of cores to use for calculation
    if cpu is not None:
        cmd += ["-n", str(cpu)]

    # finally also add input alignment (main parameter)
    cmd += [alignment]

    # TODO: for now do not check returncode because sometimes
    # returncode == -11 (segfault) despite successful calculation
    return_code, stdout, stderr = run(cmd, check_returncode=False)

    # TODO: remove this segfault-hunting output once fixed
    if return_code != 0:
        # if not a segfault, still raise exception
        if return_code != -11:
            from evcouplings.utils.system import ExternalToolError
            raise ExternalToolError(
                "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                    cmd, return_code, stdout, stderr
                )
            )

        print("PLMC NON-ZERO RETURNCODE:", return_code)
        print(cmd)
        print(" ".join(cmd))
        print("stdout:", stdout)
        print("stderr:", stderr)

    iter_df, out_fields = parse_plmc_log(stderr)

    # also check we actually calculated couplings...
    if not valid_file(couplings_file):
        raise ResourceError(
            "plmc returned no couplings: stdout={} stderr={} file={}".format(
                stdout, stderr, couplings_file
            )
        )

    # ... and parameter file, if requested
    if param_file and not valid_file(param_file):
        raise ResourceError(
            "plmc returned no parameter file: stdout={} stderr={} file={}".format(
                stdout, stderr, param_file
            )
        )

    return PlmcResult(
        couplings_file, param_file,
        iter_df, *out_fields
    )
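
# Usage sketch (not part of the original source): a hypothetical direct call to
# run_plmc; alignment path, output paths and regularization values are
# placeholders.
result = run_plmc(
    "output/example_raw_focus.fasta",
    "output/example_ECs.txt",               # couplings output (-c)
    param_file="output/example.model",      # binary parameter file (-o)
    focus_seq="RASH_HUMAN/1-166",           # region is stripped before passing -f
    theta=0.8,                              # passed to plmc as -t 0.2 (1 - theta)
    lambda_h=0.01, lambda_J=16.0,
    cpu=4, binary="plmc",
)
print(result)   # PlmcResult namedtuple with output files and parsed log fields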
def standard(**kwargs):
    """
    Protocol:
    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where check_required is called.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query("abs(i - j) > {}".format(
        kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get from EC values (could be misleading if
            # EC list is truncated, so only second option)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs,
                     "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of plots to make
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory; after folding,
    # final models will be moved to the main folding dir. Depending
    # on the cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map this comparison file back to the corresponding entry in remapped_pdb_files
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
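To make the count-based sub-run setup above concrete, here is a small, self-contained sketch of how fractional cutoffs are turned into absolute EC counts and job suffixes. All numbers are hypothetical and only illustrate the arithmetic; nothing here calls the pipeline itself.

from math import ceil

# hypothetical inputs: 120 coupled sites, scan from 0.5*L to 2.0*L ECs in steps of 0.25*L
num_sites = 120
fold_lowest_count, fold_highest_count, fold_increase = 0.5, 2.0, 0.25

def _discrete_count(x):
    # fractions of num_sites become absolute counts; integers pass through unchanged
    if isinstance(x, float):
        x = ceil(x * num_sites)
    return int(x)

lowest = _discrete_count(fold_lowest_count)    # 60
highest = _discrete_count(fold_highest_count)  # 240
step = _discrete_count(fold_increase)          # 30

# one (EC count, job suffix) pair per sub-run, mirroring the folding_runs entries above
sub_runs = [(c, "_{}".format(c)) for c in range(lowest, highest + 1, step)]
print(sub_runs)
# [(60, '_60'), (90, '_90'), ..., (240, '_240')]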
Exemple #14
0
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file))

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # If alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123): {}".format(region))

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]
        # check if we have a predefined sequence database
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are not both set
    if kwargs.get("bitscores", None) is not None and kwargs.get(
            "evalues", None) is not None:
        raise InvalidParameterError(
            "Cannot specify bitscore and E-value thresholds at the same time.")

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [(float(t) if "." in t else int(t)) for t in T]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds))

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
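The bitscore/E-value handling at the end of substitute_config is the part that silently turns a single run into a batch. The sketch below replays just that parsing and batch expansion on a hypothetical comma-separated threshold string; the nested dict shape mirrors the config["batch"] entries built above.

thresholds = "0.5, 0.7"    # hypothetical --bitscores command line value
bitscore = True

T = thresholds.replace(" ", "").split(",")
x_cast = [(float(t) if "." in t else int(t)) for t in T]   # [0.5, 0.7]

# a single threshold would simply go into align.domain_threshold /
# align.sequence_threshold; several thresholds expand into a batch of subjobs
batch = {
    ("_b" if bitscore else "_e") + str(t): {
        "align": {"domain_threshold": t, "sequence_threshold": t}
    }
    for t in x_cast
}

print(sorted(batch))   # ['_b0.5', '_b0.7']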
Exemple #15
0
def run_jobs(configs,
             global_config,
             overwrite=False,
             workdir=None,
             abort_on_error=True,
             environment=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    overwrite : bool, optional (default: False)
        If True, allows overwriting previous run of the same
        config, otherwise will fail if results from previous
        execution are present
    workdir : str, optional (default: None)
        Workdir in which to run job (will combine
        workdir and prefix in joint path)
    abort_on_error : bool, optional (default: True)
        Abort entire job submission if error occurs for
        one of the jobs by propagating RuntimeError
    environment : str, optional (default: None)
        Allow to pass value for environment parameter
        of submitter, will override environment.configuration
        from global_config (e.g., for setting environment
        variables like passwords)

    Returns
    -------
    job_ids : dict
        Mapping from subjob prefix (keys in configs parameter)
        to identifier returned by submitter for each of the jobs
        that was *successfully* submitted (i.e. missing keys from
        configs param indicate these jobs could not be submitted).

    Raises
    ------
    RuntimeError
        If error encountered during submission and abort_on_error
        is True
    """
    cmd_base = environ.get("EVCOUPLINGS_RUNCFG_APP") or "evcouplings_runcfg"
    summ_base = environ.get(
        "EVCOUPLINGS_SUMMARIZE_APP") or "evcouplings_summarize"

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage).")

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to the working
    # directory (above, we allow submitted jobs to run in an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(summ_base, global_config["pipeline"],
                                    global_config["global"]["prefix"],
                                    " ".join(config_files))

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(global_config["environment"]["engine"],
                                       db_path=out_prefix +
                                       "_job_database.txt")

    # collect individual submitted jobs here
    commands = []

    # record subjob IDs returned by submitter for each job
    job_ids = {}

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            ["{} {}".format(cmd_base, job_cfg_file), summ_cmd],
            name=job_prefix,
            environment=environment or env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            })

        # store job for later dependency creation
        commands.append(cmd)

        tracker = get_result_tracker(job_cfg)

        try:
            # finally, submit job
            current_job_id = submitter.submit(cmd)

            # store run identifier returned by submitter
            # TODO: consider storing current_job_id using tracker right away
            job_ids[job] = current_job_id

            # set job status in database to pending
            tracker.update(status=EStatus.PEND)

        except RuntimeError as e:
            # set job as failed in database
            tracker.update(status=EStatus.FAIL, message=str(e))

            # fail entire job submission if requested
            if abort_on_error:
                raise

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()

    # return job identifiers
    return job_ids
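One subtlety mentioned in the workdir handling of run_jobs: os.path.join discards earlier components as soon as a later component is an absolute path, which is why an absolute prefix overrides workdir. A tiny sketch (POSIX paths, hypothetical values):

from os import path

workdir = "/scratch/jobs"
print(path.join(workdir, "run1/test"))   # /scratch/jobs/run1/test  (relative prefix)
print(path.join(workdir, "/data/run1"))  # /data/run1               (absolute prefix wins)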
Exemple #16
0
def protein_complex(prefix, configs):
    """
    Create results summary for run using
    protein_complex pipeline

    """
    # TODO: this is only designed to work with skewnormal threshold
    MIN_PROBABILITY = 0.9

    # number of inter ECs to check for precision
    NUM_INTER = 5

    # TODO: create segments global variable and import
    FIRST_SEGMENT = "A_1"
    SECOND_SEGMENT = "B_1"

    ali_table = pd.DataFrame()
    prefix_to_cfgs = {}
    data = defaultdict(lambda: defaultdict())

    # go through all config files
    for cfg_file in configs:
        # check if the file exists and has contents,
        # since the run might not have finished yet or might have crashed
        if valid_file(cfg_file):
            # job input configuration
            C = read_config_file(cfg_file)
            sub_prefix = C["global"]["prefix"]
            sub_index = sub_prefix

            final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX
            if not valid_file(final_state_cfg):
                continue

            # read final output state of job
            R = read_config_file(final_state_cfg)
            data[sub_index]["identities"] = R["identities_file"]
            data[sub_index]["frequencies"] = R["frequencies_file"]
            data[sub_index]["minimum_column_coverage"] = C["concatenate"][
                "minimum_column_coverage"]

            stat_file = R["statistics_file"]
            ec_file = R.get("ec_file", "")
            ec_comp_file = R.get("ec_compared_longrange_file", "")
            concat_stat_file = R.get("concatentation_statistics_file", "")
            first_stat_file = R.get("first_statistics_file", "")
            second_stat_file = R.get("second_statistics_file", "")

            prefix_to_cfgs[(sub_prefix)] = (C, R)

            # read and modify alignment statistics
            if valid_file(stat_file):
                # get alignment stats for current job
                stat_df = pd.read_csv(stat_file)
                n_eff = R["effective_sequences"]

                if n_eff is not None:
                    stat_df.loc[0, "N_eff"] = n_eff

                L = stat_df.loc[0, "num_cov"]

                # try to get concatenation statistics in addition
                if valid_file(concat_stat_file):
                    concat_stat_df = pd.read_csv(concat_stat_file)

                    # get and save n sequences per monomer aln
                    n_seqs_1 = concat_stat_df.loc[0, "num_seqs_1"]
                    n_seqs_2 = concat_stat_df.loc[0, "num_seqs_2"]
                    stat_df.loc[0, "first_n_seqs"] = int(n_seqs_1)
                    stat_df.loc[0, "second_n_seqs"] = int(n_seqs_2)

                    # get and save median n paralogs per monomer aln
                    n_paralogs_1 = concat_stat_df.loc[
                        0, "median_num_per_species_1"]
                    n_paralogs_2 = concat_stat_df.loc[
                        0, "median_num_per_species_2"]
                    stat_df.loc[0, "median_num_per_species_1"] = n_paralogs_1
                    stat_df.loc[0, "median_num_per_species_2"] = n_paralogs_2

                # try to get number of significant ECs in addition
                if valid_file(ec_file):
                    ecs = pd.read_csv(ec_file)

                    # number of significant monomer ECs
                    min_seq_dist = C["compare"]["min_sequence_distance"]
                    num_sig = len(
                        ecs.query(
                            "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY"
                        ))

                    # number of significant inter-protein ECs
                    num_sig_inter = len(
                        ecs.query(
                            "segment_i != segment_j and probability >= @MIN_PROBABILITY"
                        ))
                    stat_df.loc[0, "num_significant"] = int(num_sig)

                    # rank of the top inter-protein EC
                    top_inter_rank = ecs.query(
                        "segment_i != segment_j").index[0]
                    stat_df.loc[0, "top_inter_rank"] = int(top_inter_rank)

                # try to get EC precision in addition
                if valid_file(ec_comp_file):
                    ec_comp = pd.read_csv(ec_comp_file)
                    ec_comp_1 = ec_comp.query(
                        "segment_i == segment_j == @FIRST_SEGMENT")
                    ec_comp_2 = ec_comp.query(
                        "segment_i == segment_j == @SECOND_SEGMENT")
                    ec_comp_inter = ec_comp.query("segment_i != segment_j")

                    # use the monomer statistics files to figure out how many sites are in each monomer
                    if valid_file(first_stat_file) and valid_file(
                            second_stat_file):
                        stats_1 = pd.read_csv(first_stat_file)
                        L_1 = stats_1.loc[0, "num_cov"]

                        stats_2 = pd.read_csv(second_stat_file)
                        L_2 = stats_2.loc[0, "num_cov"]

                        # precision of monomer 1
                        stat_df.loc[
                            0, "first_monomer_precision"] = ec_comp_1.iloc[
                                L_1]["segmentwise_precision"]

                        # precision of monomer 2
                        stat_df.loc[
                            0, "second_monomer_precision"] = ec_comp_2.iloc[
                                L_2]["segmentwise_precision"]

                        # precision of top 5 inter
                        stat_df.loc[0, "inter_precision"] = ec_comp_inter.iloc[
                            NUM_INTER]["segmentwise_precision"]

                # finally, append to global table
                ali_table = ali_table.append(stat_df)

    # save ali statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
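The significance counts in protein_complex (and in protein_monomer below) rely on pandas.DataFrame.query resolving local Python variables via the @ prefix. A minimal, self-contained sketch with made-up EC rows:

import pandas as pd

MIN_PROBABILITY = 0.9
min_seq_dist = 6

ecs = pd.DataFrame({
    "i": [10, 10, 50],
    "j": [12, 40, 80],
    "probability": [0.95, 0.99, 0.50],
})

# only the (10, 40) pair is both long-range and above the probability cutoff
num_sig = len(
    ecs.query("abs(i - j) >= @min_seq_dist and probability >= @MIN_PROBABILITY")
)
print(num_sig)   # 1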
Exemple #17
0
def protein_monomer(prefix, configs):
    """
    Create results summary for run using
    protein_monomer pipeline

    # TODO
    """
    MIN_PROBABILITY = 0.9

    ali_table = pd.DataFrame()
    prefix_to_cfgs = {}
    data = defaultdict(lambda: defaultdict())

    # go through all config files
    for cfg_file in configs:
        # check if the file exists and has contents,
        # since the run might not have finished yet or might have crashed
        if valid_file(cfg_file):
            # job input configuration
            C = read_config_file(cfg_file)
            sub_prefix = C["global"]["prefix"]
            domain_threshold = C["align"]["domain_threshold"]
            sub_index = (domain_threshold, sub_prefix)

            final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX
            if not valid_file(final_state_cfg):
                continue

            # read final output state of job
            R = read_config_file(final_state_cfg)
            data[sub_index]["identities"] = R["identities_file"]
            data[sub_index]["frequencies"] = R["frequencies_file"]
            data[sub_index]["minimum_column_coverage"] = C["align"][
                "minimum_column_coverage"]

            stat_file = R["statistics_file"]
            ec_file = R.get("ec_file", "")
            ec_comp_file = R.get("ec_compared_longrange_file", "")

            prefix_to_cfgs[(sub_prefix)] = (C, R)

            # read and modify alignment statistics
            if valid_file(stat_file):
                # get alignment stats for current job
                stat_df = pd.read_csv(stat_file)
                n_eff = R["effective_sequences"]

                if n_eff is not None:
                    stat_df.loc[0, "N_eff"] = n_eff

                stat_df.loc[0, "domain_threshold"] = domain_threshold
                L = stat_df.loc[0, "num_cov"]

                # try to get number of significant ECs in addition
                if valid_file(ec_file):
                    ecs = pd.read_csv(ec_file)
                    min_seq_dist = C["compare"]["min_sequence_distance"]
                    num_sig = len(
                        ecs.query(
                            "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY"
                        ))
                    stat_df.loc[0, "num_significant"] = num_sig

                # try to get EC precision in addition
                if valid_file(ec_comp_file):
                    ec_comp = pd.read_csv(ec_comp_file)
                    stat_df.loc[0, "precision"] = ec_comp.iloc[L]["precision"]

                # finally, append to global table
                ali_table = ali_table.append(stat_df)

    # sort table by sequence search threshold
    ali_table = ali_table.sort_values(by="domain_threshold")

    # when saving files, we have to acquire a lock to make sure
    # jobs don't overwrite each other's results

    # make plots and save
    fig = _protein_monomer_plot(ali_table, data)
    plot_file = prefix + "_job_statistics_summary.pdf"
    lock_plot = filelock.FileLock(plot_file)
    with lock_plot:
        fig.savefig(plot_file, bbox_inches="tight")

    # save ali statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
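Both summary functions serialize writes to the shared summary files through filelock, so concurrently finishing subjobs do not clobber each other's output. A minimal sketch of that pattern, assuming the filelock package is installed; unlike the code above, it locks a separate .lock file rather than the CSV itself (the more common filelock idiom), and the file name is hypothetical.

import filelock
import pandas as pd

table_file = "job_statistics_summary.csv"   # hypothetical shared output path

df = pd.DataFrame({"domain_threshold": [0.5], "N_eff": [1234.0]})

# acquiring the lock blocks until any other process holding it releases it
lock = filelock.FileLock(table_file + ".lock")
with lock:
    df.to_csv(table_file, index=False, float_format="%.3f")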
Exemple #18
0
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance", # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it
        # as a plmc parameter, since the default parameter
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from a previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but required file(s) "
            "are empty or do not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # load parameters
            c = CouplingsModel(outcfg["model_file"])

            # create JSON output and write to file
            f.write(
                evzoom_json(c) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
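The lambda_J_times_Lq branch above rescales the coupling regularization by (q - 1)(L - 1), where q is the alphabet size (minus one if gaps are ignored) and L the number of uppercase match-state positions. A small numeric sketch of just that arithmetic, with hypothetical values:

# hypothetical focus-mode alignment and regularization defaults
num_symbols = 21      # protein alphabet including the gap character
ignore_gaps = True
L = 150               # number of uppercase (match-state) positions
lambda_J = 0.01       # unscaled coupling regularization strength

if ignore_gaps:
    num_symbols -= 1  # one character less without the gap symbol

# same scaling as in the protocol above
lambda_J *= (num_symbols - 1) * (L - 1)
print(lambda_J)       # 0.01 * 19 * 149 = 28.31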