Code example #1
File: restraints.py Project: wangdang511/EVcouplings
def _docking_config(config_file=None):
    """
    Load docking configuration

    Parameters
    ----------
    config_file: str, optional (default: None)
        Path to configuration file. If None,
        loads default configuration included
        with package.

    Returns
    -------
    dict
        Loaded configuration
    """
    if config_file is None:
        # get path of config within package
        config_file = resource_filename(
            __name__, "cns_templates/haddock_restraints.yml")

    # check if config file exists and read
    verify_resources("Folding config file does not exist or is empty",
                     config_file)

    return read_config_file(config_file)
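For orientation, a minimal usage sketch of the loader above; the custom file name is a placeholder and not part of the package.

# Hypothetical usage of _docking_config from the snippet above.
# With no argument, the default template shipped with the package is loaded:
config = _docking_config()

# or point it at a custom restraints configuration (placeholder path):
config = _docking_config("my_haddock_restraints.yml")
print(sorted(config.keys()))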
Code example #2
    def __init__(self, logreg_model_file=None, min_n_eff_over_l=0.375):
        """
        Create new logistic regression-based
        EC rescorer

        Parameters
        ----------
        logreg_model_file : str, optional (default: None)
            Specify path to yml file with logistic regression
            model parameters; if None, will use default
            model included with package
            (evcouplings/couplings/scoring_models/logistic_regression_all.yml)
        min_n_eff_over_l : float, optional (default: 0.375)
            Minimum number of effective sequences per model site required
            for rescoring to be applied; otherwise the standard score will
            be returned and all probabilities will be set to 0. The
            default value will be divided by theta for the rescored run;
            the default of 0.375 derives from N_eff/L = 0.3 at theta = 0.8.
        """
        # by default load internal classifier included with package
        if logreg_model_file is None:
            logreg_model_file = resource_filename(
                __name__, "scoring_models/logistic_regression_all.yml")

        # load classifier from param file
        logreg_model_serialized = read_config_file(logreg_model_file)

        # deserialize and store classifier
        self.classifier, self.feature_names = logreg_classifier_from_dict(
            logreg_model_serialized)

        # store min N_eff/L requirement
        self.min_n_eff_over_l = min_n_eff_over_l
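A small numeric check of the relation stated in the docstring; this only verifies the arithmetic linking 0.375, 0.3 and theta = 0.8, not how the threshold is applied at rescoring time.

# the stored default of 0.375 and the quoted N_eff/L = 0.3 at theta = 0.8
# differ by exactly one factor of theta (0.375 * 0.8 == 0.3)
theta = 0.8
n_eff_over_l_at_theta = 0.3
stored_default = n_eff_over_l_at_theta / theta
assert abs(stored_default - 0.375) < 1e-12
assert abs(stored_default * theta - n_eff_over_l_at_theta) < 1e-12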
Code example #3
File: pipeline.py Project: tetukas/EVcouplings
def run(**kwargs):
    """
    EVcouplings pipeline execution from a
    configuration file (single thread, no
    batch or environment configuration)
    
    Parameters
    ----------
    kwargs
        See click.option decorators for app()
    """
    config_file = kwargs["config"]
    verify_resources("Config file does not exist or is empty.", config_file)

    # read configuration and execute
    config = read_config_file(config_file)

    # execute configuration in "wrapped" mode
    # that handles exceptions and internal interrupts
    return execute_wrapped(**config)
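A minimal sketch of invoking this entry point directly; in practice the kwargs come from the click options mentioned in the docstring, and the file name below is a placeholder.

# hypothetical direct call; "config" is the only key read explicitly here,
# everything else reaches execute_wrapped via the configuration file
global_state = run(config="my_monomer_job.yml")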
Code example #4
def protein_monomer(prefix, configs):
    """
    Create results summary for run using
    protein_monomer pipeline

    # TODO
    """
    MIN_PROBABILITY = 0.9

    ali_table = pd.DataFrame()
    prefix_to_cfgs = {}
    data = defaultdict(lambda: defaultdict())

    # go through all config files
    for cfg_file in configs:
        # check if the file exists and has contents
        # since run might not yet have finished or crashed
        if valid_file(cfg_file):
            # job input configuration
            C = read_config_file(cfg_file)
            sub_prefix = C["global"]["prefix"]
            domain_threshold = C["align"]["domain_threshold"]
            sub_index = (domain_threshold, sub_prefix)

            final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX
            if not valid_file(final_state_cfg):
                continue

            # read final output state of job
            R = read_config_file(final_state_cfg)
            data[sub_index]["identities"] = R["identities_file"]
            data[sub_index]["frequencies"] = R["frequencies_file"]
            data[sub_index]["minimum_column_coverage"] = C["align"][
                "minimum_column_coverage"]

            stat_file = R["statistics_file"]
            ec_file = R.get("ec_file", "")
            ec_comp_file = R.get("ec_compared_longrange_file", "")

            prefix_to_cfgs[sub_prefix] = (C, R)

            # read and modify alignment statistics
            if valid_file(stat_file):
                # get alignment stats for current job
                stat_df = pd.read_csv(stat_file)
                n_eff = R["effective_sequences"]

                if n_eff is not None:
                    stat_df.loc[0, "N_eff"] = n_eff

                stat_df.loc[0, "domain_threshold"] = domain_threshold
                L = stat_df.loc[0, "num_cov"]

                # try to get number of significant ECs in addition
                if valid_file(ec_file):
                    ecs = pd.read_csv(ec_file)
                    min_seq_dist = C["compare"]["min_sequence_distance"]
                    num_sig = len(
                        ecs.query(
                            "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY"
                        ))
                    stat_df.loc[0, "num_significant"] = num_sig

                # try to get EC precision in addition
                if valid_file(ec_comp_file):
                    ec_comp = pd.read_csv(ec_comp_file)
                    stat_df.loc[0, "precision"] = ec_comp.iloc[L]["precision"]

                # finally, append to global table
                ali_table = pd.concat([ali_table, stat_df])

    # sort table by sequence search threshold
    ali_table = ali_table.sort_values(by="domain_threshold")

    # when saving files, have to acquire lock to make sure
    # jobs don't start overwriting results

    # make plots and save
    fig = _protein_monomer_plot(ali_table, data)
    plot_file = prefix + "_job_statistics_summary.pdf"
    lock_plot = filelock.FileLock(plot_file)
    with lock_plot:
        fig.savefig(plot_file, bbox_inches="tight")

    # save ali statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
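The significant-EC count above relies on pandas query syntax, where @name refers to a surrounding Python variable; a small self-contained sketch of that pattern with made-up numbers.

import pandas as pd

# toy EC table with positions i, j and a probability column
ecs = pd.DataFrame({
    "i": [1, 2, 3, 10],
    "j": [5, 30, 4, 80],
    "probability": [0.95, 0.99, 0.85, 0.97],
})

MIN_PROBABILITY = 0.9
min_seq_dist = 6

# @name references the local variables defined above
num_sig = len(
    ecs.query("abs(i - j) >= @min_seq_dist and probability >= @MIN_PROBABILITY")
)
print(num_sig)  # 2 -- the pairs (2, 30) and (10, 80)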
Code example #5
def protein_complex(prefix, configs):
    """
    Create results summary for run using
    protein_complex pipeline

    """
    # TODO: this is only designed to work with skewnormal threshold
    MIN_PROBABILITY = 0.9

    # number of inter ECs to check for precision
    NUM_INTER = 5

    # TODO: create segments global variable and import
    FIRST_SEGMENT = "A_1"
    SECOND_SEGMENT = "B_1"

    ali_table = pd.DataFrame()
    prefix_to_cfgs = {}
    data = defaultdict(lambda: defaultdict())

    # go through all config files
    for cfg_file in configs:
        # check if the file exists and has contents
        # since run might not yet have finished or crashed
        if valid_file(cfg_file):
            # job input configuration
            C = read_config_file(cfg_file)
            sub_prefix = C["global"]["prefix"]
            sub_index = sub_prefix

            final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX
            if not valid_file(final_state_cfg):
                continue

            # read final output state of job
            R = read_config_file(final_state_cfg)
            data[sub_index]["identities"] = R["identities_file"]
            data[sub_index]["frequencies"] = R["frequencies_file"]
            data[sub_index]["minimum_column_coverage"] = C["concatenate"][
                "minimum_column_coverage"]

            stat_file = R["statistics_file"]
            ec_file = R.get("ec_file", "")
            ec_comp_file = R.get("ec_compared_longrange_file", "")
            concat_stat_file = R.get("concatentation_statistics_file", "")
            first_stat_file = R.get("first_statistics_file", "")
            second_stat_file = R.get("second_statistics_file", "")

            prefix_to_cfgs[sub_prefix] = (C, R)

            # read and modify alignment statistics
            if valid_file(stat_file):
                # get alignment stats for current job
                stat_df = pd.read_csv(stat_file)
                n_eff = R["effective_sequences"]

                if n_eff is not None:
                    stat_df.loc[0, "N_eff"] = n_eff

                L = stat_df.loc[0, "num_cov"]

                # try to get concatenation statistics in addition
                if valid_file(concat_stat_file):
                    concat_stat_df = pd.read_csv(concat_stat_file)

                    # get and save n sequences per monomer aln
                    n_seqs_1 = concat_stat_df.loc[0, "num_seqs_1"]
                    n_seqs_2 = concat_stat_df.loc[0, "num_seqs_2"]
                    stat_df.loc[0, "first_n_seqs"] = int(n_seqs_1)
                    stat_df.loc[0, "second_n_seqs"] = int(n_seqs_2)

                    # get and save median n paralogs per monomer aln
                    n_paralogs_1 = concat_stat_df.loc[
                        0, "median_num_per_species_1"]
                    n_paralogs_2 = concat_stat_df.loc[
                        0, "median_num_per_species_2"]
                    stat_df.loc[0, "median_num_per_species_1"] = n_paralogs_1
                    stat_df.loc[0, "median_num_per_species_2"] = n_paralogs_2

                # try to get number of significant ECs in addition
                if valid_file(ec_file):
                    ecs = pd.read_csv(ec_file)

                    # number of significant monomer ECs
                    min_seq_dist = C["compare"]["min_sequence_distance"]
                    num_sig = len(
                        ecs.query(
                            "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY"
                        ))

                    # number of inter-protein ECs significant
                    num_sig_inter = len(
                        ecs.query(
                            "segment_i != segment_j and probability >= @MIN_PROBABILITY"
                        ))
                    stat_df.loc[0, "num_significant"] = int(num_sig)

                    # rank of top inter contact
                    top_inter_rank = ecs.query(
                        "segment_i != segment_j").index[0]
                    stat_df.loc[0, "top_inter_rank"] = int(top_inter_rank)

                # try to get EC precision in addition
                if valid_file(ec_comp_file):
                    ec_comp = pd.read_csv(ec_comp_file)
                    ec_comp_1 = ec_comp.query(
                        "segment_i == segment_j == @FIRST_SEGMENT")
                    ec_comp_2 = ec_comp.query(
                        "segment_i == segment_j == @SECOND_SEGMENT")
                    ec_comp_inter = ec_comp.query("segment_i != segment_j")

                    # use the monomer statistics files to figure out how many sites in each monomer
                    if valid_file(first_stat_file) and valid_file(
                            second_stat_file):
                        stats_1 = pd.read_csv(first_stat_file)
                        L_1 = stats_1.loc[0, "num_cov"]

                        stats_2 = pd.read_csv(second_stat_file)
                        L_2 = stats_2.loc[0, "num_cov"]

                        # precision of monomer 1
                        stat_df.loc[
                            0, "first_monomer_precision"] = ec_comp_1.iloc[
                                L_1]["segmentwise_precision"]

                        # precision of monomer 2
                        stat_df.loc[
                            0, "second_monomer_precision"] = ec_comp_2.iloc[
                                L_2]["segmentwise_precision"]

                        # precision of top 5 inter
                        stat_df.loc[0, "inter_precision"] = ec_comp_inter.iloc[
                            NUM_INTER]["segmentwise_precision"]

                # finally, append to global table
                ali_table = pd.concat([ali_table, stat_df])

    # save ali statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
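Both summary functions above guard their output writes with filelock. A standalone sketch of that pattern follows; it uses a separate .lock file, which is the library's usual convention, whereas the code above locks the output path itself.

import filelock
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
table_file = "summary.csv"

# acquiring the lock blocks until no other process holds it, so concurrent
# jobs cannot overwrite the shared summary table mid-write
lock = filelock.FileLock(table_file + ".lock")
with lock:
    df.to_csv(table_file, index=False)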
Code example #6
def jackhmmer_search(**kwargs):
    """
    Protocol:

    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the full list

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "sequence_id", "sequence_file", "sequence_download_url",
        "region", "first_index", "use_bitscores", "domain_threshold",
        "sequence_threshold", "database", "iterations", "cpu", "nobias",
        "reuse_alignment", "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
        "extract_annotation"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"], kwargs["sequence_file"],
        kwargs["sequence_download_url"], full_sequence_file)

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(full_seq,
                                                       kwargs["sequence_id"],
                                                       kwargs["region"],
                                                       kwargs["first_index"],
                                                       target_sequence_file)

    # run jackhmmer... allow reuse of a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], len(cut_seq))

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use the /dev/null option of the jackhmmer function,
        # to avoid making assumptions about the operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
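The restart logic above serializes the search result by turning the namedtuple returned from the jackhmmer wrapper into a plain dict; a small illustration, where the tuple type is a stand-in and only the field names are taken from the keys used above.

from collections import namedtuple

# stand-in for the result object returned by at.run_jackhmmer
JackhmmerResult = namedtuple("JackhmmerResult", ["alignment", "domtblout", "output"])
ali = JackhmmerResult("job.sto", "job.domtblout", "job.stdout")

# same conversion as above: a plain dict can be written to the .outcfg
# checkpoint file and read back on a restarted run
ali = dict(ali._asdict())
print(ali["alignment"])  # job.sto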
Code example #7
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and 
    search against a sequence database using hmmsearch.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the full list

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # symbol fraction threshold for column inclusion in the HMM built by
    # hmmbuild (symfrac 0.0 keeps all alignment columns)
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(kwargs, [
        "prefix", "sequence_id", "alignment_file", "use_bitscores",
        "domain_threshold", "sequence_threshold", "database", "cpu", "nobias",
        "reuse_alignment", "hmmbuild", "hmmsearch"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow reuse of a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], sequence_length)

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())
        # only item from hmmsearch_result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
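One detail worth highlighting from _format_alignment_for_hmmbuild above is how the focus sequence is moved to the front of the alignment. A self-contained sketch of that index swap on a toy id array:

import numpy as np

# stand-in alignment with five sequence ids; the target sits at position 3
ids = np.array(["seq_a", "seq_b", "seq_c", "TARGET/1-50", "seq_d"])
focus_index = 3

# exchange positions 0 and focus_index, leaving all other rows in place
indices = np.arange(0, len(ids))
indices[0] = focus_index
indices[focus_index] = 0

print(ids[indices])
# ['TARGET/1-50' 'seq_b' 'seq_c' 'seq_a' 'seq_d']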
Code example #8
def infer_plmc(**kwargs):
    """
    Run EC computation on alignment. This function contains
    the functionality shared between monomer and complex EC
    inference.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the full list
    
    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # the following are passed through stage...
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it as a plmc
        # parameter, since the default parameter has some implementation
        # advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_valid_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    return outcfg, ecs, segments
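A worked example of the lambda_J scaling step above, assuming the standard 21-character protein alphabet (gap plus 20 amino acids) and an illustrative number of match columns.

lambda_J = 0.01
num_symbols = 21              # "-" plus 20 amino acids

ignore_gaps = True
if ignore_gaps:
    num_symbols -= 1          # 20 symbols without the gap character

L = 150                       # number of uppercase (match) positions
lambda_J *= (num_symbols - 1) * (L - 1)
print(round(lambda_J, 2))     # 0.01 * 19 * 149 = 28.31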
Code example #9
File: pipeline.py Project: tetukas/EVcouplings
def execute(**config):
    """
    Execute a pipeline configuration

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for an
        example of how this should look)

    Returns
    -------
    global_state : dict
        Global output state of pipeline
    """
    check_required(config, ["pipeline", "stages", "global"])

    # check if valid pipeline was selected
    if config["pipeline"] not in PIPELINES:
        raise InvalidParameterError("Not a valid pipeline selection. "
                                    "Valid choices are:\n{}".format(", ".join(
                                        PIPELINES.keys())))

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # get definition of selected pipeline
    pipeline = PIPELINES[config["pipeline"]]
    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this is the global state of results as
    # we move through different stages of
    # the pipeline
    global_state = config["global"]

    # keep track of how many stages are still
    # to be run, so we can leave out stages at
    # the end of workflow below
    num_stages_to_run = len(stages)

    # get job tracker
    tracker = get_result_tracker(config)

    # set job status to running and also initialize global state
    tracker.update(status=EStatus.RUN, results=global_state)

    # iterate through individual stages
    for (stage, runner, key_prefix) in pipeline:
        # check if anything else is left to
        # run, otherwise skip
        if num_stages_to_run == 0:
            break

        # check if config for stage is there
        check_required(config, [stage])

        # output files for stage into an individual folder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # config files for input and output of stage
        stage_incfg = "{}_{}.incfg".format(stage_prefix, stage)
        stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage)

        # update current stage of job
        tracker.update(stage=stage)

        # check if stage should be executed
        if stage in stages:
            # global state inserted at end, overrides any
            # stage-specific settings (except for custom prefix)
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state, "prefix": stage_prefix
            }
            # save input of stage in config file
            write_config_file(stage_incfg, incfg)

            # run stage
            outcfg = runner(**incfg)

            # prefix output keys if this parameter is
            # given in stage configuration, to avoid
            # name clashes if same protocol run multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            # save output of stage in config file
            write_config_file(stage_outcfg, outcfg)

            # one less stage to put through after we ran this...
            num_stages_to_run -= 1
        else:
            # skip stage by injecting output state from the previous run
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage), stage_outcfg)

            # read output configuration
            outcfg = read_config_file(stage_outcfg)

            # verify all the output files are there
            outfiles = [
                filepath for f, filepath in outcfg.items()
                if f.endswith("_file") and filepath is not None
            ]

            verify_resources(
                "Output files from stage '{}' "
                "missing".format(stage), *outfiles)

        # update global state with outputs of stage
        global_state = {**global_state, **outcfg}

        # update state in tracker accordingly
        tracker.update(results=outcfg)

    # create results archive
    archive_file = create_archive(config, global_state, prefix)

    # only store results archive if a result file was created
    if archive_file is not None:
        global_state["archive_file"] = archive_file

        # prepare update for tracker, but only store in last
        # go when job is set to done
        tracker_archive_update = {"archive_file": archive_file}
    else:
        tracker_archive_update = None

    # set job status to done and transfer archive if selected for syncing
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested;
    # tracker does not need to update here since it won't
    # sync entries of delete list in the first place
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
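The stage input configuration above is assembled with a dictionary merge whose order matters; a small self-contained illustration with made-up values.

# later dictionaries override earlier ones, so global state wins over
# stage settings, and the stage-specific prefix wins over both
tools = {"jackhmmer": "/usr/bin/jackhmmer"}
stage_config = {"prefix": "ignored", "iterations": 5}
global_state = {"prefix": "run/job", "sequence_id": "P12345"}

incfg = {**tools, **stage_config, **global_state, "prefix": "run/align/job_align"}
print(incfg["iterations"])   # 5 (from the stage section)
print(incfg["sequence_id"])  # P12345 (from global state)
print(incfg["prefix"])       # run/align/job_align (explicit override)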
Code example #10
File: app.py Project: tetukas/EVcouplings
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file))

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # If alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123):".format(region))

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]
        # check if we have a predefined sequence database
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and kwargs.get(
            "evalues", None) is not None:
        raise InvalidParameterError(
            "Can not specify bitscore and E-value threshold at the same time.")

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [(float(t) if "." in t else int(t)) for t in T]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds))

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
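A compact sketch of the threshold handling at the end of substitute_config: a comma-separated string of bitscores or E-values is parsed into numbers, and multiple values expand into one batch entry per threshold (values here are illustrative).

thresholds = "0.3, 0.5, 0.7"
bitscore = True

# same parsing as above: strip spaces, split, cast to float or int
T = thresholds.replace(" ", "").split(",")
x_cast = [(float(t) if "." in t else int(t)) for t in T]

# several thresholds turn into one batch entry per value
batch = {
    ("_b" if bitscore else "_e") + str(t): {
        "align": {"domain_threshold": t, "sequence_threshold": t}
    }
    for t in x_cast
}
print(sorted(batch))  # ['_b0.3', '_b0.5', '_b0.7']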
Code example #11
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call below for the full list

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance", # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it as a plmc
        # parameter, since the default parameter has some implementation
        # advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to rerun; only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # load parameters
            c = CouplingsModel(outcfg["model_file"])

            # create JSON output and write to file
            f.write(
                evzoom_json(c) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
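As a footnote to the long-range output above, a toy illustration of the distance filter and "top L" selection used for the PyMOL line-drawing script; the table is made up and assumed to be sorted by EC score already.

import pandas as pd

ecs = pd.DataFrame({
    "i": [3, 10, 40, 7],
    "j": [8, 60, 45, 90],
    "score": [1.2, 0.9, 0.8, 0.7],
})

min_sequence_distance = 6
L = 2  # stand-in for num_sites

# keep pairs at least min_sequence_distance apart in sequence,
# then take the top L rows for drawing EC lines
ecs_longrange = ecs.query("abs(i - j) >= {}".format(min_sequence_distance))
top_L = ecs_longrange.iloc[:L, :]
print(len(top_L))  # 2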