def _docking_config(config_file=None):
    """
    Load docking configuration

    Parameters
    ----------
    config_file : str, optional (default: None)
        Path to configuration file. If None, loads default
        configuration included with package.

    Returns
    -------
    dict
        Loaded configuration
    """
    if config_file is None:
        # get path of config within package
        config_file = resource_filename(
            __name__, "cns_templates/haddock_restraints.yml"
        )

    # check if config file exists and read
    verify_resources(
        "Docking config file does not exist or is empty", config_file
    )

    return read_config_file(config_file)
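# Illustrative usage sketch (not part of the module): load the bundled docking
# restraints configuration, or override it with a custom file. The path
# "my_restraints.yml" below is a hypothetical example.
#
#     default_cfg = _docking_config()
#     custom_cfg = _docking_config("my_restraints.yml")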
def __init__(self, logreg_model_file=None, min_n_eff_over_l=0.375):
    """
    Create new logistic regression-based EC rescorer

    Parameters
    ----------
    logreg_model_file : str, optional (default: None)
        Specify path to yml file with logistic regression model parameters;
        if None, will use default model included with package
        (evcouplings/couplings/scoring_models/logistic_regression_all.yml)
    min_n_eff_over_l : float, optional (default: 0.375)
        Minimum number of effective sequences per model site required for
        rescoring to be applied; otherwise standard score will be returned
        and all probabilities will be set to 0. The default value will be
        divided by theta for the rescored run; the default of 0.375 derives
        from N_eff/L = 0.3 at theta = 0.8.
    """
    # by default load internal classifier included with package
    if logreg_model_file is None:
        logreg_model_file = resource_filename(
            __name__, "scoring_models/logistic_regression_all.yml"
        )

    # load classifier from param file
    logreg_model_serialized = read_config_file(logreg_model_file)

    # deserialize and store classifier
    self.classifier, self.feature_names = logreg_classifier_from_dict(
        logreg_model_serialized
    )

    # store min N_eff/L requirement
    self.min_n_eff_over_l = min_n_eff_over_l
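# Worked example for the default threshold (illustrative, as implied by the
# docstring above): requiring N_eff/L = 0.3 at theta = 0.8 corresponds to a
# stored default of 0.3 / 0.8 = 0.375.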
def run(**kwargs):
    """
    EVcouplings pipeline execution from a configuration file
    (single thread, no batch or environment configuration)

    Parameters
    ----------
    kwargs
        See click.option decorators for app()
    """
    config_file = kwargs["config"]
    verify_resources(
        "Config file does not exist or is empty.", config_file
    )

    # read configuration and execute
    config = read_config_file(config_file)

    # execute configuration in "wrapped" mode
    # that handles exceptions and internal interrupts
    return execute_wrapped(**config)
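# Illustrative usage sketch (assumes a valid pipeline configuration file;
# "my_job_config.txt" is a hypothetical file name):
#
#     global_state = run(config="my_job_config.txt")
#     print(global_state.get("ec_file"))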
def protein_monomer(prefix, configs):
    """
    Create results summary for run using protein_monomer pipeline

    # TODO
    """
    MIN_PROBABILITY = 0.9

    ali_table = pd.DataFrame()
    prefix_to_cfgs = {}
    data = defaultdict(lambda: defaultdict())

    # go through all config files
    for cfg_file in configs:
        # check if the file exists and has contents
        # since run might not yet have finished or crashed
        if valid_file(cfg_file):
            # job input configuration
            C = read_config_file(cfg_file)
            sub_prefix = C["global"]["prefix"]
            domain_threshold = C["align"]["domain_threshold"]
            sub_index = (domain_threshold, sub_prefix)

            final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX
            if not valid_file(final_state_cfg):
                continue

            # read final output state of job
            R = read_config_file(final_state_cfg)
            data[sub_index]["identities"] = R["identities_file"]
            data[sub_index]["frequencies"] = R["frequencies_file"]
            data[sub_index]["minimum_column_coverage"] = C["align"][
                "minimum_column_coverage"
            ]

            stat_file = R["statistics_file"]
            ec_file = R.get("ec_file", "")
            ec_comp_file = R.get("ec_compared_longrange_file", "")

            prefix_to_cfgs[(sub_prefix)] = (C, R)

            # read and modify alignment statistics
            if valid_file(stat_file):
                # get alignment stats for current job
                stat_df = pd.read_csv(stat_file)
                n_eff = R["effective_sequences"]

                if n_eff is not None:
                    stat_df.loc[0, "N_eff"] = n_eff

                stat_df.loc[0, "domain_threshold"] = domain_threshold
                L = stat_df.loc[0, "num_cov"]

                # try to get number of significant ECs in addition
                if valid_file(ec_file):
                    ecs = pd.read_csv(ec_file)
                    min_seq_dist = C["compare"]["min_sequence_distance"]
                    num_sig = len(
                        ecs.query(
                            "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY"
                        )
                    )
                    stat_df.loc[0, "num_significant"] = num_sig

                # try to get EC precision in addition
                if valid_file(ec_comp_file):
                    ec_comp = pd.read_csv(ec_comp_file)
                    stat_df.loc[0, "precision"] = ec_comp.iloc[L]["precision"]

                # finally, append to global table
                ali_table = ali_table.append(stat_df)

    # sort table by sequence search threshold
    ali_table = ali_table.sort_values(by="domain_threshold")

    # when saving files, have to acquire lock to make sure
    # jobs don't start overwriting results

    # make plots and save
    fig = _protein_monomer_plot(ali_table, data)
    plot_file = prefix + "_job_statistics_summary.pdf"
    lock_plot = filelock.FileLock(plot_file)
    with lock_plot:
        fig.savefig(plot_file, bbox_inches="tight")

    # save ali statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
def protein_complex(prefix, configs):
    """
    Create results summary for run using protein_complex pipeline
    """
    # TODO: this is only designed to work with skewnormal threshold
    MIN_PROBABILITY = 0.9

    # number of inter ECs to check for precision
    NUM_INTER = 5

    # TODO: create segments global variable and import
    FIRST_SEGMENT = "A_1"
    SECOND_SEGMENT = "B_1"

    ali_table = pd.DataFrame()
    prefix_to_cfgs = {}
    data = defaultdict(lambda: defaultdict())

    # go through all config files
    for cfg_file in configs:
        # check if the file exists and has contents
        # since run might not yet have finished or crashed
        if valid_file(cfg_file):
            # job input configuration
            C = read_config_file(cfg_file)
            sub_prefix = C["global"]["prefix"]
            sub_index = (sub_prefix)

            final_state_cfg = sub_prefix + FINAL_CONFIG_SUFFIX
            if not valid_file(final_state_cfg):
                continue

            # read final output state of job
            R = read_config_file(final_state_cfg)
            data[sub_index]["identities"] = R["identities_file"]
            data[sub_index]["frequencies"] = R["frequencies_file"]
            data[sub_index]["minimum_column_coverage"] = C["concatenate"][
                "minimum_column_coverage"
            ]

            stat_file = R["statistics_file"]
            ec_file = R.get("ec_file", "")
            ec_comp_file = R.get("ec_compared_longrange_file", "")
            concat_stat_file = R.get("concatentation_statistics_file", "")
            first_stat_file = R.get("first_statistics_file", "")
            second_stat_file = R.get("second_statistics_file", "")

            prefix_to_cfgs[(sub_prefix)] = (C, R)

            # read and modify alignment statistics
            if valid_file(stat_file):
                # get alignment stats for current job
                stat_df = pd.read_csv(stat_file)
                n_eff = R["effective_sequences"]

                if n_eff is not None:
                    stat_df.loc[0, "N_eff"] = n_eff

                L = stat_df.loc[0, "num_cov"]

                # try to get concatenation statistics in addition
                if valid_file(concat_stat_file):
                    concat_stat_df = pd.read_csv(concat_stat_file)

                    # get and save n sequences per monomer aln
                    n_seqs_1 = concat_stat_df.loc[0, "num_seqs_1"]
                    n_seqs_2 = concat_stat_df.loc[0, "num_seqs_2"]
                    stat_df.loc[0, "first_n_seqs"] = int(n_seqs_1)
                    stat_df.loc[0, "second_n_seqs"] = int(n_seqs_2)

                    # get and save median n paralogs per monomer aln
                    n_paralogs_1 = concat_stat_df.loc[
                        0, "median_num_per_species_1"
                    ]
                    n_paralogs_2 = concat_stat_df.loc[
                        0, "median_num_per_species_2"
                    ]
                    stat_df.loc[0, "median_num_per_species_1"] = n_paralogs_1
                    stat_df.loc[0, "median_num_per_species_2"] = n_paralogs_2

                # try to get number of significant ECs in addition
                if valid_file(ec_file):
                    ecs = pd.read_csv(ec_file)

                    # number of significant monomer ECs
                    min_seq_dist = C["compare"]["min_sequence_distance"]
                    num_sig = len(
                        ecs.query(
                            "abs(i-j) >= @min_seq_dist and probability >= @MIN_PROBABILITY"
                        )
                    )

                    # number of significant inter-protein ECs
                    num_sig_inter = len(
                        ecs.query(
                            "segment_i != segment_j and probability >= @MIN_PROBABILITY"
                        )
                    )
                    stat_df.loc[0, "num_significant"] = int(num_sig)

                    # rank of top inter contact
                    top_inter_rank = ecs.query(
                        "segment_i != segment_j"
                    ).index[0]
                    stat_df.loc[0, "top_inter_rank"] = int(top_inter_rank)

                # try to get EC precision in addition
                if valid_file(ec_comp_file):
                    ec_comp = pd.read_csv(ec_comp_file)
                    ec_comp_1 = ec_comp.query(
                        "segment_i == segment_j == @FIRST_SEGMENT"
                    )
                    ec_comp_2 = ec_comp.query(
                        "segment_i == segment_j == @SECOND_SEGMENT"
                    )
                    ec_comp_inter = ec_comp.query("segment_i != segment_j")

                    # use the monomer statistics files to figure out
                    # how many sites in each monomer
                    if valid_file(first_stat_file) and valid_file(second_stat_file):
                        stats_1 = pd.read_csv(first_stat_file)
                        L_1 = stats_1.loc[0, "num_cov"]

                        stats_2 = pd.read_csv(second_stat_file)
                        L_2 = stats_2.loc[0, "num_cov"]

                        # precision of monomer 1
                        stat_df.loc[
                            0, "first_monomer_precision"
                        ] = ec_comp_1.iloc[L_1]["segmentwise_precision"]

                        # precision of monomer 2
                        stat_df.loc[
                            0, "second_monomer_precision"
                        ] = ec_comp_2.iloc[L_2]["segmentwise_precision"]

                        # precision of top 5 inter
                        stat_df.loc[0, "inter_precision"] = ec_comp_inter.iloc[
                            NUM_INTER
                        ]["segmentwise_precision"]

                # finally, append to global table
                ali_table = ali_table.append(stat_df)

    # save ali statistics table
    table_file = prefix + "_job_statistics_summary.csv"
    lock_table = filelock.FileLock(table_file)
    with lock_table:
        ali_table.to_csv(table_file, index=False, float_format="%.3f")

    return ali_table
def jackhmmer_search(**kwargs):
    """
    Protocol:
    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "sequence_id", "sequence_file",
            "sequence_download_url", "region", "first_index",
            "use_bitscores", "domain_threshold", "sequence_threshold",
            "database", "iterations", "cpu", "nobias", "reuse_alignment",
            "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
            "extract_annotation"
        ]
    )
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"],
        kwargs["sequence_file"],
        kwargs["sequence_download_url"],
        full_sequence_file
    )

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(
        full_seq,
        kwargs["sequence_id"],
        kwargs["region"],
        kwargs["first_index"],
        target_sequence_file
    )

    # run jackhmmer... allow to reuse pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist",
            ali["alignment"], ali["domtblout"]
        )
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"],
            kwargs["sequence_threshold"],
            kwargs["domain_threshold"],
            len(cut_seq)
        )

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use /dev/null option of jackhmmer function
        # to make no assumption about operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment(
            "aa", kwargs["sequence_id"],
            region_start, region_end,
            range(region_start, region_end + 1)
        ).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(
        kwargs["sequence_id"], region_start, region_end
    )

    return outcfg
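# Illustrative input sketch for jackhmmer_search (values are hypothetical;
# the required keys mirror the check_required list above, and the database
# name must resolve via kwargs[kwargs["database"]]):
#
#     outcfg = jackhmmer_search(
#         prefix="output/RASH_HUMAN", sequence_id="RASH_HUMAN",
#         sequence_file=None, sequence_download_url="https://example.org/{}",
#         region=None, first_index=1, use_bitscores=True,
#         domain_threshold=0.5, sequence_threshold=0.5,
#         database="uniref100", uniref100="/path/to/uniref100.fasta",
#         iterations=5, cpu=2, nobias=False, reuse_alignment=False,
#         checkpoints_hmm=False, checkpoints_ali=False,
#         jackhmmer="jackhmmer", extract_annotation=True,
#     )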
def hmmbuild_and_search(**kwargs):
    """
    Protocol:
    Build HMM from sequence alignment using hmmbuild and
    search against a sequence database using hmmsearch.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists
        verify_resources(
            "Input alignment does not exist", input_alignment_file
        )

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file)
                )

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined"
            )

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id
                )
            )

        # identify what columns (non-gap) to keep for focus;
        # this should be all columns in the raw_focus_alignment_file,
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given."
            )

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)
        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the gap threshold for inclusion in HMMs built by hmmbuild
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(
        kwargs,
        [
            "prefix", "sequence_id", "alignment_file",
            "use_bitscores", "domain_threshold", "sequence_threshold",
            "database", "cpu", "nobias", "reuse_alignment",
            "hmmbuild", "hmmsearch"
        ]
    )
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow to reuse pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist",
            ali["alignment"], ali["domtblout"]
        )
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"],
            kwargs["sequence_threshold"],
            kwargs["domain_threshold"],
            sequence_length
        )

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # only item from hmmsearch_result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment(
            "aa", kwargs["sequence_id"],
            region_start, region_end,
            range(region_start, region_end + 1)
        ).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(
        kwargs["sequence_id"], region_start, region_end
    )

    return outcfg
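# Note (illustrative, values hypothetical): for a target "RASH_HUMAN" whose
# focus region spans positions 1-166, the code above would set
# outcfg["focus_sequence"] to "RASH_HUMAN/1-166", and outcfg["segments"]
# would contain a single "aa" Segment covering positions 1-166, serialized
# via Segment(...).to_list().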
def infer_plmc(**kwargs):
    """
    Run EC computation on alignment. This function contains
    the functionality shared between monomer and complex EC
    inference.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        raw_ec_file
        model_file
        num_sites
        num_valid_sequences
        effective_sequences
        focus_mode (passed through)
        focus_sequence (passed through)
        segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # the following are passed through stage...
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have protein alphabet, do not set
        # as plmc parameter since it is the default and
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )
    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_valid_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    return outcfg, ecs, segments
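# Worked example of the lambda_J_times_Lq scaling above (illustrative;
# assumes the default protein alphabet has 21 symbols including the gap
# character and that ignore_gaps is set, leaving num_symbols = 20): for a
# model with L = 200 uppercase positions, lambda_J is multiplied by
# (20 - 1) * (200 - 1) = 3781.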
def execute(**config):
    """
    Execute a pipeline configuration

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for an example
        of what this should look like)

    Returns
    -------
    global_state : dict
        Global output state of pipeline
    """
    check_required(config, ["pipeline", "stages", "global"])

    # check if valid pipeline was selected
    if config["pipeline"] not in PIPELINES:
        raise InvalidParameterError(
            "Not a valid pipeline selection. "
            "Valid choices are:\n{}".format(
                ", ".join(PIPELINES.keys())
            )
        )

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # get definition of selected pipeline
    pipeline = PIPELINES[config["pipeline"]]
    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this is the global state of results as
    # we move through different stages of
    # the pipeline
    global_state = config["global"]

    # keep track of how many stages are still
    # to be run, so we can leave out stages at
    # the end of workflow below
    num_stages_to_run = len(stages)

    # get job tracker
    tracker = get_result_tracker(config)

    # set job status to running and also initialize global state
    tracker.update(status=EStatus.RUN, results=global_state)

    # iterate through individual stages
    for (stage, runner, key_prefix) in pipeline:
        # check if anything else is left to
        # run, otherwise skip
        if num_stages_to_run == 0:
            break

        # check if config for stage is there
        check_required(config, [stage])

        # output files for stage into an individual folder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # config files for input and output of stage
        stage_incfg = "{}_{}.incfg".format(stage_prefix, stage)
        stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage)

        # update current stage of job
        tracker.update(stage=stage)

        # check if stage should be executed
        if stage in stages:
            # global state inserted at end, overrides any
            # stage-specific settings (except for custom prefix)
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state,
                "prefix": stage_prefix
            }

            # save input of stage in config file
            write_config_file(stage_incfg, incfg)

            # run stage
            outcfg = runner(**incfg)

            # prefix output keys if this parameter is
            # given in stage configuration, to avoid
            # name clashes if same protocol run multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            # save output of stage in config file
            write_config_file(stage_outcfg, outcfg)

            # one less stage to put through after we ran this...
            num_stages_to_run -= 1
        else:
            # skip stage by injecting state from previous run
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage),
                stage_outcfg
            )

            # read output configuration
            outcfg = read_config_file(stage_outcfg)

            # verify all the output files are there
            outfiles = [
                filepath for f, filepath in outcfg.items()
                if f.endswith("_file") and filepath is not None
            ]
            verify_resources(
                "Output files from stage '{}' missing".format(stage),
                *outfiles
            )

        # update global state with outputs of stage
        global_state = {**global_state, **outcfg}

        # update state in tracker accordingly
        tracker.update(results=outcfg)

    # create results archive
    archive_file = create_archive(config, global_state, prefix)

    # only store results archive if a result file was created
    if archive_file is not None:
        global_state["archive_file"] = archive_file

        # prepare update for tracker, but only store in last
        # go when job is set to done
        tracker_archive_update = {"archive_file": archive_file}
    else:
        tracker_archive_update = None

    # set job status to done and transfer archive if selected for syncing
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested;
    # tracker does not need to update here since it won't
    # sync entries of delete list in the first place
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file)
        )

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # If alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123): {}".format(region)
            )

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]

        # check if we have a predefined sequence database;
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and \
            kwargs.get("evalues", None) is not None:
        raise InvalidParameterError(
            "Cannot specify bitscore and E-value threshold at the same time."
        )

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [
                (float(t) if "." in t else int(t)) for t in T
            ]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds)
            )

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
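# Illustrative effect of supplying multiple thresholds (derived from the loop
# above; values are hypothetical): bitscores = "0.3,0.5" yields
#
#     config["align"]["use_bitscores"] = True
#     config["batch"] = {
#         "_b0.3": {"align": {"domain_threshold": 0.3, "sequence_threshold": 0.3}},
#         "_b0.5": {"align": {"domain_threshold": 0.5, "sequence_threshold": 0.5}},
#     }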
def standard(**kwargs):
    """
    Protocol:
    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance",
            # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have protein alphabet, do not set
        # as plmc parameter since it is the default and
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )
    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"),
            (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # load parameters
            c = CouplingsModel(outcfg["model_file"])

            # create JSON output and write to file
            f.write(
                evzoom_json(c) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
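# Illustrative shape of the outcfg returned by standard() for a hypothetical
# prefix "output/RASH_HUMAN" (values made up; exact keys depend on segments
# and min_sequence_distance):
#
#     {
#         "model_file": "output/RASH_HUMAN.model",
#         "raw_ec_file": "output/RASH_HUMAN_ECs.txt",
#         "ec_file": "output/RASH_HUMAN_CouplingScores.csv",
#         "ec_longrange_file": "output/RASH_HUMAN_CouplingScores_longrange.csv",
#         "enrichment_file": "output/RASH_HUMAN_enrichment.csv",
#         "evzoom_file": "output/RASH_HUMAN_evzoom.json",
#         "num_sites": 166,
#         "num_sequences": 5000,
#         "effective_sequences": 1234.5,
#         ...
#     }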