def run(**kwargs):
    """
    Python entry point mirroring the command line interface.

    Builds the job configuration from the given options, validates it,
    unrolls batch jobs and hands the resulting jobs to run_jobs().

    Parameters
    ----------
    kwargs
        See click.option decorators for app() function. The optional
        keys "yolo" (default False) and "workdir" (default None) are
        forwarded to run_jobs().
    """
    # merge command line options into the configuration
    config = substitute_config(**kwargs)

    # the configuration must at least define these sections
    check_required(config, ["pipeline", "stages", "global"])

    # make sure the global prefix is usable
    pipeline.verify_prefix(verify_subdir=False, **config)

    # when aligning without a subsequent couplings stage, switch on
    # N_eff computation in the alignment stage for convenience
    selected_stages = config["stages"]
    if "align" in selected_stages and "couplings" not in selected_stages:
        config["align"]["compute_num_effective_seqs"] = True

    # expand batch jobs into one configuration per pipeline job
    sub_configs = unroll_config(config)

    # launch computation for every unrolled configuration
    yolo = kwargs.get("yolo", False)
    workdir = kwargs.get("workdir")
    run_jobs(sub_configs, config, yolo, workdir)
def run(**kwargs):
    """
    Dispatch to the EC inference protocol selected by "protocol".

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: EC protocol to run (must be a key of PROTOCOLS)
        prefix: Output prefix for all generated files
            (not validated by this function)

    Returns
    -------
    outcfg : dict
        Output configuration of stage
        (see individual protocol for fields)

    Raises
    ------
    InvalidParameterError
        If the selected protocol is not a key of PROTOCOLS
    """
    check_required(kwargs, ["protocol"])

    selected = kwargs["protocol"]

    # happy path first: known protocol, call it with all arguments
    if selected in PROTOCOLS:
        return PROTOCOLS[selected](**kwargs)

    available = ", ".join(PROTOCOLS.keys())
    raise InvalidParameterError(
        "Invalid protocol selection: {}. Valid protocols are: {}".format(
            selected, available
        )
    )
def standard(**kwargs):
    """
    Protocol:
    Infer ECs from alignment using plmc. Use the complex protocol
    for heteromultimeric complexes instead.

    Parameters
    ----------
    Mandatory kwargs arguments:
        "prefix" and "min_sequence_distance" are checked here;
        for additional required parameters, see infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline: the configuration
        returned by infer_plmc() merged with the result of
        _postprocess_inference(). Documented fields:

        raw_ec_file
        model_file
        num_sites
        num_sequences
        effective_sequences

        focus_mode (passed through)
        focus_sequence (passed through)
        segments (passed through)
    """
    check_required(kwargs, ["prefix", "min_sequence_distance"])

    prefix = kwargs["prefix"]

    # run inference, then load the resulting model
    inferred_cfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(inferred_cfg["model_file"])

    # enrichment and line plots are only generated for the monomer
    # case (no segment information, or exactly one segment)
    single_segment = segments is None or len(segments) == 1

    # copy the inference output first, then layer the postprocessing
    # results on top (copy is taken before postprocessing is called)
    outcfg = dict(inferred_cfg)
    outcfg.update(
        _postprocess_inference(
            ecs, kwargs, model, inferred_cfg, prefix,
            generate_enrichment=single_segment,
            generate_line_plot=single_segment
        )
    )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
def run(**kwargs):
    """
    Select and execute an alignment concatenation protocol.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: concatenation protocol to run
            (must be a key of PROTOCOLS)
        prefix: Output prefix for all generated files
            (not validated by this function)

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage, as returned
        by the selected protocol. Documented result fields
        (in brackets: not mandatory):

        alignment_file
        raw_alignment_file
        focus_mode
        focus_sequence
        segments
        frequencies_file
        identities_file
        num_sequences
        num_sites
        raw_focus_alignment_file
        statistics_file

    Raises
    ------
    InvalidParameterError
        If the selected protocol is not a key of PROTOCOLS
    """
    check_required(kwargs, ["protocol"])

    protocol_name = kwargs["protocol"]

    if protocol_name not in PROTOCOLS:
        message = "Invalid protocol selection: {}. Valid protocols are: {}".format(
            protocol_name, ", ".join(PROTOCOLS.keys())
        )
        raise InvalidParameterError(message)

    protocol_func = PROTOCOLS[protocol_name]
    return protocol_func(**kwargs)
def run(**kwargs):
    """
    Entry point of the couplings stage: look up the requested
    inference protocol and run it on the given arguments.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: EC protocol to run (must be a key of PROTOCOLS)
        prefix: Output prefix for all generated files
            (not validated by this function)

    Returns
    -------
    outcfg : dict
        Output configuration of couplings stage, as returned by
        the selected protocol. Documented result fields
        (in brackets: not mandatory):

         ec_file
         effective_sequences
         [enrichment_file]
         focus_mode
         focus_sequence
         model_file
         num_sequences
         num_sites
         raw_ec_file
         region_start
         segments

    Raises
    ------
    InvalidParameterError
        If the selected protocol is not a key of PROTOCOLS
    """
    check_required(kwargs, ["protocol"])

    requested = kwargs["protocol"]
    is_known = requested in PROTOCOLS

    if not is_known:
        valid_names = ", ".join(PROTOCOLS.keys())
        raise InvalidParameterError(
            "Invalid protocol selection: "
            "{}. Valid protocols are: {}".format(requested, valid_names)
        )

    return PROTOCOLS[requested](**kwargs)
def run(**kwargs):
    """
    Generate a multiple sequence alignment from an input sequence
    by running the alignment protocol named in "protocol".

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: Alignment protocol to run
            (must be a key of PROTOCOLS)
        prefix: Output prefix for all generated files
            (not validated by this function)

    Returns
    -------
    dict
        Result of the selected protocol. Documented fields
        (in brackets - not returned by all protocols):

        * alignment_file
        * [raw_alignment_file]
        * statistics_file
        * target_sequence_file
        * sequence_file
        * [annotation_file]
        * frequencies_file
        * identities_file
        * [hittable_file]
        * focus_mode
        * focus_sequence
        * segments

    Raises
    ------
    InvalidParameterError
        If the selected protocol is not a key of PROTOCOLS
    """
    check_required(kwargs, ["protocol"])

    chosen = kwargs["protocol"]

    # guard clause: refuse protocols that are not registered
    if chosen not in PROTOCOLS:
        known = PROTOCOLS.keys()
        error_text = (
            "Invalid protocol selection: "
            + "{}. Valid protocols are: {}".format(chosen, ", ".join(known))
        )
        raise InvalidParameterError(error_text)

    align_protocol = PROTOCOLS[chosen]
    return align_protocol(**kwargs)
def run(**kwargs):
    """
    Run the alignment concatenation protocol chosen via "protocol".

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: concatenation protocol to run
            (must be a key of PROTOCOLS)
        prefix: Output prefix for all generated files
            (not validated by this function)

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage, as returned
        by the selected protocol. Documented result fields
        (in brackets: not mandatory):

        .. todo::

            to be finalized after implementing protocols

        * alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * num_sites
        * num_sequences

    Raises
    ------
    InvalidParameterError
        If the selected protocol is not a key of PROTOCOLS
    """
    check_required(kwargs, ["protocol"])

    name = kwargs["protocol"]

    if name in PROTOCOLS:
        concatenate = PROTOCOLS[name]
    else:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. ".format(name)
            + "Valid protocols are: {}".format(", ".join(PROTOCOLS.keys()))
        )

    return concatenate(**kwargs)
def execute(**config):
    """
    Execute a pipeline configuration

    Walks through the stages of the selected pipeline in their
    defined order. Stages listed in config["stages"] are run;
    other stages are skipped by loading the output configuration
    they wrote in an earlier run. Iteration stops once all
    requested stages have been run.

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for
        example of how this should look like)

    Returns
    -------
    global_state : dict
        Global output state of pipeline

    Raises
    ------
    InvalidParameterError
        If the pipeline name is not a key of PIPELINES, or if
        config["stages"] is None
    """
    check_required(config, ["pipeline", "stages", "global"])

    # reject unknown pipeline names up front
    pipeline_name = config["pipeline"]
    if pipeline_name not in PIPELINES:
        valid_choices = ", ".join(PIPELINES.keys())
        raise InvalidParameterError(
            "Not a valid pipeline selection. "
            "Valid choices are:\n{}".format(valid_choices)
        )

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # (stage name, runner function, key prefix) triples of the pipeline
    stage_definitions = PIPELINES[pipeline_name]

    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # results accumulated over all stages start from the global section
    global_state = config["global"]

    # countdown of requested stages, so trailing stages of the
    # workflow that were not requested can be left out
    remaining = len(stages)

    # job tracker: mark job as running and publish initial state
    tracker = get_result_tracker(config)
    tracker.update(status=EStatus.RUN, results=global_state)

    for stage, runner, key_prefix in stage_definitions:
        # nothing left to run, ignore the rest of the pipeline
        if remaining == 0:
            break

        # every visited stage needs its own config section
        check_required(config, [stage])

        # each stage writes into its own subfolder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # files holding the input and output configuration of the stage
        cfg_base = "{}_{}".format(stage_prefix, stage)
        stage_incfg = cfg_base + ".incfg"
        stage_outcfg = cfg_base + ".outcfg"

        tracker.update(stage=stage)

        if stage not in stages:
            # stage is skipped: inject its state from a previous run
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage, stage),
                stage_outcfg
            )

            outcfg = read_config_file(stage_outcfg)

            # all files recorded by the previous run must still exist
            outfiles = []
            for key, filepath in outcfg.items():
                if key.endswith("_file") and filepath is not None:
                    outfiles.append(filepath)

            verify_resources(
                "Output files from stage '{}' missing".format(stage),
                *outfiles
            )
        else:
            # later entries win: global state overrides stage-specific
            # settings, and the stage prefix overrides everything
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state,
                "prefix": stage_prefix
            }

            write_config_file(stage_incfg, incfg)

            outcfg = runner(**incfg)

            # prefix output keys if requested by the pipeline definition,
            # to avoid name clashes if same protocol runs multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            write_config_file(stage_outcfg, outcfg)

            remaining -= 1

        # fold stage outputs into the global state and report them
        global_state = {**global_state, **outcfg}
        tracker.update(results=outcfg)

    # create results archive; only record it if one was created
    archive_file = create_archive(config, global_state, prefix)

    tracker_archive_update = None
    if archive_file is not None:
        global_state["archive_file"] = archive_file
        # handed to the tracker together with the final status update
        tracker_archive_update = {"archive_file": archive_file}

    # set job status to done and pass on archive information
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested (tracker is not
    # updated again after this step)
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
def complex(**kwargs):
    """
    Protocol:
    Run a monomer alignment protocol and postprocess its result
    for EVcomplex calculations

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.
        When "alignment_protocol" is "existing",
        "override_annotation_file" is required as well.

    Returns
    -------
    outcfg : dict
        Output configuration of the alignment protocol,
        and the following additional field:

        genome_location_file : path to file containing
            the genomic locations for CDs's corresponding to
            identifiers in the alignment.

    Raises
    ------
    InvalidParameterError
        If "alignment_protocol" is not a key of PROTOCOLS
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_protocol",
            "uniprot_to_embl_table",
            "ena_genome_location_table"
        ]
    )

    uniprot_to_embl = kwargs["uniprot_to_embl_table"]
    ena_locations = kwargs["ena_genome_location_table"]

    verify_resources(
        "Uniprot to EMBL mapping table does not exist", uniprot_to_embl
    )
    verify_resources(
        "ENA genome location table does not exist", ena_locations
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # delegate to the regular alignment protocol (standard, existing, ...)
    alignment_protocol = kwargs["alignment_protocol"]
    if alignment_protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid choice for alignment protocol: {}".format(
                alignment_protocol
            )
        )

    monomer_protocol = PROTOCOLS[alignment_protocol]
    outcfg = monomer_protocol(**kwargs)

    # with the "existing" protocol, a user-supplied annotation file
    # replaces the annotation file produced by that protocol
    if alignment_protocol == "existing":
        check_required(kwargs, ["override_annotation_file"])
        override_file = kwargs["override_annotation_file"]

        if override_file is not None:
            verify_resources(
                "Override annotation file does not exist", override_file
            )

            outcfg["annotation_file"] = prefix + "_annotation.csv"
            pd.read_csv(override_file).to_csv(outcfg["annotation_file"])

    # map UniProt identifiers in the alignment to CDS identifiers
    cds_ids = extract_cds_ids(outcfg["alignment_file"], uniprot_to_embl)

    # look up genome locations for these CDSs in the ENA table
    genome_location_filename = prefix + "_genome_location.csv"

    locations = extract_embl_annotation(
        cds_ids, ena_locations, genome_location_filename
    )
    locations = add_full_header(locations, outcfg["alignment_file"])
    locations.to_csv(genome_location_filename)

    outcfg["genome_location_file"] = genome_location_filename

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_complex.outcfg", outcfg)

    return outcfg
def standard(**kwargs):
    """
    Protocol:
    Standard buildali4 workflow (run iterative jackhmmer search
    against sequence database, then determine which sequences and
    columns to include in the calculation based on coverage and
    maximum gap thresholds).

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required;
        all arguments are also passed on to jackhmmer_search()
        and modify_alignment()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline: the output of
        jackhmmer_search() merged with the output of
        modify_alignment(), plus "annotation_file".
        Documented fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * alignment_file
        * raw_alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * target_sequence_file
        * sequence_file
        * annotation_file (None if "extract_annotation" is false)
        * frequencies_file
        * identities_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "extract_annotation",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # first step of protocol is to get alignment using
    # jackhmmer; initialize output configuration with
    # results of this search
    jackhmmer_outcfg = jackhmmer_search(**kwargs)
    stockholm_file = jackhmmer_outcfg["raw_alignment_file"]

    # target sequence and region come from the first segment
    segment = Segment.from_list(jackhmmer_outcfg["segments"][0])
    target_seq_id = segment.sequence_id
    region_start = segment.region_start

    # read in stockholm format (with full annotation)
    with open(stockholm_file) as a:
        ali_raw = Alignment.from_file(a, "stockholm")

    # NOTE: the raw alignment is not additionally stored as FASTA,
    # since equivalent information can easily be obtained from
    # the Stockholm file

    # save annotation in sequence headers (species etc.);
    # annotation_file stays None if extraction is disabled, so the
    # output configuration below can always be assembled
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        annotation = extract_header_annotation(ali_raw)
        annotation.to_csv(annotation_file, index=False)

    # center alignment around focus/search sequence: keep only
    # columns in which the first sequence does not have a gap
    focus_cols = np.array([c != "-" for c in ali_raw[0]])
    focus_ali = ali_raw.select(columns=focus_cols)

    target_seq_index = 0
    mod_outcfg, ali = modify_alignment(
        focus_ali, target_seq_index, target_seq_id, region_start, **kwargs
    )

    # merge results of jackhmmer_search and modify_alignment stage
    outcfg = {
        **jackhmmer_outcfg,
        **mod_outcfg,
        "annotation_file": annotation_file
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_standard.outcfg", outcfg)

    # return results of protocol
    return outcfg
def standard(**kwargs):
    """
    Protocol:
    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * secondary_structure_pml_file
        * folded_structure_files
        * folding_ranking_file
        * folding_comparison_file (only if "remapped_pdb_files"
          is given and not empty)
        * folding_individual_comparison_files (same condition)

    Raises
    ------
    InvalidParameterError
        If "engine" is not "cns_dgsa"
    """
    # NOTE(review): "cns" and "maxcluster" are read further below
    # but are not part of this list
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query(
        "abs(i - j) > {}".format(kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get from EC values (could be misleading if
            # EC list is truncated, so only second option)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]

    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            # skip cutoffs that leave no ECs to fold with
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs, "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of EC counts to fold with
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory, after folding
    # final models will be moved to main folding dir. Depending
    # on cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    try:
        results = pool.starmap(folder, folding_runs)
    finally:
        # make double sure that the pool is cleaned up (also if a
        # folding job raised), or SIGTERM upon exit will interfere
        # with interrupt signal interception
        pool.close()
        pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for file_path in folded_files.values():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)

            # name the table after the experimental structure file
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map back to original key from remapped_pdb_files
            # as a key for this list
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
def complex(**kwargs):
    """
    Protocol:
    Compare ECs for a complex to 3D structure

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.
        The code additionally reads "segments", "raise_missing",
        "plot_highest_count", "first_alignment_file",
        "second_alignment_file", "first_raw_focus_alignment_file"
        and "second_raw_focus_alignment_file".

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, with the following
        fields (<m> stands for "first" and "second"; several
        entries are None or absent if no structures were found):

        * ec_compared_all_file
        * ec_compared_longrange_file
        * ec_compared_inter_file
        * distmap_inter
        * inter_contacts_file
        * <m>_pdb_structure_hits_file
        * <m>_pdb_structure_hits_unfiltered_file
        * <m>_distmap_monomer
        * <m>_distmap_multimer
        * <m>_monomer_contacts_file
        * <m>_multimer_contacts_file
        * <m>_remapped_pdb_files
        * ec_lines_compared_pml_file
        * complex_remapped_pdb_files
        * contact_map_files

    Raises
    ------
    InvalidParameterError
        If "segments" does not contain exactly two segments
    """
    check_required(
        kwargs,
        [
            "prefix", "ec_file", "min_sequence_distance",
            "pdb_mmtf_dir", "atom_filter",
            "first_compare_multimer", "second_compare_multimer",
            "distance_cutoff",
            "first_sequence_id", "second_sequence_id",
            "first_sequence_file", "second_sequence_file",
            "first_segments", "second_segments",
            "first_target_sequence_file", "second_target_sequence_file",
            "scale_sizes"
        ]
    )

    prefix = kwargs["prefix"]

    outcfg = {
        # initialize output EC files
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv",
        "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv",

        # initialize output inter distancemap files
        "distmap_inter": prefix + "_distmap_inter",
        "inter_contacts_file": prefix + "_inter_contacts_file"
    }

    # Add PDB comparison files for first and second monomer
    # NOTE(review): the "unfitered" spelling in the file name below is
    # kept as-is, since it determines the name of an output file
    for monomer_prefix in ["first", "second"]:
        outcfg = {
            **outcfg,
            monomer_prefix + "_pdb_structure_hits_file":
                "{}_{}_structure_hits.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_pdb_structure_hits_unfiltered_file":
                "{}_{}_structure_hits_unfitered.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_monomer":
                "{}_{}_distance_map_monomer".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_multimer":
                "{}_{}_distance_map_multimer".format(prefix, monomer_prefix),
        }

    # make sure EC file exists
    verify_resources(
        "EC file does not exist",
        kwargs["ec_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user),
    # with one subfolder per monomer
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    first_aux_prefix = insert_dir(aux_prefix, "first_monomer", rootname_subdir=False)
    create_prefix_folders(first_aux_prefix)

    second_aux_prefix = insert_dir(aux_prefix, "second_monomer", rootname_subdir=False)
    create_prefix_folders(second_aux_prefix)

    # Step 1: Identify 3D structures for comparison
    def _identify_monomer_structures(name_prefix, outcfg, aux_prefix):
        # create a dictionary with kwargs for just the current monomer
        # remove the "prefix" kwargs so that we can replace with the
        # aux prefix when calling _identify_structures
        # only replace first occurrence of name_prefix
        monomer_kwargs = {
            k.replace(name_prefix + "_", "", 1): v
            for k, v in kwargs.items() if "prefix" not in k
        }

        # this field needs to be set explicitly else it gets
        # overwritten by concatenated file
        monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"]
        monomer_kwargs["raw_focus_alignment_file"] = kwargs[
            name_prefix + "_raw_focus_alignment_file"
        ]

        # identify structures for that monomer
        sifts_map, sifts_map_full = _identify_structures(
            **monomer_kwargs,
            prefix=aux_prefix
        )

        # save selected PDB hits
        sifts_map.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_file"], index=False
        )

        # also save full list of hits
        sifts_map_full.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False
        )
        return outcfg, sifts_map

    outcfg, first_sifts_map = _identify_monomer_structures(
        "first", outcfg, first_aux_prefix
    )
    outcfg, second_sifts_map = _identify_monomer_structures(
        "second", outcfg, second_aux_prefix
    )

    # get the segment names from the kwargs
    segment_list = kwargs["segments"]

    # Make sure user provided exactly two segments
    if len(segment_list) != 2:
        raise InvalidParameterError(
            "Compare stage for protein complexes requires exactly two segments"
        )

    # first entry of each segment is used as its name
    first_segment_name = kwargs["segments"][0][0]
    second_segment_name = kwargs["segments"][1][0]

    # Step 2: Compute distance maps
    # (helper reads "structures" and "aux_prefix" from the enclosing
    # scope and records its output files in the enclosing outcfg)
    def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):
        # prepare a sequence map to remap the structures we have found
        verify_resources(
            "Target sequence file does not exist",
            kwargs[name_prefix + "_target_sequence_file"]
        )

        # create target sequence map for remapping structure
        with open(kwargs[name_prefix + "_target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        _, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # compute distance maps and save
        # (but only if we found some structure)
        if len(sifts_map.hits) > 0:
            d_intra = intra_dists(
                sifts_map, structures, atom_filter=kwargs["atom_filter"],
                output_prefix=aux_prefix + "_" + name_prefix + "_distmap_intra"
            )
            d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"])

            # save contacts to separate file
            outcfg[name_prefix + "_monomer_contacts_file"] = (
                prefix + "_" + name_prefix + "_contacts_monomer.csv"
            )
            d_intra.contacts(
                kwargs["distance_cutoff"]
            ).to_csv(
                outcfg[name_prefix + "_monomer_contacts_file"], index=False
            )

            # compute multimer distances, if requested;
            # note that d_multimer can be None if there
            # are no structures with multiple chains
            if kwargs[name_prefix + "_compare_multimer"]:
                d_multimer = multimer_dists(
                    sifts_map, structures, atom_filter=kwargs["atom_filter"],
                    output_prefix=aux_prefix + "_" + name_prefix + "_distmap_multimer"
                )
            else:
                d_multimer = None

            # if we have a multimer contact map, save it
            if d_multimer is not None:
                d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"])

                # same naming scheme as the monomer contacts file
                # ("_" between output prefix and monomer name)
                outcfg[name_prefix + "_multimer_contacts_file"] = (
                    prefix + "_" + name_prefix + "_contacts_multimer.csv"
                )

                # save contacts to separate file
                d_multimer.contacts(
                    kwargs["distance_cutoff"]
                ).to_csv(
                    outcfg[name_prefix + "_multimer_contacts_file"], index=False
                )
            else:
                outcfg[name_prefix + "_distmap_multimer"] = None

            # create remapped structures (e.g. for
            # later comparison of folding results)
            # remap structures, swap mapping index and filename in
            # dictionary so we have a list of files in the dict keys
            outcfg[name_prefix + "_remapped_pdb_files"] = {
                filename: mapping_index for mapping_index, filename in
                remap_chains(
                    sifts_map, aux_prefix, seqmap, chain_name=chain_name,
                    raise_missing=kwargs["raise_missing"]
                ).items()
            }

        else:
            # if no structures, cannot compute distance maps
            d_intra = None
            d_multimer = None
            outcfg[name_prefix + "_distmap_monomer"] = None
            outcfg[name_prefix + "_distmap_multimer"] = None
            # same key as in the branch above
            outcfg[name_prefix + "_remapped_pdb_files"] = None

        return d_intra, d_multimer, seqmap

    # load all structures for both monomers
    all_structures = set(first_sifts_map.hits.pdb_id).union(
        set(second_sifts_map.hits.pdb_id)
    )
    structures = load_structures(
        all_structures,
        kwargs["pdb_mmtf_dir"],
        raise_missing=False
    )

    d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps(
        first_sifts_map, "first", "A"
    )
    d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps(
        second_sifts_map, "second", "B"
    )

    # compute inter distance map if sifts map for each monomer exists
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        d_inter = inter_dists(
            first_sifts_map, second_sifts_map,
            raise_missing=kwargs["raise_missing"]
        )
        # if there were overlapping PDBs, save the results
        if d_inter is not None:
            d_inter.to_file(outcfg["distmap_inter"])

            # save contacts to separate file
            d_inter.contacts(
                kwargs["distance_cutoff"]
            ).to_csv(
                outcfg["inter_contacts_file"], index=False
            )
    else:
        outcfg["inter_contacts_file"] = None
        d_inter = None

    # Step 3: Compare ECs to distance maps
    ec_table = pd.read_csv(kwargs["ec_file"])

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:
        # compare ECs only if we have an intra distance map
        # for at least one monomer - inter can't exist unless
        # we have both monomers
        if (d_intra_i is not None) or (d_intra_j is not None):
            # compare distances individually for each segment pair
            ecs_intra_i = ec_table.query(
                "segment_i == segment_j == @first_segment_name"
            )
            if d_intra_i is not None:
                ecs_intra_i_compared = coupling_scores_compared(
                    ecs_intra_i, d_intra_i, d_multimer_i,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist
                )
            else:
                # If no distance map, the distance is saved as np.nan
                ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan)

            ecs_intra_j = ec_table.query(
                "segment_i == segment_j == @second_segment_name"
            )
            if d_intra_j is not None:
                ecs_intra_j_compared = coupling_scores_compared(
                    ecs_intra_j, d_intra_j, d_multimer_j,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist
                )
            else:
                ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan)

            ecs_inter = ec_table.query("segment_i != segment_j")
            if d_inter is not None:
                ecs_inter_compared = coupling_scores_compared(
                    ecs_inter, d_inter, dist_map_multimer=None,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=None  # does not apply for inter-protein ECs
                )
            else:
                ecs_inter_compared = ecs_inter.assign(dist=np.nan)

            # combine the tables
            ec_table_compared = pd.concat([
                ecs_inter_compared,
                ecs_intra_i_compared,
                ecs_intra_j_compared
            ])

            # rename the precision column to "segmentwise_precision"
            # because we calculated precision for each segment independently
            ec_table_compared = ec_table_compared.rename(
                columns={"precision": "segmentwise_precision"}
            )

            # TODO: change "cn" to "score" eventually
            ec_table_compared = ec_table_compared.sort_values("cn", ascending=False)

            # add the total precision
            # TODO: implement different cutoffs for intra vs inter contacts
            ec_table_compared = add_precision(
                ec_table_compared,
                dist_cutoff=kwargs["distance_cutoff"]
            )

            # save to file
            # all ecs
            ec_table_compared.to_csv(outcfg[out_file])

            # save the inter ECs to a file
            ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"])

    # create the inter-ecs line drawing script
    if outcfg["ec_compared_inter_file"] is not None and kwargs["plot_highest_count"] is not None:
        inter_ecs = ec_table.query("segment_i != segment_j")

        outcfg["ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml"

        pairs.ec_lines_pymol_script(
            inter_ecs.iloc[:kwargs["plot_highest_count"], :],
            outcfg["ec_lines_compared_pml_file"],
            distance_cutoff=kwargs["distance_cutoff"],
            chain={
                first_segment_name: "A",
                second_segment_name: "B"
            }
        )

    # Remap the complex crystal structures, if available
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        outcfg["complex_remapped_pdb_files"] = {
            filename: mapping_index for mapping_index, filename in
            remap_complex_chains(
                first_sifts_map, second_sifts_map,
                seqmap_i, seqmap_j, output_prefix=aux_prefix,
                raise_missing=kwargs["raise_missing"]
            ).items()
        }

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot
    outcfg["contact_map_files"] = _make_complex_contact_maps(
        ec_table, d_intra_i, d_multimer_i,
        d_intra_j, d_multimer_j,
        d_inter, first_segment_name,
        second_segment_name, **kwargs
    )

    return outcfg
def _identify_structures(**kwargs):
    """
    Identify set of 3D structures for comparison

    Parameters
    ----------
    **kwargs
        See check_required in code below

    Returns
    -------
    sifts_map : SIFTSResult
        Identified structures and residue index mappings, after
        applying the pdb_ids, max_num_hits and max_num_structures
        filters
    sifts_map_full : SIFTSResult
        Copy of the identified structures taken before any of
        these filters were applied

    Raises
    ------
    InvalidParameterError
        If by_alignment is set and pdb_alignment_method is
        neither "jackhmmer" nor "hmmsearch"
    """
    def _filter_by_id(x, id_list):
        # work on a copy so the object passed in keeps all of its hits
        x = deepcopy(x)
        x.hits = x.hits.loc[x.hits.pdb_id.isin(id_list)]
        return x

    check_required(
        kwargs,
        [
            "prefix", "pdb_ids", "compare_multimer",
            "max_num_hits", "max_num_structures",
            "pdb_mmtf_dir",
            "sifts_mapping_table", "sifts_sequence_db",
            "by_alignment", "pdb_alignment_method",
            "alignment_min_overlap",
            "sequence_id", "sequence_file", "region",
            "use_bitscores", "domain_threshold",
            "sequence_threshold"
        ]
    )

    # get SIFTS mapping object/sequence DB
    s = SIFTS(
        kwargs["sifts_mapping_table"],
        kwargs["sifts_sequence_db"]
    )

    reduce_chains = not kwargs["compare_multimer"]

    # determine if we need to find structures
    # by sequence search or just fetching
    # based on Uniprot/PDB identifier
    if kwargs["by_alignment"]:
        # if searching by alignment, verify that
        # user selected jackhmmer or hmmsearch
        SEARCH_METHODS = ["jackhmmer", "hmmsearch"]

        if kwargs["pdb_alignment_method"] not in SEARCH_METHODS:
            # report both the rejected value and the valid choices
            # (SEARCH_METHODS is a list, so it is joined directly)
            raise InvalidParameterError(
                "Invalid pdb search method: " +
                "{}. Valid selections are: {}".format(
                    kwargs["pdb_alignment_method"],
                    ", ".join(SEARCH_METHODS)
                )
            )

        sifts_map = s.by_alignment(
            reduce_chains=reduce_chains,
            min_overlap=kwargs["alignment_min_overlap"],
            **kwargs
        )
    else:
        sifts_map = s.by_uniprot_id(
            kwargs["sequence_id"], reduce_chains=reduce_chains
        )

    # keep the unfiltered result so it can be returned alongside
    # the filtered one
    sifts_map_full = deepcopy(sifts_map)

    # filter ID list down to manually selected PDB entries
    if kwargs["pdb_ids"] is not None:
        pdb_ids = kwargs["pdb_ids"]

        # make sure we have a list of PDB IDs
        if not isinstance(pdb_ids, list):
            pdb_ids = [pdb_ids]

        pdb_ids = [x.lower() for x in pdb_ids]

        sifts_map = _filter_by_id(sifts_map, pdb_ids)

    # limit number of hits and structures
    if kwargs["max_num_hits"] is not None:
        sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]]

    if kwargs["max_num_structures"] is not None:
        keep_ids = sifts_map.hits.pdb_id.unique()
        keep_ids = keep_ids[:kwargs["max_num_structures"]]
        sifts_map = _filter_by_id(sifts_map, keep_ids)

    return sifts_map, sifts_map_full
def _make_contact_maps(ec_table, d_intra, d_multimer, **kwargs):
    """
    Plot contact maps with all ECs above a certain probability threshold,
    or a given count of ECs

    Parameters
    ----------
    ec_table : pandas.DataFrame
        Full set of evolutionary couplings (all pairs)
    d_intra : DistanceMap
        Computed residue-residue distances inside chain
    d_multimer : DistanceMap
        Computed residue-residue distances between homomultimeric
        chains
    **kwargs
        Further plotting parameters, see check_required in code
        for necessary values.

    Returns
    -------
    cm_files : list(str)
        Paths of generated contact map files
    """
    def plot_cm(ecs, output_file=None):
        """
        Simple wrapper for contact map plotting
        """
        with misc.plot_context("Arial"):
            fig = plt.figure(figsize=(8, 8))
            try:
                if kwargs["scale_sizes"]:
                    # scale marker size relative to the strongest
                    # EC in this set; copy so the caller's table
                    # is not modified
                    ecs = ecs.copy()
                    ecs.loc[:, "size"] = ecs.cn.values / ecs.cn.max()

                pairs.plot_contact_map(
                    ecs, d_intra, d_multimer,
                    distance_cutoff=kwargs["distance_cutoff"],
                    show_secstruct=kwargs["draw_secondary_structure"],
                    margin=5,
                    boundaries=kwargs["boundaries"]
                )

                plt.suptitle(
                    "{} evolutionary couplings".format(len(ecs)),
                    fontsize=14
                )

                if output_file is not None:
                    plt.savefig(output_file, bbox_inches="tight")
            finally:
                # always release the figure, also if plotting or
                # saving fails, so open figures do not accumulate
                # over the many plots created below
                plt.close(fig)

    # "scale_sizes" and "distance_cutoff" are read inside plot_cm,
    # so they are verified up front together with the other parameters
    check_required(
        kwargs,
        [
            "prefix", "min_sequence_distance",
            "plot_probability_cutoffs",
            "boundaries", "plot_lowest_count",
            "plot_highest_count", "plot_increase",
            "draw_secondary_structure",
            "scale_sizes", "distance_cutoff"
        ]
    )

    prefix = kwargs["prefix"]

    cm_files = []

    # only pairs at least min_sequence_distance apart are plotted
    ecs_longrange = ec_table.query(
        "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
    )

    # based on significance cutoff
    if kwargs["plot_probability_cutoffs"]:
        cutoffs = kwargs["plot_probability_cutoffs"]
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            ec_set = ecs_longrange.query("probability >= @c")
            # only can plot if we have any significant ECs above threshold
            if len(ec_set) > 0:
                output_file = prefix + "_significant_ECs_{}.pdf".format(c)
                plot_cm(ec_set, output_file=output_file)
                cm_files.append(output_file)

    # based on number of long-range ECs

    # identify number of sites in EC model
    num_sites = len(
        set.union(set(ec_table.i.unique()), set(ec_table.j.unique()))
    )

    # transform fraction of number of sites into discrete number of ECs
    def _discrete_count(x):
        if isinstance(x, float):
            x = ceil(x * num_sites)
        return int(x)

    # range of plots to make
    lowest = _discrete_count(kwargs["plot_lowest_count"])
    highest = _discrete_count(kwargs["plot_highest_count"])
    step = _discrete_count(kwargs["plot_increase"])

    # create individual plots
    for c in range(lowest, highest + 1, step):
        ec_set = ecs_longrange.iloc[:c]
        output_file = prefix + "_{}_ECs.pdf".format(c)
        plot_cm(ec_set, output_file=output_file)
        cm_files.append(output_file)

    # give back list of all contact map file names
    return cm_files
def jackhmmer_search(**kwargs):
    """
    Protocol:
    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    required_params = [
        "prefix", "sequence_id", "sequence_file",
        "sequence_download_url", "region", "first_index",
        "use_bitscores", "domain_threshold", "sequence_threshold",
        "database", "iterations", "cpu", "nobias", "reuse_alignment",
        "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
        "extract_annotation"
    ]
    check_required(kwargs, required_params)

    prefix = kwargs["prefix"]
    seq_id = kwargs["sequence_id"]

    # output directory has to exist before anything is written
    create_prefix_folders(prefix)

    # file locations for the cut (target) and the full sequence
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # obtain the full search sequence; only the sequence itself
    # is needed from the returned values
    _, (_, full_seq) = fetch_sequence(
        seq_id,
        kwargs["sequence_file"],
        kwargs["sequence_download_url"],
        full_sequence_file
    )

    # restrict to the target region and store the result in
    # target_sequence_file (the sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(
        full_seq,
        seq_id,
        kwargs["region"],
        kwargs["first_index"],
        target_sequence_file
    )

    # results of an earlier search are recorded in this file, which
    # allows reusing a pre-existing Stockholm alignment
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # a rerun can only be skipped if previous results were stored
    reuse_previous = kwargs["reuse_alignment"] and valid_file(ali_outcfg_file)

    if not reuse_previous:
        # adapt search thresholds so they are suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"],
            kwargs["sequence_threshold"],
            kwargs["domain_threshold"],
            len(cut_seq)
        )

        # run the actual search
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # drop the huge stdout log right away (best effort; the
        # /dev/null option of the jackhmmer function is avoided to
        # make no assumption about the operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # a plain dictionary is nicer to store and restart from
        # than the namedtuple
        ali = dict(ali._asdict())

        # remember the search results for a possible restart
        write_config_file(ali_outcfg_file, ali)
    else:
        ali = read_config_file(ali_outcfg_file)

        # the alignment files referenced there must exist as well
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist",
            ali["alignment"], ali["domtblout"]
        )

    # result files plus a single protein segment that is defined
    # by the target sequence region
    outcfg = {
        "sequence_id": seq_id,
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
        "segments": [
            Segment(
                "aa", seq_id,
                region_start, region_end,
                range(region_start, region_end + 1)
            ).to_list()
        ],
        "focus_sequence": f"{seq_id}/{region_start}-{region_end}",
    }

    return outcfg
def existing(**kwargs):
    """
    Protocol:

    Use external sequence alignment and extract all relevant
    information from there (e.g. sequence, region, etc.),
    then apply gap & fragment filtering as usual

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * sequence_file
        * first_index
        * target_sequence_file
        * annotation_file (only set if annotation was extracted)
        * frequencies_file
        * identities_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "input_alignment",
            "sequence_id", "first_index",
            "extract_annotation"
        ]
    )

    prefix = kwargs["prefix"]

    # output directory has to be there before writing anything
    create_prefix_folders(prefix)

    # the input alignment is the starting point of the pipeline,
    # so make sure it is actually available
    ali_path = kwargs["input_alignment"]
    verify_resources("Input alignment does not exist", ali_path)

    # autodetect the alignment format, then parse the file with it
    with open(ali_path) as handle:
        ali_format = detect_format(handle)

    if ali_format is None:
        raise InvalidParameterError(
            "Format of input alignment {} could not be "
            "automatically detected.".format(ali_path)
        )

    with open(ali_path) as handle:
        ali_raw = Alignment.from_file(handle, ali_format)

    # store annotation found in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        extract_header_annotation(
            ali_raw, from_annotation=(ali_format == "stockholm")
        ).to_csv(annotation_file, index=False)

    # target sequence of the alignment has to be specified
    sequence_id = kwargs["sequence_id"]
    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # locate the first sequence whose identifier starts with the
    # target sequence ID
    focus_index = next(
        (
            idx for idx, seq_header in enumerate(ali_raw.ids)
            if seq_header.startswith(sequence_id)
        ),
        None
    )

    # without a focus sequence there is no way to continue
    if focus_index is None:
        raise InvalidParameterError(
            f"Target sequence {sequence_id} could not be found in alignment"
        )

    # columns to keep are those where the focus sequence has no gap
    gap_symbols = (ali_raw._match_gap, ali_raw._insert_gap)
    focus_cols = np.array(
        [symbol not in gap_symbols for symbol in ali_raw[focus_index]]
    )

    # reduce alignment to the focus columns
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # region of the sequence: first_index always takes precedence,
    # otherwise fall back to what the sequence header provides
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]
    id_, region_start, region_end = parse_header(focus_id)

    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information "
            "from sequence header {} "
            "and first_index parameter is not given.".format(
                full_focus_header
            )
        )

    # rebuild the full sequence ID from identifier and region
    header = f"{id_}/{region_start}-{region_end}"
    focus_ali.ids[focus_index] = header

    # store the target sequence
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as handle:
        write_fasta([(header, focus_seq_nogap)], handle)

    # sequence identity, fragment and gap filtering
    # (the processed alignment object itself is not needed here)
    mod_outcfg, _ = modify_alignment(
        focus_ali, focus_index, id_, region_start, **kwargs
    )

    # output configuration of the protocol
    outcfg = {
        **mod_outcfg,
        "sequence_id": sequence_id,
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "target_sequence_file": target_sequence_file,
        "focus_sequence": header,
        "focus_mode": True,
    }

    if annotation_file is not None:
        outcfg["annotation_file"] = annotation_file

    # write config to YAML file for debugging/logging
    write_config_file(prefix + ".align_existing.outcfg", outcfg)

    return outcfg
def secondary_structure(**kwargs):
    """
    Predict or load secondary structure for an
    input sequence

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    residues : pandas.DataFrame
        Table with sequence and secondary structure
        in columns i, A_i and sec_struct_3state
    """
    check_required(
        kwargs,
        [
            "prefix", "target_sequence_file",
            "segments", "sec_struct_method",
            "sec_struct_file", "psipred",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    # a precomputed prediction file takes precedence over
    # running any prediction
    secstruct_file = kwargs["sec_struct_file"]
    if secstruct_file is not None:
        verify_resources(
            "Secondary structure prediction file does not exist/is empty",
            secstruct_file
        )
        return pd.read_csv(secstruct_file)

    # prediction needs the target sequence file
    target_seq_file = kwargs["target_sequence_file"]
    verify_resources(
        "Sequence file does not exist/is empty", target_seq_file
    )

    # index of the first residue of the target sequence: taken from
    # the first segment if segments are given, otherwise parsed from
    # the header of the sequence file
    segments = kwargs["segments"]
    if segments is not None:
        first_index = Segment.from_list(segments[0]).region_start
    else:
        first_index = None

        with open(target_seq_file) as handle:
            header, _ = next(read_fasta(handle))

        if header is not None:
            _, first_index, _ = parse_header(header)

        # no guessing if the header does not give the first index
        if first_index is None:
            raise InvalidParameterError(
                "Could not unambiguously identify sequence range from "
                "FASTA header, needs to specified as id/start-end: {}".format(
                    header
                )
            )

    # psipred is the only implemented prediction method
    method = kwargs["sec_struct_method"]
    if method != "psipred":
        raise InvalidParameterError(
            "Secondary structure prediction method not implemented: "
            "{}. Valid choices: psipred".format(method)
        )

    # psipred output goes into a separate directory next to the prefix
    output_dir = path.join(path.dirname(prefix), "psipred")

    # run psipred; only the horiz file is parsed afterwards
    _, horiz_file = run_psipred(
        target_seq_file, output_dir, binary=kwargs["psipred"]
    )

    # parse prediction and renumber it to the first index
    return read_psipred_prediction(horiz_file, first_index=first_index)
def best_hit(**kwargs):
    """
    Protocol:

    Concatenate alignments based on the best hit
    to the focus sequence in each species

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        alignment_file
        raw_alignment_file
        focus_mode
        focus_sequence
        segments
        frequencies_file
        identities_file
        num_sequences
        num_sites
        raw_focus_alignment_file
        statistics_file
    """
    # "first_region_start" is passed on to modify_alignment() below;
    # requiring it here reports a missing value before any alignment
    # is computed or written
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_region_start",
            "first_segments", "second_segments",
            "first_identities_file", "second_identities_file",
            "first_annotation_file", "second_annotation_file",
            "use_best_reciprocal", "paralog_identity_threshold"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    def _load_monomer_info(annotations_file, identities_file,
                           target_sequence, alignment_file,
                           use_best_reciprocal, identity_threshold):
        # table of the best hit per organism for one monomer alignment,
        # optionally filtered by best reciprocal hits

        # read in annotation to a file and rename the appropriate column
        annotation_table = read_species_annotation_table(annotations_file)

        # read identity file
        similarities = pd.read_csv(identities_file)

        # create a pd.DataFrame containing the best hit in each organism
        most_similar_in_species = most_similar_by_organism(
            similarities, annotation_table
        )

        if use_best_reciprocal:
            paralogs = find_paralogs(
                target_sequence, annotation_table, similarities,
                identity_threshold
            )

            most_similar_in_species = filter_best_reciprocal(
                alignment_file, paralogs, most_similar_in_species
            )

        return most_similar_in_species

    # load the information about each monomer alignment
    most_similar_in_species_1 = _load_monomer_info(
        kwargs["first_annotation_file"],
        kwargs["first_identities_file"],
        kwargs["first_focus_sequence"],
        kwargs["first_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    most_similar_in_species_2 = _load_monomer_info(
        kwargs["second_annotation_file"],
        kwargs["second_identities_file"],
        kwargs["second_focus_sequence"],
        kwargs["second_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    # merge the two dataframes to get all species found in
    # both alignments
    species_intersection = most_similar_in_species_1.merge(
        most_similar_in_species_2,
        how="inner",  # takes the intersection
        on="species",  # merges on species identifiers
        suffixes=("_1", "_2")
    )

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            species_intersection,
            kwargs["first_alignment_file"],
            kwargs["second_alignment_file"],
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    # filter the alignment (the processed alignment object itself
    # is not needed here)
    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    return outcfg
def genome_distance(**kwargs):
    """
    Protocol:

    Concatenate alignments based on genomic distance

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file
        * distance_plot_file
    """
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_region_start", "second_region_start",
            "first_segments", "second_segments",
            "genome_distance_threshold",
            "first_genome_location_file",
            "second_genome_location_file",
            "first_annotation_file",
            "second_annotation_file"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    verify_resources(
        "Genome location file does not exist",
        kwargs["first_genome_location_file"],
        kwargs["second_genome_location_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load the information for each monomer alignment
    alignment_1 = kwargs["first_alignment_file"]
    alignment_2 = kwargs["second_alignment_file"]

    genome_location_filename_1 = kwargs["first_genome_location_file"]
    genome_location_filename_2 = kwargs["second_genome_location_file"]

    gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0)
    gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0)

    # find all possible matches
    possible_partners = find_possible_partners(
        gene_location_table_1, gene_location_table_2
    )

    # find the best reciprocal matches
    id_pairing_unfiltered = best_reciprocal_matching(possible_partners)

    # filter best reciprocal matches by genome distance threshold
    # (a threshold of None or 0 disables the filter)
    if kwargs["genome_distance_threshold"]:
        distance_threshold = kwargs["genome_distance_threshold"]
        id_pairing = id_pairing_unfiltered.query(
            "distance < @distance_threshold"
        )
    else:
        id_pairing = id_pairing_unfiltered

    # add the id_1/id_2 columns on a new table rather than assigning
    # in place: id_pairing is either the result of a query or the
    # unfiltered table itself, so in-place assignment would be chained
    # assignment or would modify the table that is plotted below
    id_pairing = id_pairing.assign(
        id_1=id_pairing["uniprot_id_1"],
        id_2=id_pairing["uniprot_id_2"]
    )

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            id_pairing,
            alignment_1,
            alignment_2,
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    # filter the alignment (the processed alignment object itself
    # is not needed here)
    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    # plot the genome distance distribution of all best reciprocal
    # matches (before applying the distance threshold)
    outcfg["distance_plot_file"] = prefix + "_distplot.pdf"
    plot_distance_distribution(
        id_pairing_unfiltered, outcfg["distance_plot_file"]
    )

    return outcfg
def genome_distance(**kwargs):
    """
    Protocol:

    Concatenate alignments based on genomic distance

    .. note::

        The concatenation itself is not implemented yet (see TODO
        in the code); only the merged segments are filled in.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    .. todo::

        Explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, currently with
        the following fields:

        * alignment_file (None until concatenation is implemented)
        * focus_mode
        * focus_sequence (None until concatenation is implemented)
        * segments
        * num_sites (None)
        * num_sequences (None)

    .. todo::

        Decide which of the fields normally returned by an alignment
        protocol to add (e.g. raw_alignment_file, statistics_file,
        target_sequence_file, sequence_file, annotation_file,
        frequencies_file, identities_file, hittable_file).
    """
    # the alignment files are read by verify_resources() below, so
    # they are required here as well to report them as missing
    # parameters rather than through a KeyError
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file",
            "second_alignment_file",
            "first_raw_focus_alignment_file",
            "second_raw_focus_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_segments", "second_segments",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # -------------------------------------------------
    # TODO: implement concatenation functionality and
    # postprocessing functionality here
    # -------------------------------------------------

    def _modify_segments(seg_list, seg_prefix):
        # extract segments from list representation into objects
        segs = [Segment.from_list(s) for s in seg_list]

        # update segment IDs (numbered from 1 within each monomer)
        for i, s in enumerate(segs, start=1):
            s.segment_id = "{}_{}".format(seg_prefix, i)

        return segs

    # merge segments - this allows to have more than one segment per
    # "monomer" alignment
    segments_1 = _modify_segments(kwargs["first_segments"], "A")
    segments_2 = _modify_segments(kwargs["second_segments"], "B")
    segments_complex = segments_1 + segments_2

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = {
        "alignment_file": None,  # TODO: specify
        "focus_mode": True,
        "focus_sequence": None,  # TODO: specify
        "segments": [s.to_list() for s in segments_complex],
        # optional but good to have:
        "num_sites": None,
        "num_sequences": None,
        # "effective_sequences": n_eff  # TODO: could compute this like in align stage
        # TODO: there are more outputs that we could add here (not mandatory),
        # e.g. single column frequencies in concatenated alignment
    }

    return outcfg
def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, **kwargs):
    """
    Apply pairwise identity filtering, fragment filtering, and exclusion
    of columns with too many gaps to a sequence alignment. Also generates
    files describing properties of the alignment such as frequency
    distributions, conservation, and "old-style" alignment statistics files.

    .. note::

        assumes focus alignment (otherwise unprocessed) as input.

    .. todo::

        come up with something more clever to filter fragments than
        fixed width (e.g. use 95% quantile of length distribution as
        reference point)

    Parameters
    ----------
    focus_ali : Alignment
        Focus-mode input alignment
    target_seq_index : int
        Index of target sequence in alignment
    target_seq_id : str
        Identifier of target sequence (without range)
    region_start : int
        Index of first sequence position in target sequence
    kwargs : See required arguments in source code

    Returns
    -------
    outcfg : Dict
        File products generated by the function:

        * alignment_file
        * statistics_file
        * frequencies_file
        * identities_file
        * raw_focus_alignment_file
    ali : Alignment
        Final processed alignment
    """
    def _as_fraction(threshold):
        # integer thresholds are interpreted as percentages
        if isinstance(threshold, int):
            return threshold / 100
        return threshold

    check_required(
        kwargs,
        [
            "prefix", "seqid_filter", "hhfilter",
            "minimum_sequence_coverage", "minimum_column_coverage",
            "compute_num_effective_seqs", "theta",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    raw_focus_file = prefix + "_raw_focus.fasta"

    outcfg = {
        "alignment_file": prefix + ".a2m",
        "statistics_file": prefix + "_alignment_statistics.csv",
        "frequencies_file": prefix + "_frequencies.csv",
        "identities_file": prefix + "_identities.csv",
        "raw_focus_alignment_file": raw_focus_file,
    }

    # the target sequence has to come first in the alignment; this
    # matters in particular for the hhfilter run below, which might
    # otherwise filter out the target sequence
    if target_seq_index != 0:
        order = np.arange(0, len(focus_ali))
        order[0], order[target_seq_index] = target_seq_index, 0
        focus_ali = focus_ali.select(sequences=order)
        target_seq_index = 0

    with open(raw_focus_file, "w") as handle:
        focus_ali.write(handle, "fasta")

    # pairwise identity filter (using hhfilter)
    seqid_threshold = kwargs["seqid_filter"]
    if seqid_threshold is not None:
        a3m_file = prefix + "_filtered.a3m"

        at.run_hhfilter(
            raw_focus_file, a3m_file,
            threshold=seqid_threshold,
            columns="first", binary=kwargs["hhfilter"]
        )

        with open(a3m_file) as handle:
            focus_ali = Alignment.from_file(handle, "a3m")

        # final FASTA alignment before applying A2M format modifications
        with open(prefix + "_raw_focus_filtered.fasta", "w") as handle:
            focus_ali.write(handle, "fasta")

    ali = focus_ali

    # fragment filter: keep sequences with sufficient coverage
    # TODO: come up with something more clever here than fixed width
    # (e.g. use 95% quantile of length distribution as reference point)
    min_seq_cov = kwargs["minimum_sequence_coverage"]
    if min_seq_cov is not None:
        min_seq_cov = _as_fraction(min_seq_cov)
        sufficient_cov = (1 - ali.count("-", axis="seq")) >= min_seq_cov
        ali = ali.select(sequences=sufficient_cov)

    # Frequencies, conservation and identity to query are calculated
    # on the final alignment (except for lowercase modification).
    # Note: running hhfilter might cause a loss of the target sequence
    # if it is not the first sequence in the file! To be sure that
    # nothing goes wrong, target_seq_index should always be 0.
    identities = describe_seq_identities(
        ali, target_seq_index=target_seq_index
    )
    identities.to_csv(
        outcfg["identities_file"], float_format="%.3f", index=False
    )

    frequencies = describe_frequencies(
        ali, region_start, target_seq_index=target_seq_index
    )
    frequencies.to_csv(
        outcfg["frequencies_file"], float_format="%.3f", index=False
    )

    coverage_stats = describe_coverage(
        ali, prefix, region_start, kwargs["minimum_column_coverage"]
    )

    # list of uppercase sequence positions in alignment
    pos_list = np.arange(region_start, region_start + ali.L, dtype="int32")

    # columns with too many gaps are made lowercase
    lc_cols = None
    min_col_cov = kwargs["minimum_column_coverage"]
    if min_col_cov is not None:
        min_col_cov = _as_fraction(min_col_cov)
        lc_cols = ali.count(ali._match_gap, axis="pos") > 1 - min_col_cov
        ali = ali.lowercase_columns(lc_cols)

        # removed columns have to be taken out of the position list
        pos_list = pos_list[~lc_cols]

    # effective number of sequences (intended for cases where the
    # couplings stage is not run, but this number is wanted nonetheless)
    n_eff = None
    if kwargs["compute_num_effective_seqs"]:
        # only the columns that would be used for model inference
        # enter the computation of N_eff
        cut_ali = ali if lc_cols is None else ali.select(columns=~lc_cols)

        # sequence weights; N_eff := sum of all sequence weights
        cut_ali.set_weights(kwargs["theta"])
        n_eff = float(cut_ali.weights.sum())

        # patch into coverage statistics (N_eff column)
        coverage_stats.loc[:, "N_eff"] = n_eff

    # save coverage statistics to file
    coverage_stats.to_csv(
        outcfg["statistics_file"], float_format="%.3f", index=False
    )

    # description of the final sequence alignment
    # (note these parameters will be updated by couplings protocol)
    outcfg["num_sites"] = len(pos_list)
    outcfg["num_sequences"] = len(ali)
    outcfg["effective_sequences"] = n_eff
    outcfg["region_start"] = region_start

    # single segment spanning the target sequence
    outcfg["segments"] = [
        Segment(
            "aa", target_seq_id,
            region_start, region_start + ali.L - 1,
            pos_list
        ).to_list()
    ]

    with open(outcfg["alignment_file"], "w") as handle:
        ali.write(handle, "fasta")

    return outcfg, ali
def complex(**kwargs):
    """
    Protocol:

    Infer ECs for protein complexes from alignment using plmc.
    Allows user to select scoring protocol.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required
        and infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline: all fields
        returned by infer_plmc() and _postprocess_inference(),
        plus the following field:

        * inter_ec_file

    Raises
    ------
    InvalidParameterError
        If scoring_model is not one of SCORING_MODELS
    """
    # for additional required parameters, see infer_plmc()
    check_required(
        kwargs,
        [
            "prefix", "min_sequence_distance",
            "scoring_model", "use_all_ecs_for_scoring",
        ]
    )

    prefix = kwargs["prefix"]

    # validate scoring model up front, so that an invalid
    # setting does not cost a full (expensive) plmc run first
    scoring_model = kwargs["scoring_model"]
    if scoring_model not in SCORING_MODELS:
        raise InvalidParameterError(
            "Invalid scoring_model parameter: " +
            "{}. Valid options are: {}".format(
                scoring_model, ", ".join(SCORING_MODELS)
            )
        )

    # infer ECs and load them
    outcfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(outcfg["model_file"])

    # following computations are mostly specific to complex pipeline

    # add mixture model probability (None is treated as False)
    use_all_ecs = kwargs["use_all_ecs_for_scoring"]
    if use_all_ecs is None:
        use_all_ecs = False

    ecs = complex_probability(ecs, scoring_model, use_all_ecs)

    # also create line-drawing script (for multiple chains)
    # by convention, we map first segment to chain A,
    # second to B, a.s.f.
    chain_mapping = dict(
        zip(
            [s.segment_id for s in segments],
            string.ascii_uppercase,
        )
    )

    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_line_plot=True,
            generate_enrichment=False,
            ec_filter="segment_i != segment_j or abs(i - j) >= {}",
            chain=chain_mapping
        )
    }

    # save just the inter protein ECs
    # TODO: eventually have this accomplished by _postprocess_inference
    # right now avoiding a second call with a different ec_filter
    ecs = pd.read_csv(outcfg["ec_file"])
    outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv"
    inter_ecs = ecs.query("segment_i != segment_j")
    inter_ecs.to_csv(outcfg["inter_ec_file"], index=False)

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_complex.outcfg", outcfg)

    # TODO: make the following complex-ready
    # EC enrichment:
    #
    # 1) think about making EC enrichment complex-ready and add
    # it back here - so far it only makes sense if all ECs are
    # on one segment
    #
    # EVzoom:
    #
    # 1) at the moment, EVzoom will use numbering before remapping
    # we should eventually get this to a point where segments + residue
    # index are displayed on EVzoom
    #
    # 2) note that this will currently use the default mixture model
    # selection for determining the EC cutoff, rather than the selection
    # used for the EC table above

    return outcfg
def _make_contact_maps(ec_table, d_intra, d_multimer, sifts_map, **kwargs):
    """
    Plot contact maps with all ECs above a certain probability threshold,
    or a given count of ECs

    Parameters
    ----------
    ec_table : pandas.DataFrame
        Full set of evolutionary couplings (all pairs)
    d_intra : DistanceMap
        Computed residue-residue distances inside chain
    d_multimer : DistanceMap
        Computed residue-residue distances between homomultimeric chains
    sifts_map : object with ``hits`` attribute, or None
        Structure hits handed to print_pdb_structure_info(). If None
        or if it has no hits, no PDB information is printed on the plot.
    **kwargs
        Further plotting parameters, see check_required in code
        for necessary values.

    Returns
    -------
    cm_files : list(str)
        Paths of generated contact map files
    """
    def plot_cm(ecs, output_file=None):
        """
        Simple wrapper for contact map plotting
        """
        with misc.plot_context("Arial"):
            fig = plt.figure(figsize=(10, 10))

            # make sure the figure is released even if plotting
            # or saving fails, so figures do not pile up in memory
            try:
                if kwargs["scale_sizes"]:
                    ecs = ecs.copy()
                    ecs.loc[:, "size"] = ecs.score.values / ecs.score.max()
                    # avoid negative sizes
                    ecs.loc[ecs["size"] < 0, "size"] = 0

                # draw PDB structure and alignment/EC coverage information
                # on contact map if selected
                # (for now, not a required parameter, default to True)
                if kwargs.get("draw_coverage", True):
                    additional_plot_kwargs = {
                        "show_structure_coverage": True,
                        "margin": 0,
                        "ec_coverage": ec_table,
                    }
                else:
                    additional_plot_kwargs = {
                        "show_structure_coverage": False,
                        "margin": 5,
                        "ec_coverage": None,
                    }

                pairs.plot_contact_map(
                    ecs, d_intra, d_multimer,
                    distance_cutoff=kwargs["distance_cutoff"],
                    show_secstruct=kwargs["draw_secondary_structure"],
                    boundaries=kwargs["boundaries"],
                    **additional_plot_kwargs
                )

                # print PDB information if selected as parameter
                # (for now, not a required parameter, default to True)
                if (kwargs.get("print_pdb_information", True)
                        and sifts_map is not None
                        and len(sifts_map.hits) > 0):
                    print_pdb_structure_info(
                        sifts_map,
                        ax=plt.gca(),
                        header_text="PDB structures:",
                    )

                plt.suptitle(
                    "{} evolutionary couplings".format(len(ecs)),
                    fontsize=14
                )

                if output_file is not None:
                    plt.savefig(output_file, bbox_inches="tight")
            finally:
                plt.close(fig)

    # TODO: eventually add draw_coverage and print_pdb_information
    # as required parameters (used above in plot_cm())
    check_required(kwargs, [
        "prefix", "min_sequence_distance", "plot_probability_cutoffs",
        "boundaries", "plot_lowest_count", "plot_highest_count",
        "plot_increase", "draw_secondary_structure"
    ])

    prefix = kwargs["prefix"]
    cm_files = []

    ecs_longrange = ec_table.query(
        "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
    )

    # based on significance cutoff
    if kwargs["plot_probability_cutoffs"]:
        cutoffs = kwargs["plot_probability_cutoffs"]
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            ec_set = ecs_longrange.query("probability >= @c")

            # only can plot if we have any significant ECs above threshold
            if len(ec_set) > 0:
                output_file = prefix + "_significant_ECs_{}.pdf".format(c)
                plot_cm(ec_set, output_file=output_file)
                cm_files.append(output_file)

    # based on number of long-range ECs

    # identify number of sites in EC model
    num_sites = len(
        set.union(set(ec_table.i.unique()), set(ec_table.j.unique()))
    )

    # transform fraction of number of sites into discrete number of ECs
    def _discrete_count(x):
        if isinstance(x, float):
            x = ceil(x * num_sites)
        return int(x)

    # range of plots to make
    lowest = _discrete_count(kwargs["plot_lowest_count"])
    highest = _discrete_count(kwargs["plot_highest_count"])
    step = _discrete_count(kwargs["plot_increase"])

    # create individual plots
    for c in range(lowest, highest + 1, step):
        ec_set = ecs_longrange.iloc[:c]
        output_file = prefix + "_{}_ECs.pdf".format(c)
        plot_cm(ec_set, output_file=output_file)
        cm_files.append(output_file)

    # give back list of all contact map file names
    return cm_files
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields (merged with the fields returned
        by _postprocess_inference()):

        * raw_ec_file
        * ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences
        * region_start

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)

    Raises
    ------
    InvalidParameterError
        If focus_mode is not enabled
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance",  # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to not save the model is disabled (no "save_model"
    # parameter for now), the model file is always written
    model_file = prefix + ".model"

    outcfg = {
        "model_file": model_file,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        alignment_file
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"],
        sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
def _make_complex_contact_maps(ec_table, d_intra_i, d_multimer_i,
                               d_intra_j, d_multimer_j,
                               d_inter, first_segment_name,
                               second_segment_name, **kwargs):
    """
    Plot contact maps with all ECs above a certain probability threshold,
    or a given count of ECs

    Parameters
    ----------
    ec_table : pandas.DataFrame
        Full set of evolutionary couplings (all pairs)
    d_intra_i, d_intra_j: DistanceMap
        Computed residue-residue distances within chains for
        monomers i and j
    d_multimer_i, d_multimer_j : DistanceMap
        Computed residue-residue distances between homomultimeric
        chains for monomers i and j
    d_inter: DistanceMap
        Computed residue-residue distances between heteromultimeric
        chains i and j
    first_segment_name, second_segment_name: str
        Name of segment i and segment j in the ec_table
    **kwargs
        Further plotting parameters, see check_required in code
        for necessary values.

    Returns
    -------
    cm_files : list(str)
        Paths of generated contact map files
    """
    def plot_complex_cm(ecs_i, ecs_j, ecs_inter,
                        first_segment_name,
                        second_segment_name, output_file=None):
        """
        Simple wrapper for contact map plotting

        Returns True if a plot was created, False if there
        was not enough information to draw one
        """
        with misc.plot_context("Arial"):
            if kwargs["scale_sizes"]:
                # to scale sizes, combine all ecs to rescale together
                ecs = pd.concat([ecs_i, ecs_j, ecs_inter])
                ecs.loc[:, "size"] = ecs.cn.values / ecs.cn.max()

                # split back into three separate DataFrames
                # (segment names are resolved by query() from the
                # local variables of this function)
                ecs_i = ecs.query(
                    "segment_i == segment_j == @first_segment_name")
                ecs_j = ecs.query(
                    "segment_i == segment_j == @second_segment_name")
                ecs_inter = ecs.query("segment_i != segment_j")

                # if any of these groups are empty, replace with None
                if len(ecs_i) == 0:
                    ecs_i = None
                if len(ecs_j) == 0:
                    ecs_j = None
                if len(ecs_inter) == 0:
                    ecs_inter = None

            # Currently, we require at least one of the monomer
            # to have either ECs or distances in order to make a plot
            no_info_i = (
                (ecs_i is None or ecs_i.empty)
                and d_intra_i is None and d_multimer_i is None
            )
            # note: monomer j must be checked against its own
            # multimer distance map (d_multimer_j)
            no_info_j = (
                (ecs_j is None or ecs_j.empty)
                and d_intra_j is None and d_multimer_j is None
            )
            if no_info_i or no_info_j:
                return False

            fig = plt.figure(figsize=(8, 8))

            # release figure even if plotting or saving fails
            try:
                # create the contact map
                pairs.complex_contact_map(
                    ecs_i, ecs_j, ecs_inter,
                    d_intra_i, d_multimer_i,
                    d_intra_j, d_multimer_j,
                    d_inter,
                    margin=5,
                    boundaries=kwargs["boundaries"],
                    scale_sizes=kwargs["scale_sizes"]
                )

                # Add title to the plot
                ec_len = 0 if ecs_inter is None else len(ecs_inter)
                plt.suptitle(
                    "{} inter-molecule evolutionary couplings".format(ec_len),
                    fontsize=14
                )

                # save to output
                if output_file is not None:
                    plt.savefig(output_file, bbox_inches="tight")
            finally:
                plt.close(fig)

            return True

    check_required(kwargs, [
        "prefix", "min_sequence_distance", "plot_probability_cutoffs",
        "boundaries", "draw_secondary_structure", "plot_lowest_count",
        "plot_highest_count", "plot_increase", "scale_sizes"
    ])

    prefix = kwargs["prefix"]
    cm_files = []

    ecs_longrange = ec_table.query(
        "abs(i - j) >= {} or segment_i != segment_j".format(
            kwargs["min_sequence_distance"])
    )

    # create plots based on significance cutoff
    if kwargs["plot_probability_cutoffs"]:
        cutoffs = kwargs["plot_probability_cutoffs"]
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            ec_set = ecs_longrange.query("probability >= @c")

            # only can plot if we have any significant ECs above threshold
            if len(ec_set) > 0:
                ec_set_i = ec_set.query(
                    "segment_i == segment_j == @first_segment_name")
                ec_set_j = ec_set.query(
                    "segment_i == segment_j == @second_segment_name")
                ec_set_inter = ec_set.query("segment_i != segment_j")

                output_file = prefix + "_significant_ECs_{}.pdf".format(c)
                plot_completed = plot_complex_cm(
                    ec_set_i, ec_set_j, ec_set_inter,
                    first_segment_name, second_segment_name,
                    output_file=output_file
                )

                if plot_completed:
                    cm_files.append(output_file)

    # transform fraction of number of sites into discrete number of ECs
    def _discrete_count(x):
        if isinstance(x, float):
            num_sites = 0
            for seg_name in [first_segment_name, second_segment_name]:
                num_sites += len(
                    set.union(
                        set(
                            ec_table.query(
                                "segment_i == @seg_name").i.unique()),
                        set(
                            ec_table.query(
                                "segment_j == @seg_name").j.unique())
                    )
                )
            x = ceil(x * num_sites)

        return int(x)

    # range of plots to make
    lowest = _discrete_count(kwargs["plot_lowest_count"])
    highest = _discrete_count(kwargs["plot_highest_count"])
    step = _discrete_count(kwargs["plot_increase"])

    # row positions (not index labels) of all inter-protein ECs
    # in the long-range table; ecs_longrange is a filtered view of
    # ec_table, so its index labels cannot be used with iloc
    is_inter = (
        ecs_longrange.segment_i != ecs_longrange.segment_j
    ).values
    inter_positions = is_inter.nonzero()[0]

    for c in range(lowest, highest + 1, step):
        # get the inter ECs to plot
        selected = inter_positions[0:c]

        # if there are no inter ecs to be plotted, continue
        if len(selected) == 0:
            continue

        ec_set_inter = ecs_longrange.iloc[selected]

        # position of the lowest plotted inter EC
        last_inter_pos = selected[-1]

        # take all intra-protein ECs that are listed before (i.e. score
        # higher than) the lowest plotted inter-protein EC
        preceding = ecs_longrange.iloc[0:last_inter_pos]
        ec_set_i = preceding.query(
            "segment_i == segment_j == @first_segment_name")
        ec_set_j = preceding.query(
            "segment_i == segment_j == @second_segment_name")

        output_file = prefix + "_{}_ECs.pdf".format(c)
        plot_completed = plot_complex_cm(
            ec_set_i, ec_set_j, ec_set_inter,
            first_segment_name, second_segment_name,
            output_file=output_file
        )

        if plot_completed:
            cm_files.append(output_file)

    # give back list of all contact map file names
    return cm_files
def infer_plmc(**kwargs):
    """
    Run EC computation on alignment. This function contains
    the functionality shared between monomer and complex EC
    inference.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences
        * region_start

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    ecs : pandas.DataFrame
        Table of ECs read from raw_ec_file (mapped to segment
        numbering if segments are given)
    segments : list or None
        Segment objects created from the "segments" parameter
        (None if that parameter is None)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters", "lambda_J_times_Lq",
            "cpu", "plmc", "reuse_ecs",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # the following are passed through stage...
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have protein alphabet, do not set
        # as plmc parameter since default parameter,
        # has some implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        # (only the first sequence in the alignment is inspected)
        with open(kwargs["alignment_file"]) as f:
            _, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-exisiting results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )
    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_valid_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    return outcfg, ecs, segments
def standard(**kwargs):
    """
    Protocol:
    Compare ECs for single proteins (or domains)
    to 3D structure information

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields (entries that depend on 3D
        structures are None or absent if none were found):

        * ec_compared_all_file
        * ec_compared_longrange_file
        * pdb_structure_hits_file
        * pdb_structure_hits_unfiltered_file
        * distmap_monomer
        * distmap_multimer
        * monomer_contacts_file
        * multimer_contacts_file
        * remapped_pdb_files
        * ec_lines_compared_pml_file
        * contact_map_files
    """
    check_required(kwargs, [
        "prefix", "ec_file", "min_sequence_distance",
        "pdb_mmtf_dir", "atom_filter", "compare_multimer",
        "distance_cutoff", "target_sequence_file",
        "scale_sizes",
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file":
            prefix + "_CouplingScoresCompared_longrange.csv",
        "pdb_structure_hits_file": prefix + "_structure_hits.csv",
        "pdb_structure_hits_unfiltered_file":
            prefix + "_structure_hits_unfiltered.csv",
        # cannot have the distmap files end with "_file" because there are
        # two files (.npy and .csv), which would cause problems with automatic
        # checking if those files exist
        "distmap_monomer": prefix + "_distance_map_monomer",
        "distmap_multimer": prefix + "_distance_map_multimer",
    }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    # Step 1: Identify 3D structures for comparison
    sifts_map, sifts_map_full = _identify_structures(
        **{
            **kwargs,
            "prefix": aux_prefix,
        }
    )

    # save selected PDB hits
    sifts_map.hits.to_csv(outcfg["pdb_structure_hits_file"], index=False)

    # also save full list of hits
    sifts_map_full.hits.to_csv(
        outcfg["pdb_structure_hits_unfiltered_file"], index=False
    )

    # Step 2: Compute distance maps

    # load all structures at once
    structures = load_structures(
        sifts_map.hits.pdb_id, kwargs["pdb_mmtf_dir"], raise_missing=False
    )

    # compute distance maps and save
    # (but only if we found some structure)
    if len(sifts_map.hits) > 0:
        d_intra = intra_dists(
            sifts_map, structures,
            atom_filter=kwargs["atom_filter"],
            output_prefix=aux_prefix + "_distmap_intra"
        )
        d_intra.to_file(outcfg["distmap_monomer"])

        # save contacts to separate file
        outcfg["monomer_contacts_file"] = prefix + "_contacts_monomer.csv"
        d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
            outcfg["monomer_contacts_file"], index=False
        )

        # compute multimer distances, if requested;
        # note that d_multimer can be None if there
        # are no structures with multiple chains
        if kwargs["compare_multimer"]:
            d_multimer = multimer_dists(
                sifts_map, structures,
                atom_filter=kwargs["atom_filter"],
                output_prefix=aux_prefix + "_distmap_multimer"
            )
        else:
            d_multimer = None

        # if we have a multimer contact map in the end, save it
        if d_multimer is not None:
            d_multimer.to_file(outcfg["distmap_multimer"])
            outcfg["multimer_contacts_file"] = (
                prefix + "_contacts_multimer.csv"
            )

            # save contacts to separate file
            d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["multimer_contacts_file"], index=False
            )
        else:
            outcfg["distmap_multimer"] = None

        # at this point, also create remapped structures (e.g. for
        # later comparison of folding results)
        verify_resources(
            "Target sequence file does not exist",
            kwargs["target_sequence_file"]
        )

        # create target sequence map for remapping structure
        with open(kwargs["target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # remap structures, swap mapping index and filename in
        # dictionary so we have a list of files in the dict keys
        outcfg["remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_chains(
                sifts_map, aux_prefix, seqmap
            ).items()
        }
    else:
        # if no structures, can not compute distance maps
        d_intra = None
        d_multimer = None
        outcfg["distmap_monomer"] = None
        outcfg["distmap_multimer"] = None
        outcfg["remapped_pdb_files"] = None

    # Step 3: Compare ECs to distance maps
    ec_table = pd.read_csv(kwargs["ec_file"])

    # identify number of sites in EC model
    num_sites = len(
        set.union(set(ec_table.i.unique()), set(ec_table.j.unique()))
    )

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:
        # compare ECs only if we minimally have intra distance map
        if d_intra is not None:
            coupling_scores_compared(
                ec_table, d_intra, d_multimer,
                dist_cutoff=kwargs["distance_cutoff"],
                output_file=outcfg[out_file],
                min_sequence_dist=min_seq_dist
            )
        else:
            outcfg[out_file] = None

    # also create line-drawing script if we made the csv
    if outcfg["ec_compared_longrange_file"] is not None:
        ecs_longrange = pd.read_csv(outcfg["ec_compared_longrange_file"])

        outcfg["ec_lines_compared_pml_file"] = (
            prefix + "_draw_ec_lines_compared.pml"
        )
        pairs.ec_lines_pymol_script(
            ecs_longrange.iloc[:num_sites, :],
            outcfg["ec_lines_compared_pml_file"],
            distance_cutoff=kwargs["distance_cutoff"]
        )

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot
    # (sifts_map is a required positional argument of
    # _make_contact_maps, used there to print PDB hit information)
    outcfg["contact_map_files"] = _make_contact_maps(
        ec_table, d_intra, d_multimer, sifts_map, **kwargs
    )

    return outcfg
def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
    """
    Identify homologs using jackhmmer or hmmbuild/hmmsearch

    Parameters
    ----------
    pdb_alignment_method : {"jackhmmer", "hmmsearch"},
            optional (default: "jackhmmer")
        Sequence alignment method used for searching the PDB
    **kwargs
        Passed into jackhmmer / hmmbuild_and_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits

    Raises
    ------
    InvalidParameterError
        If pdb_alignment_method is neither "jackhmmer"
        nor "hmmsearch"
    """
    # load default configuration
    config = parse_config(HMMER_CONFIG)

    # update with overrides from kwargs
    config = {
        **config,
        **kwargs,
    }

    # create temporary output if no prefix is given
    # (a missing key is treated the same way as an explicit None)
    if config.get("prefix") is None:
        config["prefix"] = path.join(tempdir(), "compare")

    check_required(config, ["prefix"])

    # run hmmsearch (possibly preceded by hmmbuild)
    if pdb_alignment_method == "hmmsearch":
        # set up config to run hmmbuild_and_search on the
        # unfiltered alignment file
        updated_config = deepcopy(config)
        updated_config["alignment_file"] = config.get(
            "raw_focus_alignment_file"
        )
        ar = hmmbuild_and_search(**updated_config)

        # For hmmbuild and search, we have to read the raw focus
        # alignment file to guarantee that the query sequence is present
        with open(ar["raw_focus_alignment_file"]) as a:
            ali = Alignment.from_file(a, "fasta")

    # run jackhmmer against sequence database
    elif pdb_alignment_method == "jackhmmer":
        ar = jackhmmer_search(**config)

        with open(ar["raw_alignment_file"]) as a:
            ali = Alignment.from_file(a, "stockholm")

        # write alignment as FASTA file for easier checking by hand,
        # if necessary
        with open(config["prefix"] + "_raw.fasta", "w") as f:
            ali.write(f)
    else:
        raise InvalidParameterError(
            "Invalid pdb_alignment_method selected. Valid options are: " +
            ", ".join(["jackhmmer", "hmmsearch"])
        )

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(ar["hittable_file"])

    # target names are split on "|" and the second and third
    # fields are used as accession and entry name
    hits["uniprot_ac"] = hits["target_name"].map(
        lambda x: x.split("|")[1]
    )
    hits["uniprot_id"] = hits["target_name"].map(
        lambda x: x.split("|")[2]
    )

    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
            "hmm_from": "hmm_start",
            "hmm_to": "hmm_end",
        }
    )

    # replace whole columns (instead of setting values through
    # .loc[:, col], which may write in place and keep the previous
    # dtype) so that the integer conversion actually takes effect
    hits["alignment_start"] = pd.to_numeric(
        hits.alignment_start
    ).astype(int)
    hits["alignment_end"] = pd.to_numeric(
        hits.alignment_end
    ).astype(int)

    hits["alignment_id"] = (
        hits.target_name + "/" +
        hits.alignment_start.astype(str) + "-" +
        hits.alignment_end.astype(str)
    )

    hits = hits.loc[:, [
        "alignment_id", "uniprot_ac", "uniprot_id", "alignment_start",
        "alignment_end", "bitscore", "e_value"
    ]]

    return ali, hits
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and
    search against a sequence database using hmmsearch.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * first_index
        * input_raw_focus_alignment
        * raw_alignment_file
        * raw_focus_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        """
        Extract the focus alignment for the target sequence from the
        input alignment and write it (target sequence first) together
        with the target sequence itself to files under prefix.

        Returns the focus alignment file, the target sequence file
        and the first and last index of the target sequence region.
        """
        # this file is starting point of pipeline;
        # check if input alignment actually exists
        verify_resources(
            "Input alignment does not exist", input_alignment_file
        )

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            ali_format = detect_format(f)
            if ali_format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file)
                )

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, ali_format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined"
            )

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id)
            )

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given (optional parameter,
        # so an absent key is treated like None)
        first_index = kwargs.get("first_index")
        if first_index is not None:
            region_start = first_index
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given."
            )

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        # (prefix is taken from the enclosing function)
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the gap threshold for inclusion in HMM's build by HMMbuild.
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(kwargs, [
        "prefix", "sequence_id", "alignment_file", "use_bitscores",
        "domain_threshold", "sequence_threshold", "database", "cpu",
        "nobias", "reuse_alignment", "hmmbuild", "hmmsearch"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow to reuse pre-exisiting
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine if to rerun, only possible if previous results
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"]
        )
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], sequence_length
        )

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        # (the "database" parameter holds the name of the kwargs
        # entry that contains the actual database setting)
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        # (best effort, a failure to delete is not an error)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # only item from hmmbuild_result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment(
            "aa", kwargs["sequence_id"],
            region_start, region_end,
            range(region_start, region_end + 1)
        ).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(
        kwargs["sequence_id"], region_start, region_end
    )

    return outcfg
def standard(**kwargs):
    """
    Protocol:
    Predict single-mutation effects with the epistatic (couplings)
    model and with the independent model derived from it, and write
    the resulting tables, plots and PyMOL scripts to files.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.
        The value of mutation_dataset_file may be None, in which
        case no experimental dataset is predicted.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * mutation_matrix_plot_files
        * mutations_epistatic_pml_files
        * [mutation_dataset_predicted_file]
    """
    check_required(
        kwargs,
        [
            "prefix", "model_file",
            "mutation_dataset_file",
        ]
    )

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources(
        "Model parameter file does not exist",
        kwargs["model_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load couplings object, and create independent model
    c = CouplingsModel(kwargs["model_file"])
    c0 = c.to_independent_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent")]:
        filename = prefix + "_{}_model".format(type_.lower())

        # interactive plot using bokeh
        html_file = filename + ".html"
        output_file(html_file, "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh"
        )
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(html_file)

        # static matplotlib plot; savefig writes the current figure,
        # which is closed afterwards (even if saving fails) so that
        # figures do not pile up over repeated protocol runs
        pdf_file = filename + ".pdf"
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        try:
            plt.savefig(pdf_file, bbox_inches="tight")
        finally:
            plt.close()
        outcfg["mutation_matrix_plot_files"].append(pdf_file)

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(
        c, output_column="prediction_epistatic"
    )
    singles = predict_mutation_table(
        c0, singles, "prediction_independent"
    )
    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts (the output key lists the scripts of both models;
    # its name is kept unchanged so existing consumers keep working)
    outcfg["mutations_epistatic_pml_files"] = []
    for model_name in ["epistatic", "independent"]:
        pml_filename = prefix + "_{}_model.pml".format(model_name)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename,
            effect_column="prediction_" + model_name
        )
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#")

        # add epistatic model prediction
        data_pred = predict_mutation_table(
            c, data, "prediction_epistatic"
        )

        # add independent model prediction
        data_pred = predict_mutation_table(
            c0, data_pred, "prediction_independent"
        )

        outcfg["mutation_dataset_predicted_file"] = (
            prefix + "_dataset_predicted.csv"
        )
        data_pred.to_csv(
            outcfg["mutation_dataset_predicted_file"], index=False
        )

    return outcfg