def run(**kwargs): """ Run embedding protocol Parameters ---------- kwargs arguments (* denotes optional): sequences_file: Where sequences live prefix: Output prefix for all generated files protocol: Which embedder to use mapping_file: the mapping file generated by the pipeline when remapping indexes stage_name: The stage name Returns ------- Dictionary with results of stage """ check_required( kwargs, [ "protocol", "prefix", "stage_name", "remapped_sequences_file", "mapping_file" ], ) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: {}. Valid protocols are: {}".format( kwargs["protocol"], ", ".join(PROTOCOLS.keys()))) embedder_class = PROTOCOLS[kwargs["protocol"]] if embedder_class == UniRepEmbedder and kwargs.get("use_cpu") is not None: raise InvalidParameterError( "UniRep does not support configuring `use_cpu`") result_kwargs = deepcopy(kwargs) # Download necessary files if needed # noinspection PyProtectedMember for file in embedder_class._necessary_files: if not result_kwargs.get(file): result_kwargs[file] = get_model_file(model=embedder_class.name, file=file) # noinspection PyProtectedMember for directory in embedder_class._necessary_directories: if not result_kwargs.get(directory): result_kwargs[directory] = get_model_directories_from_zip( model=embedder_class.name, directory=directory) result_kwargs.setdefault("max_amino_acids", DEFAULT_MAX_AMINO_ACIDS[kwargs["protocol"]]) file_manager = get_file_manager(**kwargs) embedder: EmbedderInterface = embedder_class(**result_kwargs) _check_transform_embeddings_function(embedder, result_kwargs) return embed_and_write_batched(embedder, file_manager, result_kwargs, kwargs.get("half_precision", False))
def run(**kwargs): """ Run project protocol Parameters ---------- kwargs arguments (* denotes optional): reduced_embeddings_file: Where per-protein embeddings live prefix: Output prefix for all generated files stage_name: The stage name protocol: Which projection technique to use mapping_file: the mapping file generated by the pipeline when remapping indexes Returns ------- Dictionary with results of stage """ check_required(kwargs, [ 'protocol', 'prefix', 'stage_name', 'reduced_embeddings_file', 'mapping_file' ]) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format(kwargs["protocol"], ", ".join( PROTOCOLS.keys()))) return PROTOCOLS[kwargs["protocol"]](**kwargs)
def run(**kwargs): """ Run visualize protocol Parameters ---------- kwargs arguments (* denotes optional): projected_embeddings_file: A csv with columns: (index), original_id, x, y, z prefix: Output prefix for all generated files stage_name: The stage name protocol: Which plot to generate Returns ------- Dictionary with results of stage """ check_required(kwargs, ['protocol', 'prefix', 'stage_name', 'projected_embeddings_file']) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format( kwargs["protocol"], ", ".join(PROTOCOLS.keys()) ) ) return PROTOCOLS[kwargs["protocol"]](**kwargs)
def run(**kwargs): """ Run visualize protocol Parameters ---------- kwargs arguments (* denotes optional): projected_reduced_embeddings_file: A csv with columns: (index), original_id, x, y, z prefix: Output prefix for all generated files stage_name: The stage name protocol: Which plot to generate For plotly: projected_reduced_embeddings_file: The projected (dimensionality reduced) embeddings, normally coming from the project stage annotation_file: csv file with annotations display_unknown: Hide proteins for which there is no annotation in the annotation file (only relevant if annotation file is provided) merge_via_index: Set to True if in annotation_file identifiers correspond to sequence MD5 hashes n_components: 2D vs 3D plot For plot_mutagenesis: residue_probabilities_file: The csv with the probabilities, normally coming from the mutagenesis stage Returns ------- Dictionary with results of stage """ check_required(kwargs, ['protocol', 'prefix', 'stage_name']) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format(kwargs["protocol"], ", ".join( PROTOCOLS.keys()))) result_kwargs = deepcopy(kwargs) if kwargs["protocol"] == "plotly": # Support legacy projected_embeddings_file projected_reduced_embeddings_file = ( kwargs.get("projected_reduced_embeddings_file") or kwargs.get("projected_embeddings_file")) if not projected_reduced_embeddings_file: raise InvalidParameterError( f"You need to provide either projected_reduced_embeddings_file or projected_embeddings_file or " f"reduced_embeddings_file for {kwargs['protocol']}") result_kwargs[ "projected_reduced_embeddings_file"] = projected_reduced_embeddings_file return PROTOCOLS[kwargs["protocol"]](result_kwargs)
def run(**kwargs): """ Run project protocol Parameters ---------- kwargs arguments (* denotes optional): projected_reduced_embeddings_file or projected_embeddings_file or reduced_embeddings_file: Where per-protein embeddings live prefix: Output prefix for all generated files stage_name: The stage name protocol: Which projection technique to use mapping_file: the mapping file generated by the pipeline when remapping indexes Returns ------- Dictionary with results of stage """ check_required(kwargs, ["protocol", "prefix", "stage_name", "mapping_file"]) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format( kwargs["protocol"], ", ".join(PROTOCOLS.keys()) ) ) result_kwargs = deepcopy(kwargs) # We want to allow chaining protocols, e.g. first tucker than umap, # so we need to allow projected embeddings as input embeddings_input_file = ( kwargs.get("projected_reduced_embeddings_file") or kwargs.get("projected_embeddings_file") or kwargs.get("reduced_embeddings_file") ) if not embeddings_input_file: raise InvalidParameterError( f"You need to provide either projected_reduced_embeddings_file or projected_embeddings_file or " f"reduced_embeddings_file for {kwargs['protocol']}" ) result_kwargs["reduced_embeddings_file"] = embeddings_input_file file_manager = get_file_manager(**kwargs) result_kwargs = PROTOCOLS[kwargs["protocol"]](file_manager, result_kwargs) return result_kwargs
def run(**kwargs): """ Run visualize protocol Parameters ---------- kwargs arguments (* denotes optional): projected_reduced_embeddings_file: A csv with columns: (index), original_id, x, y, z prefix: Output prefix for all generated files stage_name: The stage name protocol: Which plot to generate Returns ------- Dictionary with results of stage """ check_required(kwargs, ['protocol', 'prefix', 'stage_name']) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format( kwargs["protocol"], ", ".join(PROTOCOLS.keys()) ) ) result_kwargs = deepcopy(kwargs) # Support legacy projected_embeddings_file projected_reduced_embeddings_file = ( kwargs.get("projected_reduced_embeddings_file") or kwargs.get("projected_embeddings_file") ) if not projected_reduced_embeddings_file: raise InvalidParameterError( f"You need to provide either projected_reduced_embeddings_file or projected_embeddings_file or " f"reduced_embeddings_file for {kwargs['protocol']}" ) result_kwargs["projected_reduced_embeddings_file"] = projected_reduced_embeddings_file return PROTOCOLS[kwargs["protocol"]](result_kwargs)
def plot_mutagenesis(result_kwargs):
    """BETA: visualize in-silico mutagenesis as a heatmap with plotly

    mandatory:
    * residue_probabilities_file
    """
    required_kwargs = [
        "protocol",
        "prefix",
        "stage_name",
        "residue_probabilities_file",
    ]
    check_required(result_kwargs, required_kwargs)

    file_manager = get_file_manager()
    file_manager.create_stage(result_kwargs["prefix"], result_kwargs["stage_name"])

    probabilities_all = pandas.read_csv(result_kwargs["residue_probabilities_file"])
    assert (
        list(probabilities_all.columns) == PROBABILITIES_COLUMNS
    ), f"probabilities file is expected to have the following columns: {PROBABILITIES_COLUMNS}"

    number_of_proteins = len(set(probabilities_all["id"]))
    for sequence_id, probabilities in tqdm(
        probabilities_all.groupby("id"), total=number_of_proteins
    ):
        fig = plot(probabilities)
        plotly.offline.plot(
            fig,
            filename=file_manager.create_file(
                result_kwargs.get("prefix"),
                result_kwargs.get("stage_name"),
                sequence_id,
                extension=".html",
            ),
        )
    return result_kwargs
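# Sketch with hypothetical paths: plotting heatmaps from a mutagenesis-stage
# output. One HTML file per protein id is written into the stage directory:
#
#   plot_mutagenesis({
#       "protocol": "plot_mutagenesis",
#       "prefix": "example_run",
#       "stage_name": "visualize_mutagenesis",
#       "residue_probabilities_file": "example_run/mutagenesis/residue_probabilities_file.csv",
#   })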
def execute_pipeline_from_config(config: Dict,
                                 post_stage: Callable[[Dict], None] = _null_function,
                                 **kwargs) -> Dict:
    original_config = deepcopy(config)

    check_required(config, ["global"])
    # !! pop = remove from config!
    global_parameters = config.pop('global')
    check_required(global_parameters, ["prefix", "sequences_file"])

    file_manager = get_file_manager(**global_parameters)

    # Make sure prefix exists
    prefix = global_parameters['prefix']

    # If prefix already exists
    if file_manager.exists(prefix):
        if not kwargs.get('overwrite'):
            raise FileExistsError(
                "The prefix already exists & no overwrite option has been set.\n"
                "Either set --overwrite, or move data from the prefix.\n"
                "Prefix: {}".format(prefix))
    else:
        # create the prefix
        file_manager.create_prefix(prefix)

    # Copy original config to prefix
    global_in = file_manager.create_file(prefix, None, _IN_CONFIG_NAME, extension='.yml')
    write_config_file(global_in, original_config)

    # This downloads sequences_file if required
    download_files_for_stage(global_parameters, file_manager, prefix)
    global_parameters = _process_fasta_file(**global_parameters)

    for stage_name in config:
        stage_parameters = config[stage_name]
        original_stage_parameters = dict(**stage_parameters)
        check_required(stage_parameters, ["protocol", "type"])
        stage_type = stage_parameters['type']
        stage_runnable = _STAGES.get(stage_type)

        if not stage_runnable:
            raise Exception(
                "No type defined, or invalid stage type defined: {}".format(stage_type))

        # Prepare to run stage
        stage_parameters['stage_name'] = stage_name
        file_manager.create_stage(prefix, stage_name)
        stage_parameters = download_files_for_stage(stage_parameters, file_manager,
                                                    prefix, stage_name)

        stage_dependency = stage_parameters.get('depends_on')
        if stage_dependency:
            if stage_dependency not in config:
                raise Exception(
                    "Stage {} depends on {}, but dependency not found in config.".format(
                        stage_name, stage_dependency))
            stage_dependency_parameters = config.get(stage_dependency)
            stage_parameters = {
                **global_parameters,
                **stage_dependency_parameters,
                **stage_parameters,
            }
        else:
            stage_parameters = {**global_parameters, **stage_parameters}

        # Register start time
        start_time = datetime.now().astimezone()
        stage_parameters['start_time'] = str(start_time)

        stage_in = file_manager.create_file(prefix, stage_name, _IN_CONFIG_NAME,
                                            extension='.yml')
        write_config_file(stage_in, stage_parameters)

        try:
            stage_output_parameters = stage_runnable(**stage_parameters)
        except Exception as e:
            # Tell the user which stage failed and show a URL to report the error on GitHub
            try:
                version = importlib_metadata.version("bio_embeddings")
            except PackageNotFoundError:
                version = "unknown"
            # Make a GitHub-flavored markdown table; the header is in the template
            parameter_table = "\n".join(
                f"{key}|{value}" for key, value in original_stage_parameters.items())
            params = {
                # https://stackoverflow.com/a/35498685/3549270
                "title": f"Protocol {original_stage_parameters['protocol']}: {type(e).__name__}: {e}",
                "body": _ERROR_REPORTING_TEMPLATE.format(
                    version,
                    torch.cuda.is_available(),
                    parameter_table,
                    traceback.format_exc(10),
                ),
            }
            print(traceback.format_exc(), file=sys.stderr)
            print(
                f"Consider reporting this error at this url: {_ISSUE_URL}?{urllib.parse.urlencode(params)}\n\n"
                f"Stage {stage_name} failed.",
                file=sys.stderr,
            )
            sys.exit(1)

        # Register end time
        end_time = datetime.now().astimezone()
        stage_output_parameters['end_time'] = str(end_time)

        # Register elapsed time
        stage_output_parameters['elapsed_time'] = str(end_time - start_time)

        stage_out = file_manager.create_file(prefix, stage_name, _OUT_CONFIG_NAME,
                                             extension='.yml')
        write_config_file(stage_out, stage_output_parameters)

        # Store in global_out config for later retrieval (e.g. depends_on)
        config[stage_name] = stage_output_parameters

        # Execute post-stage function, if provided
        post_stage(stage_output_parameters)

    config['global'] = global_parameters
    try:
        config['global']['version'] = importlib_metadata.version("bio_embeddings")
    except PackageNotFoundError:
        pass  # :(

    global_out = file_manager.create_file(prefix, None, _OUT_CONFIG_NAME, extension='.yml')
    write_config_file(global_out, config)

    return config
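# A minimal sketch of a config as consumed by execute_pipeline_from_config.
# Stage names and file paths are hypothetical; each non-global key is a stage,
# and `depends_on` drives the parameter merging shown above:
#
#   config = {
#       "global": {"prefix": "example_run", "sequences_file": "sequences.fasta"},
#       "embeddings": {"type": "embed", "protocol": "seqvec", "reduce": True},
#       "projection": {"type": "project", "protocol": "umap",
#                      "depends_on": "embeddings"},
#   }
#   execute_pipeline_from_config(config, overwrite=True)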
def prepare_kwargs(**kwargs):
    required_kwargs = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required_kwargs)

    if kwargs["protocol"] not in name_to_embedder:
        if kwargs["protocol"] in ALL_PROTOCOLS:
            raise InvalidParameterError(
                f"The extra for the protocol {kwargs['protocol']} is missing. "
                "See https://docs.bioembeddings.com/#installation on how to install all extras"
            )
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(name_to_embedder.keys())
            )
        )

    embedder_class = name_to_embedder[kwargs["protocol"]]

    if kwargs["protocol"] == "unirep" and kwargs.get("use_cpu") is not None:
        raise InvalidParameterError("UniRep does not support configuring `use_cpu`")

    # See parameter_blueprints.yml
    global_options = {"sequences_file", "simple_remapping", "start_time"}
    embed_options = {
        "decoder",
        "device",
        "discard_per_amino_acid_embeddings",
        "half_precision_model",
        "half_precision",
        "max_amino_acids",
        "reduce",
        "type",
    }
    known_parameters = (
        set(required_kwargs)
        | global_options
        | embed_options
        | set(embedder_class.necessary_files)
        | set(embedder_class.necessary_directories)
    )
    if kwargs["protocol"] == "seqvec":
        # We support two ways of configuration for seqvec
        known_parameters.add("model_directory")

    if not set(kwargs) <= known_parameters:
        # Warn about options we don't recognize
        for option in set(kwargs) - known_parameters:
            logger.warning(
                f"You set an unknown option for {embedder_class.name}: {option} "
                f"(value: {kwargs[option]})"
            )

    if kwargs.get("half_precision_model"):
        if kwargs["protocol"] not in ["prottrans_t5_bfd", "prottrans_t5_uniref50"]:
            raise InvalidParameterError(
                "`half_precision_model` is only supported with prottrans_t5_bfd "
                "and prottrans_t5_uniref50"
            )
        if kwargs.get("half_precision") is False:  # None remains allowed
            raise InvalidParameterError(
                "You can't have `half_precision_model` be true and `half_precision` be false. "
                "We suggest also setting `half_precision` to true, "
                "which will compute and save embeddings as half-precision floats"
            )

    result_kwargs = deepcopy(kwargs)
    result_kwargs.setdefault("max_amino_acids", DEFAULT_MAX_AMINO_ACIDS[kwargs["protocol"]])

    return embedder_class, result_kwargs
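# Sketch with hypothetical values: prepare_kwargs resolves the protocol name to
# an embedder class and fills in defaults; unknown options only trigger a warning:
#
#   embedder_class, resolved = prepare_kwargs(
#       protocol="prottrans_t5_bfd",
#       prefix="example_run",
#       stage_name="embeddings",
#       remapped_sequences_file="example_run/remapped_sequences_file.fasta",
#       mapping_file="example_run/mapping_file.csv",
#       half_precision=True,
#       half_precision_model=True,
#   )
#   # `resolved` now also carries max_amino_acids from DEFAULT_MAX_AMINO_ACIDS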
def execute_pipeline_from_config(config: Dict,
                                 post_stage: Callable[[Dict], None] = _null_function,
                                 **kwargs) -> Dict:
    original_config = deepcopy(config)

    check_required(config, ["global"])
    # !! pop = remove from config!
    global_parameters = config.pop('global')
    check_required(global_parameters, ["prefix", "sequences_file"])

    file_manager = get_file_manager(**global_parameters)

    # Make sure prefix exists
    prefix = global_parameters['prefix']

    # If prefix already exists
    if file_manager.exists(prefix):
        if not kwargs.get('overwrite'):
            raise FileExistsError(
                "The prefix already exists & no overwrite option has been set.\n"
                "Either set --overwrite, or move data from the prefix.\n"
                "Prefix: {}".format(prefix))
    else:
        # create the prefix
        file_manager.create_prefix(prefix)

    try:
        Path(prefix).joinpath("bio_embeddings_version.txt").write_text(
            importlib_metadata.version("bio_embeddings"))
    except PackageNotFoundError:
        pass  # :(

    # Copy original config to prefix
    global_in = file_manager.create_file(prefix, None, _IN_CONFIG_NAME, extension='.yml')
    write_config_file(global_in, original_config)

    global_parameters = _process_fasta_file(**global_parameters)

    for stage_name in config:
        stage_parameters = config[stage_name]
        check_required(stage_parameters, ["protocol", "type"])
        stage_type = stage_parameters['type']
        stage_runnable = _STAGES.get(stage_type)

        if not stage_runnable:
            raise Exception(
                "No type defined, or invalid stage type defined: {}".format(stage_type))

        # Prepare to run stage
        stage_parameters['stage_name'] = stage_name
        file_manager.create_stage(prefix, stage_name)

        stage_dependency = stage_parameters.get('depends_on')
        if stage_dependency:
            if stage_dependency not in config:
                raise Exception(
                    "Stage {} depends on {}, but dependency not found in config.".format(
                        stage_name, stage_dependency))
            stage_dependency_parameters = config.get(stage_dependency)
            stage_parameters = {
                **global_parameters,
                **stage_dependency_parameters,
                **stage_parameters,
            }
        else:
            stage_parameters = {**global_parameters, **stage_parameters}

        # Register start time
        start_time = datetime.now().astimezone()
        stage_parameters['start_time'] = str(start_time)

        stage_in = file_manager.create_file(prefix, stage_name, _IN_CONFIG_NAME,
                                            extension='.yml')
        write_config_file(stage_in, stage_parameters)

        stage_output_parameters = stage_runnable(**stage_parameters)

        # Register end time
        end_time = datetime.now().astimezone()
        stage_output_parameters['end_time'] = str(end_time)

        # Register elapsed time
        stage_output_parameters['elapsed_time'] = str(end_time - start_time)

        stage_out = file_manager.create_file(prefix, stage_name, _OUT_CONFIG_NAME,
                                             extension='.yml')
        write_config_file(stage_out, stage_output_parameters)

        # Store in global_out config for later retrieval (e.g. depends_on)
        config[stage_name] = stage_output_parameters

        # Execute post-stage function, if provided
        post_stage(stage_output_parameters)

    config['global'] = global_parameters

    global_out = file_manager.create_file(prefix, None, _OUT_CONFIG_NAME, extension='.yml')
    write_config_file(global_out, config)

    return config
def run(**kwargs): """BETA: in-silico mutagenesis using BertForMaskedLM optional (see extract stage for details): * model_directory * device * half_precision * half_precision_model * temperature: temperature for softmax """ required_kwargs = [ "protocol", "prefix", "stage_name", "remapped_sequences_file", "mapping_file", ] check_required(kwargs, required_kwargs) result_kwargs = deepcopy(kwargs) if result_kwargs["protocol"] not in _PROTOCOLS: raise RuntimeError( f"Passed protocol {result_kwargs['protocol']}, but allowed are: {', '.join(_PROTOCOLS)}" ) temperature = result_kwargs.setdefault("temperature", 1) device = get_device(result_kwargs.get("device")) model_class: Type[ProtTransBertBFDMutagenesis] = _PROTOCOLS[ result_kwargs["protocol"] ] model = model_class( device, result_kwargs.get("model_directory"), result_kwargs.get("half_precision_model"), ) file_manager = get_file_manager() file_manager.create_stage(result_kwargs["prefix"], result_kwargs["stage_name"]) # The mapping file contains the corresponding ids in the same order sequences = [ str(entry.seq) for entry in SeqIO.parse(result_kwargs["remapped_sequences_file"], "fasta") ] mapping_file = read_mapping_file(result_kwargs["mapping_file"]) probabilities_all = dict() with tqdm(total=int(mapping_file["sequence_length"].sum())) as progress_bar: for sequence_id, original_id, sequence in zip( mapping_file.index, mapping_file["original_id"], sequences ): with torch.no_grad(): probabilities = model.get_sequence_probabilities( sequence, temperature, progress_bar=progress_bar ) for p in probabilities: assert math.isclose( 1, (sum(p.values()) - p["position"]), rel_tol=1e-6 ), "softmax values should add up to 1" probabilities_all[sequence_id] = probabilities residue_probabilities = probabilities_as_dataframe( mapping_file, probabilities_all, sequences ) probabilities_file = file_manager.create_file( result_kwargs.get("prefix"), result_kwargs.get("stage_name"), "residue_probabilities_file", extension=".csv", ) residue_probabilities.to_csv(probabilities_file, index=False) result_kwargs["residue_probabilities_file"] = probabilities_file return result_kwargs