def test_model_sizes_for_all_embedder(pytestconfig):
    """Check that every registered embedder appears in the model size table

    The table is read from the module docstring text of
    bio_embeddings/embed/__init__.py. If this test fails, run the following
    commands and enter their results in that file:

    ```
    python -m bio_embeddings.utilities.model_size_main cpu
    python -m bio_embeddings.utilities.model_size_main gpu
    ```
    """
    root = pytestconfig.rootpath

    models = read_config_file(root / "bio_embeddings/utilities/defaults.yml")
    # NOTE(review): the resulting set is discarded; kept so the defaults file
    # is still read and its keys are still accessed exactly as before.
    set(models.keys())

    doc_text: str = (root / "bio_embeddings/embed/__init__.py").read_text()

    # Crude rst parsing: take the chunk between the second and third run of
    # 46 "=" characters, drop its first line and use everything before the
    # first space of each remaining line as the documented name.
    table_body = doc_text.split("=" * 46)[2]
    documented_embedder = {
        row.split(" ")[0] for row in table_body.splitlines()[1:]
    }

    # Every embedder must be documented
    undocumented = name_to_embedder.keys() - set(documented_embedder)
    assert undocumented == set()

    # The only documented names that are not embedders are these models
    assert set(documented_embedder) - name_to_embedder.keys() == {
        "bert_from_publication",
        "deepblast",
        "pb_tucker",
        "seqvec_from_publication",
    }
def main():
    """Command line entry point around `bisect_embedder_memory`

    Positional argument `embedder` is either the lowercase name of a single
    embedder or the literal `all`, in which case `bisect_embedder_memory` is
    called once for every key of `name_to_embedder`.

    `--model-directory` is optional. For a single embedder it is forwarded
    unchanged (possibly `None`). For `all`, each embedder gets the
    subdirectory `<model-directory>/<embedder name>`; if no model directory
    was given, `None` is forwarded for every embedder, mirroring the single
    embedder case instead of failing with a `TypeError` from `Path(None)`.
    """
    parser = ArgumentParser()
    parser.add_argument("embedder", help="Lowercase name of the embedder")
    parser.add_argument("--model-directory")
    args = parser.parse_args()

    if args.embedder != "all":
        bisect_embedder_memory(args.embedder, args.model_directory)
        return

    for embedder_name in name_to_embedder:
        if args.model_directory is None:
            # No base directory given: behave like the single embedder branch
            model_directory = None
        else:
            model_directory = str(
                Path(args.model_directory).joinpath(embedder_name)
            )
        bisect_embedder_memory(embedder_name, model_directory)
def main():
    """Command line entry point around `bisect_embedder_memory`

    Positional argument `embedder` is either the lowercase name of a single
    embedder or the literal `all`, in which case `bisect_embedder_memory` is
    called once for every key of `name_to_embedder`.

    `--model-directory` is optional. For a single embedder it is forwarded
    unchanged (possibly `None`). For `all`, each embedder gets the
    subdirectory `<model-directory>/<embedder name>`; if no model directory
    was given, `None` is forwarded for every embedder, mirroring the single
    embedder case instead of failing with a `TypeError` from `Path(None)`.

    `--half-precision-model` is a flag (default off) that is passed on as the
    `half_precision_model` keyword argument.
    """
    parser = ArgumentParser()
    parser.add_argument("embedder", help="Lowercase name of the embedder")
    parser.add_argument("--model-directory")
    parser.add_argument("--half-precision-model", action="store_true", default=False)
    args = parser.parse_args()
    print(args)

    if args.embedder != "all":
        bisect_embedder_memory(
            args.embedder,
            args.model_directory,
            half_precision_model=args.half_precision_model,
        )
        return

    for embedder_name in name_to_embedder:
        if args.model_directory is None:
            # No base directory given: behave like the single embedder branch
            model_directory = None
        else:
            model_directory = str(
                Path(args.model_directory).joinpath(embedder_name)
            )
        bisect_embedder_memory(
            embedder_name,
            model_directory,
            half_precision_model=args.half_precision_model,
        )
def run(**kwargs):
    """
    Execute the embed stage for the configured protocol

    Parameters
    ----------
    kwargs
        Stage configuration. The following keys are checked as required:
            protocol: Which embedder to use, must be a key of `name_to_embedder`
            prefix: Output prefix for all generated files
            stage_name: The stage name
            remapped_sequences_file: Where the (remapped) sequences live
            mapping_file: The mapping file generated by the pipeline when remapping indexes
        Any file or directory the embedder class lists as necessary may be
        passed as well; missing (or falsy) ones are fetched via
        `get_model_file` / `get_model_directories_from_zip`.
        `max_amino_acids` defaults to the protocol's entry in `DEFAULT_MAX_AMINO_ACIDS`,
        `half_precision` defaults to False.

    Raises
    ------
    InvalidParameterError
        If the protocol is unknown, if its extra is not installed, or if
        `use_cpu` is configured for unirep.

    Returns
    -------
    The return value of `embed_and_write_batched`
    (documented as a dictionary with the results of the stage)
    """
    required = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required)

    protocol = kwargs["protocol"]
    if protocol not in name_to_embedder:
        # A known protocol that isn't registered means its extra isn't installed
        if protocol in ALL_PROTOCOLS:
            raise InvalidParameterError(
                f"The extra for the protocol {kwargs['protocol']} is missing. "
                "See https://docs.bioembeddings.com/#installation on how to install all extras"
            )
        valid_protocols = ", ".join(name_to_embedder.keys())
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                protocol, valid_protocols
            )
        )

    embedder_class = name_to_embedder[protocol]

    use_cpu_configured = kwargs.get("use_cpu") is not None
    if protocol == "unirep" and use_cpu_configured:
        raise InvalidParameterError("UniRep does not support configuring `use_cpu`")

    # Work on a copy so the caller's configuration stays untouched
    result_kwargs = deepcopy(kwargs)

    # Fetch every necessary file that wasn't provided
    # noinspection PyProtectedMember
    for file_key in embedder_class._necessary_files:
        if result_kwargs.get(file_key):
            continue
        result_kwargs[file_key] = get_model_file(
            model=embedder_class.name, file=file_key
        )

    # Same for the necessary directories
    # noinspection PyProtectedMember
    for directory_key in embedder_class._necessary_directories:
        if result_kwargs.get(directory_key):
            continue
        result_kwargs[directory_key] = get_model_directories_from_zip(
            model=embedder_class.name, directory=directory_key
        )

    default_max_amino_acids = DEFAULT_MAX_AMINO_ACIDS[protocol]
    result_kwargs.setdefault("max_amino_acids", default_max_amino_acids)

    file_manager = get_file_manager(**kwargs)
    embedder: EmbedderInterface = embedder_class(**result_kwargs)
    _check_transform_embeddings_function(embedder, result_kwargs)

    half_precision = kwargs.get("half_precision", False)
    return embed_and_write_batched(
        embedder, file_manager, result_kwargs, half_precision
    )
def prepare_kwargs(**kwargs):
    """Validate the embed stage configuration and resolve the embedder class

    Parameters
    ----------
    kwargs
        Stage configuration. `protocol`, `prefix`, `stage_name`,
        `remapped_sequences_file` and `mapping_file` are checked as required.
        Options that are neither required, global, embed options nor listed
        by the embedder class as necessary files/directories only cause a
        warning; they do not abort.

    Raises
    ------
    InvalidParameterError
        If the protocol is unknown or its extra is not installed, if
        `use_cpu` is configured for unirep, if `half_precision_model` is set
        for a protocol other than prottrans_t5_bfd / prottrans_t5_uniref50,
        or if `half_precision_model` is set while `half_precision` is
        explicitly False.

    Returns
    -------
    Tuple of the embedder class and a deep copy of kwargs in which
    `max_amino_acids` defaults to the protocol's entry in `DEFAULT_MAX_AMINO_ACIDS`
    """
    required_kwargs = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required_kwargs)

    if kwargs["protocol"] not in name_to_embedder:
        # A known protocol that isn't registered means its extra isn't installed
        if kwargs["protocol"] in ALL_PROTOCOLS:
            raise InvalidParameterError(
                f"The extra for the protocol {kwargs['protocol']} is missing. "
                "See https://docs.bioembeddings.com/#installation on how to install all extras"
            )
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(name_to_embedder.keys())
            )
        )

    embedder_class = name_to_embedder[kwargs["protocol"]]

    if kwargs["protocol"] == "unirep" and kwargs.get("use_cpu") is not None:
        raise InvalidParameterError("UniRep does not support configuring `use_cpu`")

    # See parameter_blueprints.yml
    global_options = {"sequences_file", "simple_remapping", "start_time"}
    embed_options = {
        "decoder",
        "device",
        "discard_per_amino_acid_embeddings",
        "half_precision_model",
        "half_precision",
        "max_amino_acids",
        "reduce",
        "type",
    }
    known_parameters = (
        set(required_kwargs)
        | global_options
        | embed_options
        | set(embedder_class.necessary_files)
        | set(embedder_class.necessary_directories)
    )
    # Compare the protocol name: `embedder_class` is a class and never equals
    # the string "seqvec", so the previous check could not trigger.
    if kwargs["protocol"] == "seqvec":
        # We support two ways of configuration for seqvec
        known_parameters.add("model_directory")

    # Complain louder if the input looks fishier.
    # Sorted so the warnings come in a deterministic order.
    for option in sorted(set(kwargs) - known_parameters):
        logger.warning(
            f"You set an unknown option for {embedder_class.name}: {option} (value: {kwargs[option]})"
        )

    if kwargs.get("half_precision_model"):
        if kwargs["protocol"] not in ["prottrans_t5_bfd", "prottrans_t5_uniref50"]:
            raise InvalidParameterError(
                "`half_precision_model` is only supported with prottrans_t5_bfd and prottrans_t5_uniref50"
            )

        if kwargs.get("half_precision") is False:  # None remains allowed
            raise InvalidParameterError(
                "You can't have `half_precision_model` be true and `half_precision` be false. "
                "We suggest also setting `half_precision` to true, "
                "which will compute and save embeddings as half-precision floats"
            )

    # Work on a copy so the caller's configuration stays untouched
    result_kwargs = deepcopy(kwargs)
    result_kwargs.setdefault(
        "max_amino_acids", DEFAULT_MAX_AMINO_ACIDS[kwargs["protocol"]]
    )

    return embedder_class, result_kwargs