from typing import Dict, List, Type

import ray
import torch
from biotransformers.utils.compute_utils import Mutation, get_list_probs, mutation_score
from biotransformers.utils.constant import NATURAL_AAS_LIST
from biotransformers.utils.logger import logger  # noqa
from biotransformers.utils.tqdm_utils import ProgressBar
from biotransformers.utils.utils import init_model_sequences, load_fasta
from biotransformers.wrappers.language_model import LanguageModel
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

from ..lightning_utils.data import BatchWithConstantNumberTokensDataModule
from ..lightning_utils.models import LightningModule

log = logger("transformers_wrapper")

PathMsaFolder = str
TokenProbsDict = Dict[int, Dict[str, float]]
SequenceProbsList = List[TokenProbsDict]


class TransformersWrapper:
    """
    Abstract class that uses a pretrained transformers model to evaluate
    protein likelihoods as well as other insights.
    """

    def __init__(
        self,
        model_dir: str,
        language_model_cls: Type[LanguageModel],
        num_gpus: int = 0,
import math
import os
from dataclasses import dataclass
from typing import List, Tuple

from Bio import SeqIO
from biotransformers.utils.logger import logger

log = logger("utils")


def convert_bytes_size(size_bytes: int) -> Tuple[str, bool]:
    """Convert a size in bytes to a human-readable string.

    Args:
        size_bytes: size in bytes

    Returns:
        Tuple[str, bool]: the size with the correct unit, and a flag telling
        whether a warning message should be displayed.
    """
    if size_bytes == 0:
        return "0B", False
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = int(round(size_bytes / p, 2))
    is_warning = i >= 3  # warn only for model sizes in the GB range or above
    return "%s%s" % (s, size_name[i]), is_warning
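A minimal usage sketch of convert_bytes_size, not part of the library source; the values simply follow from the function above, and the flag only turns True once the size reaches the GB range:

# Hypothetical usage example for convert_bytes_size (illustration only).
size_str, is_warning = convert_bytes_size(2048)           # -> ("2KB", False)
size_str, is_warning = convert_bytes_size(3 * 1024 ** 3)  # -> ("3GB", True): large-model warning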
from typing import Tuple

import torch
from biotransformers.utils.logger import logger

log = logger("gpus_utils")


def set_device(device: str, multi_gpu: bool) -> Tuple[str, bool]:
    """Set the correct device (CPU/GPU).

    Args:
        device (str): could be "cpu", "cuda:0", or "cuda"
        multi_gpu (bool): whether to use several GPUs on the same node

    Returns:
        Tuple[str, bool]:
            * device: str
            * multi_gpu: bool
    """
    n_gpus = torch.cuda.device_count()
    if multi_gpu:
        if not torch.cuda.is_available():
            log.warning("No GPU available, using CPU device")
            return "cpu", False
        if not n_gpus > 1:
            log.warning("Trying to use multi-gpu with only one device, using cuda:0")
            return "cuda:0", False
    else:
from biotransformers.bio_transformers import BioTransformers  # noqa
from biotransformers.utils.logger import logger  # noqa

from .version import VERSION  # noqa

log = logger("biotransformers")
__version__ = VERSION
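A usage sketch of the public entry point exposed by this __init__.py; the backend name and the compute_embeddings call are assumptions meant to be representative of the documented API, not verified against this source:

from biotransformers import BioTransformers

# Hypothetical backend identifier and method; replace with one of the backends
# actually shipped with the installed version if this name is not supported.
bio_trans = BioTransformers(backend="protbert", num_gpus=0)
embeddings = bio_trans.compute_embeddings(["MKTVRQERLKSIVRILERSKEPVSGAQ"])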
- ProtBert BFD: https://huggingface.co/Rostlab/prot_bert_bfd
"""
import copy
from typing import Dict, List, Tuple

import torch
from biotransformers.lightning_utils.data import AlphabetDataLoader
from biotransformers.utils.constant import DEFAULT_ROSTLAB_MODEL, ROSTLAB_LIST
from biotransformers.utils.logger import logger  # noqa
from biotransformers.utils.utils import _generate_chunks, _get_num_batch_iter
from biotransformers.wrappers.language_model import LanguageModel
from ray.actor import ActorHandle
from tqdm import tqdm
from transformers import BertForMaskedLM, BertTokenizer

log = logger("rostlab_wrapper")


class RostlabWrapper(LanguageModel):
    """
    Class that uses a Rostlab type of pretrained transformers model to
    evaluate protein likelihoods as well as other insights.
    """

    def __init__(self, model_dir: str, device):
        if model_dir not in ROSTLAB_LIST:
            print(
                f"Model dir '{model_dir}' not recognized."
                f" Using '{DEFAULT_ROSTLAB_MODEL}' as default"
            )
            model_dir = DEFAULT_ROSTLAB_MODEL
        super().__init__(model_dir=model_dir, device=device)
        self.tokenizer = BertTokenizer.from_pretrained(
            model_dir,
            do_lower_case=False,
specific to the ESM model developed by FAIR (https://github.com/facebookresearch/esm).
"""
from typing import Dict, List, Tuple

import esm
import torch
from biotransformers.lightning_utils.data import AlphabetDataLoader
from biotransformers.utils.constant import DEFAULT_ESM_MODEL, ESM_LIST
from biotransformers.utils.logger import logger  # noqa
from biotransformers.utils.utils import _generate_chunks, _get_num_batch_iter
from biotransformers.wrappers.language_model import LanguageModel
from ray.actor import ActorHandle
from tqdm import tqdm

log = logger("esm_wrapper")

path_msa_folder = str


class ESMWrapper(LanguageModel):
    """
    Class that uses an ESM type of pretrained transformers model to evaluate
    protein likelihoods as well as other insights.
    """

    def __init__(self, model_dir: str, device: str):
        if model_dir not in ESM_LIST:
            print(
                f"Model dir '{model_dir}' not recognized. Using '{DEFAULT_ESM_MODEL}' as default"
            )
            model_dir = DEFAULT_ESM_MODEL
        super().__init__(model_dir=model_dir, device=device)
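A minimal instantiation sketch for ESMWrapper, following the constructor shown above; the checkpoint name below is an assumption, and, as the guard shows, any name outside ESM_LIST falls back to DEFAULT_ESM_MODEL with a printed notice:

# Hypothetical ESM checkpoint identifier; unrecognized names fall back to the default model.
wrapper = ESMWrapper("esm1_t34_670M_UR50S", device="cpu")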